Ejemplos de SubwordTextEncoder en Python, ejemplos de text_encoder.SubwordTextEncoder en Python

Ejemplo n.º 1

0

Mostrar archivo

Archivo: subword_builder.py Proyecto: maggieezzat/bert-vocab-builder

def main(unused_argv):
    """Build a subword vocabulary from the command-line flags and store it.

    Token counts are taken either from a raw corpus
    (``--corpus_filepattern``) or from an existing vocabulary
    (``--vocab_filepattern``); exactly one of the two must be given.

    Args:
      unused_argv: ignored.

    Raises:
      ValueError: if both or neither of the two file patterns are set.
    """
    corpus_pattern = FLAGS.corpus_filepattern
    vocab_pattern = FLAGS.vocab_filepattern

    # Guard clauses: reject the two invalid flag combinations up front.
    if corpus_pattern and vocab_pattern:
        raise ValueError(
            'Must only provide one of --corpus_filepattern or --vocab_filepattern'
        )
    if not (corpus_pattern or vocab_pattern):
        raise ValueError(
            'Must provide one of --corpus_filepattern or --vocab_filepattern')

    if corpus_pattern:
        counts = tokenizer.corpus_token_counts(
            corpus_pattern,
            FLAGS.corpus_max_lines,
            FLAGS.do_lower,
            split_on_newlines=FLAGS.split_on_newlines)
    else:
        counts = tokenizer.vocab_token_counts(vocab_pattern,
                                              FLAGS.do_lower,
                                              FLAGS.corpus_max_lines)

    subword_encoder = text_encoder.SubwordTextEncoder()
    subword_encoder.build_from_token_counts(counts, FLAGS.min_count,
                                            FLAGS.num_iterations)
    subword_encoder.store_to_file(FLAGS.output_filename,
                                  add_single_quotes=False)

Ejemplo n.º 2

0

Mostrar archivo

Archivo: sse_index.py Proyecto: yuntaow/Sequence-Semantic-Embedding

def index(model_dir, rawfile, encodeIndexFile, batchsize=10000):
    """Restore the SSE model stored in ``model_dir`` and build an index file.

    Args:
      model_dir: folder that must contain ``vocabulary.txt``, the model
        configs read by ``data_utils.load_model_configs`` and a TensorFlow
        checkpoint.
      rawfile: forwarded unchanged to ``createIndexFile``.
      encodeIndexFile: forwarded unchanged to ``createIndexFile``.
      batchsize: forwarded unchanged to ``createIndexFile``.

    Raises:
      SystemExit: with status -1 when the folder, the vocabulary file or the
        checkpoint is missing (an error line is printed first).
    """
    # ``sys.exit`` rather than the bare ``exit`` helper: the latter is
    # injected by the ``site`` module for interactive sessions and is not
    # guaranteed to exist (e.g. under ``python -S``).
    import sys

    if not os.path.exists(model_dir):
        print('Error! Model folder does not exist!! : %s' % model_dir)
        sys.exit(-1)

    vocab_path = os.path.join(model_dir, 'vocabulary.txt')
    if not os.path.exists(vocab_path):
        print(
            'Error!! Could not find vocabulary file for encoder in folder :%s'
            % model_dir)
        sys.exit(-1)

    encoder = text_encoder.SubwordTextEncoder(filename=vocab_path)
    print("Loaded  vocab size is: %d" % encoder.vocab_size)

    cfg = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    with tf.Session(config=cfg) as sess:
        # Build the model graph, then restore weights from the checkpoint.
        modelConfigs = data_utils.load_model_configs(model_dir)
        model = sse_model.SSEModel(modelConfigs)
        ckpt = tf.train.get_checkpoint_state(model_dir)
        if not ckpt:
            print(
                'Error!!!Could not load any model from specified folder: %s' %
                model_dir)
            sys.exit(-1)
        print("Reading model parameters from %s" %
              ckpt.model_checkpoint_path)
        model.saver.restore(sess, ckpt.model_checkpoint_path)

        # Start indexing with the restored model.
        createIndexFile(model, encoder, rawfile,
                        int(modelConfigs['max_seq_length']), encodeIndexFile,
                        sess, batchsize)

Ejemplo n.º 3

0

Mostrar archivo

def index(model_dir, rawfile, encodeIndexFile, batchsize=10000):
  """Restore the SSE model stored in ``model_dir`` and build an index file.

  Args:
    model_dir: folder that must contain ``vocabulary.txt``, the model configs
      read by ``data_utils.load_model_configs`` and a TensorFlow checkpoint.
    rawfile: forwarded unchanged to ``createIndexFile``.
    encodeIndexFile: forwarded unchanged to ``createIndexFile``.
    batchsize: forwarded unchanged to ``createIndexFile``.

  Raises:
    SystemExit: with status -1 when the folder, the vocabulary file or the
      checkpoint is missing (an error line is printed first).
  """
  # ``sys.exit`` rather than the bare ``exit`` helper: the latter is injected
  # by the ``site`` module for interactive sessions and is not guaranteed to
  # exist (e.g. under ``python -S``).
  import sys

  if not os.path.exists(model_dir):
    print('Error! Model folder does not exist!! : %s' % model_dir)
    sys.exit(-1)

  vocab_path = os.path.join(model_dir, 'vocabulary.txt')
  if not os.path.exists(vocab_path):
    print('Error!! Could not find vocabulary file for encoder in folder :%s' % model_dir)
    sys.exit(-1)

  encoder = text_encoder.SubwordTextEncoder(filename=vocab_path)
  print("Loaded  vocab size is: %d" % encoder.vocab_size)

  cfg = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
  with tf.Session(config=cfg) as sess:
    # Build the model in forward-only mode from the stored configuration.
    modelConfigs = data_utils.load_model_configs(model_dir)
    model = sse_model.SSEModel(
        int(modelConfigs['max_seq_length']),
        float(modelConfigs['max_gradient_norm']),
        int(modelConfigs['vocabsize']),
        int(modelConfigs['embedding_size']),
        int(modelConfigs['encoding_size']),
        int(modelConfigs['src_cell_size']),
        int(modelConfigs['tgt_cell_size']),
        int(modelConfigs['num_layers']),
        float(modelConfigs['learning_rate']),
        float(modelConfigs['learning_rate_decay_factor']),
        int(modelConfigs['targetSpaceSize']),
        network_mode=modelConfigs['network_mode'],
        forward_only=True,
        TOP_N=int(modelConfigs['TOP_N']))
    ckpt = tf.train.get_checkpoint_state(model_dir)
    if not ckpt:
      print('Error!!!Could not load any model from specified folder: %s' % model_dir)
      sys.exit(-1)
    print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
    model.saver.restore(sess, ckpt.model_checkpoint_path)

    # Start indexing with the restored model.
    createIndexFile(model, encoder, rawfile, int(modelConfigs['max_seq_length']), encodeIndexFile, sess, batchsize)

Ejemplo n.º 4

0

Mostrar archivo

Archivo: subword_builder.py Proyecto: umakot1974/bert-vocab-builder

def main(unused_argv):
    """Build a subword vocabulary from the command-line flags and store it.

    Sets the TensorFlow logging verbosity from ``--log_level``, obtains token
    counts from either a corpus or an existing vocabulary, builds a
    ``SubwordTextEncoder`` from them and writes it to ``--output_filename``.

    Args:
      unused_argv: ignored.

    Raises:
      ValueError: if ``--log_level`` is not one of DEBUG/INFO/ERROR, or if
        both or neither of ``--corpus_filepattern`` and
        ``--vocab_filepattern`` are set.
    """
    verbosity = FLAGS.log_level
    if verbosity not in ('DEBUG', 'INFO', 'ERROR'):
        raise ValueError('Set verbosity among "DEBUG", "INFO", "ERROR"')
    tf.logging.set_verbosity(verbosity)

    corpus_pattern = FLAGS.corpus_filepattern
    vocab_pattern = FLAGS.vocab_filepattern

    # Guard clauses: reject the two invalid flag combinations up front.
    if corpus_pattern and vocab_pattern:
        raise ValueError(
            'Must only provide one of --corpus_filepattern or --vocab_filepattern'
        )
    if not (corpus_pattern or vocab_pattern):
        raise ValueError(
            'Must provide one of --corpus_filepattern or --vocab_filepattern')

    if corpus_pattern:
        counts = tokenizer.corpus_token_counts(
            corpus_pattern,
            FLAGS.corpus_max_lines,
            split_on_newlines=FLAGS.split_on_newlines,
            additional_chars=FLAGS.additional_chars)
    else:
        counts = tokenizer.vocab_token_counts(vocab_pattern,
                                              FLAGS.corpus_max_lines)

    subword_encoder = text_encoder.SubwordTextEncoder()
    subword_encoder.build_from_token_counts(
        counts,
        FLAGS.min_count,
        FLAGS.num_iterations,
        max_subtoken_length=FLAGS.max_subtoken_length,
        backward=FLAGS.backward)
    subword_encoder.store_to_file(FLAGS.output_filename,
                                  add_single_quotes=False)

Ejemplo n.º 5

0

Mostrar archivo

Archivo: data_utils.py Proyecto: tajain07/Sequence-Semantic-Embedding

def load_encodedTargetSpace(processed_data_dir):
    """Load the subword encoder and the pre-encoded target space.

    Args:
      processed_data_dir: directory holding ``vocabulary.txt`` and
        ``encoded.FullTargetSpace``.

    Returns:
      A tuple ``(encoder, encodedTgtSpace, tgtID_Name_Map)`` where
      ``encoder`` is the ``SubwordTextEncoder`` loaded from the vocabulary
      file, ``encodedTgtSpace`` maps each target id to its list of integer
      token ids, and ``tgtID_Name_Map`` maps each target id to its name.

    Raises:
      ValueError: if either file is missing, or if a line of
        ``encoded.FullTargetSpace`` does not consist of exactly three
        tab-separated fields.
    """
    vocabFile = processed_data_dir + '/vocabulary.txt'
    if not gfile.Exists(vocabFile):
        raise ValueError(
            "Error!! Could not found vaculary file in model folder.")
    encoder = text_encoder.SubwordTextEncoder(filename=vocabFile)
    print("Loaded  vocab size is: %d" % encoder.vocab_size)

    tgtEncodeFile = os.path.join(processed_data_dir, "encoded.FullTargetSpace")
    if not gfile.Exists(tgtEncodeFile):
        raise ValueError(
            "Error! could not found encoded.FullTargetSpace in model folder.")

    print("Loading full target space index ...")
    encodedTgtSpace = {}
    tgtID_Name_Map = {}
    # The context manager closes the file even if a malformed line raises.
    with codecs.open(tgtEncodeFile, 'r', 'utf-8') as index_file:
        for line in index_file:
            # Each line: <target id> TAB <target name> TAB <comma-separated ints>
            tgtId, tgtName, tgtEncoding = line.strip().split('\t')
            tgtID_Name_Map[tgtId] = tgtName
            encodedTgtSpace[tgtId] = [int(i) for i in tgtEncoding.split(',')]
    return encoder, encodedTgtSpace, tgtID_Name_Map

Ejemplo n.º 6

0

Mostrar archivo

Archivo: data.py Proyecto: yandwang/Sequence-Semantic-Embedding

 def __init__(self, work_dir, rawdata_dir, rawvocabsize, max_seq_length):
     """Load previously processed data from ``work_dir`` or generate it.

     If ``work_dir + '/compressed'`` exists, every entry of that JSON file is
     restored as an instance attribute and the encoder is loaded from (or
     built and stored to) ``work_dir + '/vocabulary.txt'``. Otherwise the raw
     data is processed through ``data_utils.prepare_raw_data`` and the
     resulting attributes (except the encoder) are dumped to that JSON file.

     Args:
       work_dir: directory holding the processed data and the vocabulary.
       rawdata_dir: raw data location; only used when no JSON dump exists.
       rawvocabsize: target vocabulary size when a vocabulary must be built.
       max_seq_length: maximum sequence length; only used when no JSON dump
         exists (otherwise the value stored in the dump is used).
     """
     json_path = work_dir + '/compressed'
     if os.path.exists(json_path):
         # Restore every saved attribute from the JSON dump.
         print('loading saved json data from %s' % json_path)
         with open(json_path, 'r') as fin:
             saved_attrs = json.load(fin)
             for key, value in saved_attrs.items():
                 setattr(self, key, value)

         # The encoder is not part of the dump; rebuild it from the vocabulary.
         vocab_path = work_dir + '/vocabulary.txt'
         if not os.path.exists(vocab_path):
             print(
                 "No supplied vocabulary file found. Build new vocabulary based on training data ...."
             )
             counts = tokenizer.corpus_token_counts(
                 work_dir + '/*.Corpus', 2000000, split_on_newlines=True)
             subword_encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
                 rawvocabsize, counts, 2, 1000)
             subword_encoder.store_to_file(vocab_path)
             print("New vocabulary constructed.")
         else:
             print("Loading supplied vocabluary file: %s" % vocab_path)
             subword_encoder = text_encoder.SubwordTextEncoder(
                 filename=vocab_path)
             print("Total vocab size is: %d" % subword_encoder.vocab_size)

         self.encoder = subword_encoder
         self.max_seq_length = int(self.max_seq_length)
         self.vocab_size = subword_encoder.vocab_size
         print('-')
         print('Vocab size:', self.vocab_size, 'unique words')
         print('-')
         print('Max allowed sequence length:', self.max_seq_length)
         print('-')
     else:
         print('generating data from data path: %s' % rawdata_dir)
         prepared = data_utils.prepare_raw_data(
             rawdata_dir, work_dir, rawvocabsize, max_seq_length)
         subword_encoder, train_corpus, eval_corpus, full_target_space, id_name_map = prepared

         self.encoder = subword_encoder
         self.rawTrainPosCorpus = train_corpus
         self.rawEvalCorpus = eval_corpus
         self.max_seq_length = max_seq_length
         self.encodedFullTargetSpace = full_target_space
         self.tgtIdNameMap = id_name_map
         self.vocab_size = subword_encoder.vocab_size
         self.fullSetTargetIds = list(full_target_space.keys())
         self.rawnegSetLen = len(self.fullSetTargetIds)
         print('-')
         print('Vocab size:', self.vocab_size, 'unique words')
         print('-')
         print('Max allowed sequence length:', self.max_seq_length)
         print('-')

         # Dump every plain data attribute; the encoder is skipped and is
         # rebuilt from the vocabulary file on the next load.
         to_save = {
             key: value
             for key, value in self.__dict__.items()
             if not key.startswith("__") and key != 'encoder'
             and not callable(value) and not type(value) is staticmethod
         }
         with open(json_path, 'w') as fout:
             json.dump(to_save, fout)
         print('Processed data dumped')

Ejemplo n.º 7

0

Mostrar archivo

  def __init__(self, *args, **kwargs):
    """Initialise the app and load the encoder, target index and SSE model.

    The model folder is ``"models-" + MODEL_TYPE`` (environment variable,
    default ``classification``); the index file name comes from
    ``INDEX_FILE`` (default ``targetEncodingIndex.tsv``). Index lines that do
    not have exactly three tab-separated fields are reported and skipped.

    Args:
      *args: forwarded to the parent class constructor.
      **kwargs: forwarded to the parent class constructor.

    Raises:
      SystemExit: with status -1 when the model folder, index file,
        vocabulary file or checkpoint is missing.
    """
    # ``sys.exit`` rather than the bare ``exit`` helper: the latter is injected
    # by the ``site`` module for interactive sessions and is not guaranteed to
    # exist (e.g. under ``python -S``).
    import sys

    super(FlaskApp, self).__init__(*args, **kwargs)

    self.model = 'Do my initialization work here, loading model and index ....'
    self.model_type = os.environ.get("MODEL_TYPE", "classification")
    self.model_dir = "models-" + self.model_type
    self.indexFile = os.environ.get("INDEX_FILE", "targetEncodingIndex.tsv")
    print("In app class: Received flask appconfig is: " + os.environ.get('MODEL_TYPE', 'Default_classification') )

    if not os.path.exists(self.model_dir):
      print('Model folder %s does not exist!!' % self.model_dir )
      sys.exit(-1)

    index_path = os.path.join(self.model_dir, self.indexFile)
    if not os.path.exists(index_path):
      print('Index File does not exist!!')
      sys.exit(-1)

    # Load the subword encoder from the vocabulary shipped with the model.
    vocab_path = os.path.join(self.model_dir, 'vocabulary.txt')
    if not os.path.exists(vocab_path):
      print('Error!! Could not find vocabulary file for encoder in model folder.')
      sys.exit(-1)
    self.encoder = text_encoder.SubwordTextEncoder(filename=vocab_path)

    # Load the full target index: <id> TAB <name> TAB <comma-separated floats>
    self.targetEncodings = []
    self.targetIDs = []
    self.targetIDNameMap = {}
    # The context manager closes the index file once it has been read.
    with codecs.open(index_path, 'r', 'utf-8') as index_file:
      for line in index_file:
        info = line.strip().split('\t')
        if len(info) != 3:
          print('Error in targetIndexFile! %s' % line)
          continue
        tgtid, tgtseq, tgtEncoding = info
        self.targetIDs.append(tgtid)
        self.targetEncodings.append([float(f) for f in tgtEncoding.strip().split(',')])
        self.targetIDNameMap[tgtid] = tgtseq
    self.targetEncodings = np.array(self.targetEncodings)

    cfg = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    self.sess = tf.Session(config=cfg)
    # Build the model in forward-only mode from the stored configuration.
    self.modelConfigs = data_utils.load_model_configs(self.model_dir)
    self.model = sse_model.SSEModel(
        int(self.modelConfigs['max_seq_length']),
        float(self.modelConfigs['max_gradient_norm']),
        int(self.modelConfigs['vocabsize']),
        int(self.modelConfigs['embedding_size']),
        int(self.modelConfigs['encoding_size']),
        int(self.modelConfigs['src_cell_size']),
        int(self.modelConfigs['tgt_cell_size']),
        int(self.modelConfigs['num_layers']),
        float(self.modelConfigs['learning_rate']),
        float(self.modelConfigs['learning_rate_decay_factor']),
        int(self.modelConfigs['targetSpaceSize']),
        network_mode=self.modelConfigs['network_mode'],
        forward_only=True,
        TOP_N=int(self.modelConfigs['TOP_N']))
    ckpt = tf.train.get_checkpoint_state(self.model_dir)
    if not ckpt:
      print('Error!!!Could not load any model from specified folder: %s' % self.model_dir)
      sys.exit(-1)
    print("loading model from %s" % ckpt.model_checkpoint_path)
    self.model.saver.restore(self.sess, ckpt.model_checkpoint_path)

Ejemplo n.º 8

0

Mostrar archivo

Archivo: data_utils.py Proyecto: xrick/Sequence-Semantic-Embedding

def prepare_raw_data(raw_data_dir, processed_data_dir, vocabulary_size, neg_samples, max_seq_length):
  """
  Get SSE training and evaluation data, create the tokenizer and vocabulary.

  Target sequences from the ``targetIDs`` file are encoded, terminated with
  EOS and padded to ``max_seq_length``; the result is also written to
  ``encoded.FullTargetSpace``. Targets whose encoding is longer than
  ``max_seq_length - 1`` tokens are reported and skipped.

  :param raw_data_dir: location of the raw data set, passed to ``get_data_set``.
  :param processed_data_dir: directory the corpus is unpacked into and where
    the vocabulary and encoded target space are read/written.
  :param vocabulary_size: target vocabulary size, used only when no
    ``vocabulary.txt`` exists yet.
  :param neg_samples: not used by this function.
  :param max_seq_length: length every encoded target sequence is padded to.
  :return: tuple ``(encoder, trainCorpus, evalCorpus, encodedFullTargetSpace,
    tgtIdNameMap)``.
  """
  # unzip corpus to the specified processed directory.
  get_data_set(raw_data_dir, processed_data_dir)

  # generate vocab file if not available, otherwise, use supplied vocab file for encoder
  vocabFile = processed_data_dir + '/vocabulary.txt'
  if gfile.Exists(vocabFile):
    print("Loading supplied vocabluary file: %s" % vocabFile)
    encoder = text_encoder.SubwordTextEncoder(filename=vocabFile)
    print("Total vocab size is: %d" % encoder.vocab_size)
  else:
    print("No supplied vocabulary file found. Build new vocabulary based on training data ....")
    token_counts = tokenizer.corpus_token_counts(processed_data_dir + '/*.Corpus', 1000000, split_on_newlines=True)
    encoder = text_encoder.SubwordTextEncoder.build_to_target_size(vocabulary_size, token_counts, 2, 1000)
    encoder.store_to_file(vocabFile)
    print("New vocabulary constructed.")

  # create encoded TargetSpace Data
  encodedFullTargetSpace = {}
  tgtIdNameMap = {}
  encoded_path = os.path.join(processed_data_dir, "encoded.FullTargetSpace")
  target_ids_path = os.path.join(processed_data_dir, "targetIDs")
  # Context managers close both files even if a malformed line raises.
  with codecs.open(encoded_path, 'w', 'utf-8') as encodedFullTargetFile, \
      codecs.open(target_ids_path, 'r', 'utf-8') as targetIdFile:
    for line in targetIdFile:
      # Each line: <target sequence> TAB <target id>
      tgtSeq, tgtId = line.strip().split('\t')
      token_ids = encoder.encode(tgtSeq.lower())
      seqlen = len(token_ids)
      if seqlen > max_seq_length - 1:
        print(
          'Error Detected!!! \n Target:\n %s \n Its seq length is:%d,  which is longer than MAX_SEQ_LENTH of %d. Try to increase limit!!!!' % (
          tgtSeq, seqlen, max_seq_length))
        continue
      # Terminate with EOS, then pad on the right up to max_seq_length.
      token_ids = token_ids + [text_encoder.EOS_ID] + [text_encoder.PAD_ID] * (max_seq_length - seqlen - 1)
      encodedFullTargetSpace[tgtId] = token_ids
      tgtIdNameMap[tgtId] = tgtSeq
      encodedFullTargetFile.write(tgtId + '\t' + tgtSeq.strip() + '\t' + ','.join([str(i) for i in token_ids]) + '\n')

  # create positive Evaluation corpus: (source_tokens, verifiedTgtIds )
  evalCorpus = gen_postive_corpus(os.path.join(processed_data_dir, "EvalPairs"), encodedFullTargetSpace, encoder,
                                  max_seq_length)

  # create positive Training Corpus: (source_tokens, verifiedTgtIds )
  trainCorpus = gen_postive_corpus(os.path.join(processed_data_dir, "TrainPairs"), encodedFullTargetSpace,
                                   encoder, max_seq_length)
  return encoder, trainCorpus, evalCorpus, encodedFullTargetSpace, tgtIdNameMap

Ejemplo n.º 9

0

Mostrar archivo

def prepare_raw_data(raw_data_dir, processed_data_dir, vocabulary_size, task_type,  max_seq_length):
  """Get SSE training/evaluation data into data_dir and build the encoder.

  Args:
    raw_data_dir:  directory that contains the raw zipped dataset.
    processed_data_dir: directory in which the processed data sets will be stored.
    vocabulary_size: size of the vocabulary to create if no vocabulary file is
      found in processed_data_dir. Otherwise the supplied vocabulary file is used.
    task_type: selects the corpus loader; matched case-insensitively after
      stripping whitespace. Accepted values:
        "classification"           -> get_classification_corpus
        "ranking", "crosslingual"  -> get_search_corpus
        "qna"                      -> get_questionAnswer_corpus
    max_seq_length: max number of tokens of a single source/target sequence.

  Returns:
    A tuple of 5 elements:
      (1) the SubwordTextEncoder loaded from, or built and stored to,
          vocabulary.txt,
      (2) the training corpus returned by the task-specific loader,
      (3) the evaluation (dev) corpus returned by the task-specific loader,
      (4) the encoded target space returned by the task-specific loader,
      (5) the target id/name map returned by the task-specific loader.

  Raises:
    ValueError: if task_type is not one of the accepted values.
  """
  # extract corpus to the specified processed directory.
  get_data_set(raw_data_dir, processed_data_dir)

  # generate vocab file if not available, otherwise, use supplied vocab file for encoder
  vocabFile = processed_data_dir + '/vocabulary.txt'
  if gfile.Exists(vocabFile):
    print("Loading supplied vocabluary file: %s" % vocabFile )
    encoder = text_encoder.SubwordTextEncoder(filename=vocabFile)
    print("Total vocab size is: %d" % encoder.vocab_size )
  else:
    print("No supplied vocabulary file found. Build new vocabulary based on training data ....")
    token_counts = tokenizer.corpus_token_counts( processed_data_dir + '/*.Corpus', 1000000, split_on_newlines=True)
    encoder = text_encoder.SubwordTextEncoder.build_to_target_size( vocabulary_size, token_counts, 2, 1000 )
    encoder.store_to_file(vocabFile)
    print("New vocabulary constructed.")

  # create training corpus and evaluation corpus per task_type
  normalized_task = task_type.lower().strip()
  if normalized_task == "classification":
    corpus_loader = get_classification_corpus
  elif normalized_task in ("ranking", "crosslingual"):
    corpus_loader = get_search_corpus
  elif normalized_task == "qna":
    corpus_loader = get_questionAnswer_corpus
  else:
    # The message lists the values that are actually accepted above.
    raise ValueError("Unsupported task_type. Please use one of: classification, ranking, crosslingual, qna")

  train_corpus, dev_corpus, encodedTgtSpace, tgtIdNameMap = corpus_loader(processed_data_dir, encoder, max_seq_length)

  return encoder, train_corpus, dev_corpus, encodedTgtSpace, tgtIdNameMap

Ejemplo n.º 10

0

Mostrar archivo

Archivo: generator_utils.py Proyecto: haznai/font_ai

def get_or_generate_vocab_inner(
    data_dir,
    vocab_filename,
    vocab_size,
    generator,
    max_subtoken_length=None,
    reserved_tokens=None,
):
    """Return a subword vocabulary, loading it from disk or building it.

    When both `data_dir` and `vocab_filename` are given and the resulting
    file already exists, the vocabulary is loaded from it. Otherwise a new
    one is built from `generator` and, if a file path could be formed,
    written there.

    Args:
      data_dir: base directory for data and vocab files. If falsy, the
        vocabulary is neither looked up nor saved.
      vocab_filename: vocab file name relative to `data_dir`.
      vocab_size: target vocabulary size passed to `build_from_generator`.
      generator: passed to `build_from_generator` as the source of text.
      max_subtoken_length: optional integer forwarded to
        `build_from_generator`.
      reserved_tokens: optional list of reserved tokens forwarded to
        `build_from_generator`.
    Returns:
      A SubwordTextEncoder vocabulary object.
    """
    # A target path exists only when both of its components are provided.
    target_path = (os.path.join(data_dir, vocab_filename)
                   if data_dir and vocab_filename else None)

    if target_path and tf.gfile.Exists(target_path):
        tf.logging.info("Found vocab file: %s", target_path)
        return text_encoder.SubwordTextEncoder(target_path)

    tf.logging.info("Generating vocab file: %s", target_path)
    built_vocab = text_encoder.SubwordTextEncoder.build_from_generator(
        generator,
        vocab_size,
        max_subtoken_length=max_subtoken_length,
        reserved_tokens=reserved_tokens,
    )

    if target_path:
        tf.gfile.MakeDirs(data_dir)
        built_vocab.store_to_file(target_path)

    return built_vocab

Ejemplo n.º 11

0

Mostrar archivo

Archivo: webserver.py Proyecto: colinsongf/document_retriever_for_qa

    def __init__(self, *args, **kwargs):
        """Initialise the app: logging, encoder, target index and SSE model.

        The model folder is ``"models-" + MODEL_TYPE`` (environment variable,
        default ``classification``); the index file name comes from
        ``INDEX_FILE`` (default ``targetEncodingIndex.tsv``). Log records go
        to stdout and to a rotating file under ``./logs``. Index lines that
        do not have exactly three tab-separated fields are logged and skipped.

        Args:
          *args: forwarded to the parent class constructor.
          **kwargs: forwarded to the parent class constructor.

        Raises:
          SystemExit: with status -1 when the model folder, index file,
            vocabulary file or checkpoint is missing.
        """
        super(FlaskApp, self).__init__(*args, **kwargs)

        self.model = 'Do my initialization work here, loading model and index ....'
        self.model_type = os.environ.get("MODEL_TYPE", "classification")
        self.model_dir = "models-" + self.model_type
        self.indexFile = os.environ.get("INDEX_FILE",
                                        "targetEncodingIndex.tsv")

        # Root logger: DEBUG level, mirrored to stdout and a rotating file.
        os.makedirs("./logs", exist_ok=True)
        log = logging.getLogger('')
        log.setLevel(logging.DEBUG)
        formatter = logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
            datefmt='%m/%d/%Y %I:%M:%S %p')
        ch = logging.StreamHandler(sys.stdout)
        ch.setFormatter(formatter)
        log.addHandler(ch)
        # Rotate at 20 MiB, keeping 7 backups.
        fh = handlers.RotatingFileHandler('./logs/WebServerLog.txt',
                                          maxBytes=(1048576 * 20),
                                          backupCount=7)
        fh.setFormatter(formatter)
        log.addHandler(fh)

        logging.info("In app class: Received flask appconfig is: " +
                     os.environ.get('MODEL_TYPE', 'Default_classification'))

        # ``sys.exit`` rather than the bare ``exit`` helper, which the
        # ``site`` module provides for interactive sessions only.
        if not os.path.exists(self.model_dir):
            logging.error('Model folder %s does not exist!!' % self.model_dir)
            sys.exit(-1)

        index_path = os.path.join(self.model_dir, self.indexFile)
        if not os.path.exists(index_path):
            logging.error('Index File does not exist!!')
            sys.exit(-1)

        # Load the subword encoder from the vocabulary shipped with the model.
        vocab_path = os.path.join(self.model_dir, 'vocabulary.txt')
        if not os.path.exists(vocab_path):
            logging.error(
                'Error!! Could not find vocabulary file for encoder in model folder.'
            )
            sys.exit(-1)
        self.encoder = text_encoder.SubwordTextEncoder(filename=vocab_path)

        # Load the full target index:
        # <id> TAB <name> TAB <comma-separated floats>
        self.targetEncodings = []
        self.targetIDs = []
        self.targetIDNameMap = {}
        # The context manager closes the index file once it has been read.
        with codecs.open(index_path, 'r', 'utf-8') as index_file:
            for line in index_file:
                info = line.strip().split('\t')
                if len(info) != 3:
                    logging.info('Error in targetIndexFile! %s' % line)
                    continue
                tgtid, tgtseq, tgtEncoding = info
                self.targetIDs.append(tgtid)
                self.targetEncodings.append(
                    [float(f) for f in tgtEncoding.strip().split(',')])
                self.targetIDNameMap[tgtid] = tgtseq
        self.targetEncodings = np.array(self.targetEncodings)

        cfg = tf.ConfigProto(log_device_placement=False,
                             allow_soft_placement=True)
        self.sess = tf.Session(config=cfg)
        # Build the model, then restore its weights from the checkpoint.
        self.modelConfigs = data_utils.load_model_configs(self.model_dir)
        self.model = sse_model.SSEModel(self.modelConfigs)
        ckpt = tf.train.get_checkpoint_state(self.model_dir)
        if not ckpt:
            logging.error(
                'Error!!!Could not load any model from specified folder: %s' %
                self.model_dir)
            sys.exit(-1)
        logging.info("loading model from %s" % ckpt.model_checkpoint_path)
        self.model.saver.restore(self.sess, ckpt.model_checkpoint_path)

Ejemplo n.º 12

0

Mostrar archivo

def demo(nbest):
    """Run an interactive prediction loop on standard input.

    Loads the vocabulary, the target index and the model from
    ``FLAGS.model_dir``, then repeatedly reads a line from stdin, encodes it
    and prints the best-scoring targets until end of input or until the user
    types ``exit``.

    Args:
      nbest: maximum number of ranked results printed per query. Fewer are
        printed when the ranked results contain fewer entries.

    Raises:
      SystemExit: with status -1 when the model folder, vocabulary file,
        index file or checkpoint is missing.
    """
    # ``sys.exit`` rather than the bare ``exit`` helper, which the ``site``
    # module provides for interactive sessions only.
    if not os.path.exists(FLAGS.model_dir):
        print('Model folder does not exist!!')
        sys.exit(-1)

    vocab_path = os.path.join(FLAGS.model_dir, 'vocabulary.txt')
    if not os.path.exists(vocab_path):
        print(
            'Error!! Could not find vocabulary file for encoder in model folder.'
        )
        sys.exit(-1)
    encoder = text_encoder.SubwordTextEncoder(filename=vocab_path)

    index_path = os.path.join(FLAGS.model_dir, FLAGS.indexFile)
    if not os.path.exists(index_path):
        print('Index file does not exist!!!')
        sys.exit(-1)

    # Load the full target index: <id> TAB <name> TAB <comma-separated floats>
    targetEncodings = []
    targetIDs = []
    targetIDNameMap = {}
    # The context manager closes the index file once it has been read.
    with codecs.open(index_path, 'rt', 'utf-8') as index_file:
        for line in index_file:
            info = line.strip().split('\t')
            if len(info) != 3:
                print('Error in targetIndexFile! %s' % line)
                continue
            tgtid, tgtseq, tgtEncoding = info
            targetIDs.append(tgtid)
            targetEncodings.append(
                [float(f) for f in tgtEncoding.strip().split(',')])
            targetIDNameMap[tgtid] = tgtseq
    targetEncodings = np.array(targetEncodings)

    cfg = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    with tf.Session(config=cfg) as sess:
        # TODO: improve here later
        # Build the model, then restore its weights from the checkpoint.
        modelConfigs = data_utils.load_model_configs(FLAGS.model_dir)
        model = sse_model.SSEModel(modelConfigs)
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        if not ckpt:
            print(
                'Error!!!Could not load any model from specified folder: %s' %
                FLAGS.model_dir)
            sys.exit(-1)
        print("Reading model parameters from %s" %
              ckpt.model_checkpoint_path)
        model.saver.restore(sess, ckpt.model_checkpoint_path)

        # Constant for the whole session, so read it once.
        max_seq_length = int(modelConfigs['max_seq_length'])

        # Decode from standard input.
        sys.stdout.write(
            "\n\nPlease type some keywords to get related task results.\nType 'exit' to quit demo.\n > "
        )
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence and sentence.strip().lower() != 'exit':
            # Get token-ids for the input sentence.
            source_tokens = encoder.encode(tf.compat.as_str(sentence).lower())
            srclen = len(source_tokens)
            if srclen > max_seq_length - 2:
                print(
                    'Input sentence too long, max allowed is %d. Try to increase limit!!!!'
                    % (max_seq_length))
                # Truncate: one leading PAD, the kept tokens, then EOS.
                source_tokens = [
                    text_encoder.PAD_ID
                ] + source_tokens[:max_seq_length - 2] + [text_encoder.EOS_ID]
            else:
                # Left-pad so that tokens + EOS fill exactly max_seq_length.
                source_tokens = [text_encoder.PAD_ID] * (
                    max_seq_length - srclen - 1) + source_tokens + [
                        text_encoder.EOS_ID
                    ]

            feed_dict = model.get_source_encoding_feed_dict(
                np.array([source_tokens]))
            model.set_forward_only(True)
            sourceEncodings = sess.run([model.src_seq_embedding],
                                       feed_dict=feed_dict)
            sourceEncodings = np.vstack(sourceEncodings)
            # Score the query against every target by dot product.
            distances = np.dot(sourceEncodings, targetEncodings.T)
            rankedScore, rankedIdx = data_utils.getSortedResults(distances)
            top_confs = rankedScore[0][:nbest]
            top_tgtIDs = [targetIDs[lbl] for lbl in rankedIdx[0][:nbest]]
            top_tgtNames = [targetIDNameMap[tgt_id] for tgt_id in top_tgtIDs]

            print('Top %s Prediction results are:\n' % nbest)
            # zip() stops at the shortest list, so an index holding fewer
            # than ``nbest`` targets no longer raises IndexError.
            ranked = zip(top_tgtIDs, top_confs, top_tgtNames)
            for rank, (tgt_id, conf, tgt_name) in enumerate(ranked, start=1):
                print('top%d:  %s , %f ,  %s ' %
                      (rank, tgt_id, conf, tgt_name))
            print("> ", end="")

            sys.stdout.flush()
            sentence = sys.stdin.readline()

Ejemplo n.º 13

0

Mostrar archivo

# Translation direction selected in the UI for this run.
state.direction_choice = st.selectbox('Direction', directions)


@st.cache(allow_output_mutation=True)
def init(direction_choice):
    """Return the resource and welcome prompt for a translation direction.

    The result depends only on ``direction_choice`` (not on global state),
    so the value cached by ``st.cache`` always matches the argument it was
    cached under.

    Args:
      direction_choice: the selected direction; ``"English to Vietnamese"``
        selects the ``envi_pure_tall9`` resource, anything else selects
        ``vien_pure_tall9``.

    Returns:
      A tuple ``(resource, prompt)`` where ``resource`` is whatever
      ``get_resource`` returns (the caller unpacks it into two values) and
      ``prompt`` is the welcome text.
    """
    if direction_choice == "English to Vietnamese":
        return (
            get_resource('envi_pure_tall9'),
            'Welcome to the best ever translation project for Vietnamese !')
    return (get_resource('vien_pure_tall9'),
            'Chào mừng bạn đến với dự án dịch tiếng Việt tốt nhất !')


state.encoder = text_encoder.SubwordTextEncoder(vocab_file)

# Keep the raw vocabulary entries, one per line of the vocab file.
with open(vocab_file, 'r') as f:
    state.vocab = f.read().split('\n')

(state.model, state.model_path), state.prompt = init(state.direction_choice)

# Reset the interaction flags when the user switches direction; skipped while
# no previous choice has been recorded (prev_choice is None).
if state.direction_choice != state.prev_choice and state.prev_choice is not None:
    state.like = False
    state.submit = False
    state.first_time = True

state.prev_choice = state.direction_choice

write_ui()