def main(unused_argv):
    if FLAGS.corpus_filepattern and FLAGS.vocab_filepattern:
        raise ValueError(
            'Must only provide one of --corpus_filepattern or --vocab_filepattern'
        )

    elif FLAGS.corpus_filepattern:
        token_counts = tokenizer.corpus_token_counts(
            FLAGS.corpus_filepattern,
            FLAGS.corpus_max_lines,
            FLAGS.do_lower,
            split_on_newlines=FLAGS.split_on_newlines)

    elif FLAGS.vocab_filepattern:
        token_counts = tokenizer.vocab_token_counts(FLAGS.vocab_filepattern,
                                                    FLAGS.do_lower,
                                                    FLAGS.corpus_max_lines)

    else:
        raise ValueError(
            'Must provide one of --corpus_filepattern or --vocab_filepattern')

    encoder = text_encoder.SubwordTextEncoder()
    encoder.build_from_token_counts(token_counts, FLAGS.min_count,
                                    FLAGS.num_iterations)
    encoder.store_to_file(FLAGS.output_filename, add_single_quotes=False)
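
These main functions assume the tensor2tensor subword-vocabulary tooling plus TF1-style flags. Below is a minimal sketch of the imports and flag definitions the example above relies on; the flag names mirror the tensor2tensor subword builder, the fork-specific --do_lower flag is an assumption, and the later variants add further flags (e.g. --max_subtoken_length, --backward, --additional_chars) in the same way.

# Minimal sketch of the surrounding module; adjust names/defaults to the actual project.
import tensorflow as tf

from tensor2tensor.data_generators import text_encoder
from tensor2tensor.data_generators import tokenizer

FLAGS = tf.flags.FLAGS

tf.flags.DEFINE_string('output_filename', '/tmp/my.subword_text_encoder',
                       'where to store the SubwordTextEncoder')
tf.flags.DEFINE_string('corpus_filepattern', '', 'corpus of one or more text files')
tf.flags.DEFINE_string('vocab_filepattern', '', 'one or more vocabulary files '
                       '(one word per line as "word,count")')
tf.flags.DEFINE_integer('min_count', 5, 'minimum subtoken count in corpus')
tf.flags.DEFINE_integer('corpus_max_lines', 10000, 'how many lines of corpus to read')
tf.flags.DEFINE_integer('num_iterations', 4, 'number of iterations')
tf.flags.DEFINE_bool('split_on_newlines', True, 'break corpus into lines')
tf.flags.DEFINE_bool('do_lower', True, 'lowercase the corpus (fork-specific assumption)')

if __name__ == '__main__':
    tf.app.run()
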
def main(unused_argv):
    if FLAGS.log_level not in ['DEBUG', 'INFO', 'ERROR']:
        raise ValueError('log_level must be one of "DEBUG", "INFO", "ERROR"')
    tf.logging.set_verbosity(FLAGS.log_level)
    if FLAGS.corpus_filepattern and FLAGS.vocab_filepattern:
        raise ValueError(
            'Must only provide one of --corpus_filepattern or --vocab_filepattern'
        )

    elif FLAGS.corpus_filepattern:
        token_counts = tokenizer.corpus_token_counts(
            FLAGS.corpus_filepattern,
            FLAGS.corpus_max_lines,
            split_on_newlines=FLAGS.split_on_newlines,
            additional_chars=FLAGS.additional_chars)

    elif FLAGS.vocab_filepattern:
        token_counts = tokenizer.vocab_token_counts(FLAGS.vocab_filepattern,
                                                    FLAGS.corpus_max_lines)

    else:
        raise ValueError(
            'Must provide one of --corpus_filepattern or --vocab_filepattern')

    encoder = text_encoder.SubwordTextEncoder()
    encoder.build_from_token_counts(
        token_counts,
        FLAGS.min_count,
        FLAGS.num_iterations,
        max_subtoken_length=FLAGS.max_subtoken_length,
        backward=FLAGS.backward)
    encoder.store_to_file(FLAGS.output_filename, add_single_quotes=False)
def main(unused_argv):
    if FLAGS.corpus_filepattern and FLAGS.vocab_filepattern:
        raise ValueError(
            'Must only provide one of --corpus_filepattern or --vocab_filepattern'
        )

    elif FLAGS.corpus_filepattern:
        token_counts = tokenizer.corpus_token_counts(
            FLAGS.corpus_filepattern,
            FLAGS.corpus_max_lines,
            split_on_newlines=FLAGS.split_on_newlines)

    elif FLAGS.vocab_filepattern:
        token_counts = tokenizer.vocab_token_counts(FLAGS.vocab_filepattern,
                                                    FLAGS.corpus_max_lines)

    else:
        raise ValueError(
            'Must provide one of --corpus_filepattern or --vocab_filepattern')

    # encoder = text_encoder.SubwordTextEncoder()
    # encoder.build_from_token_counts(token_counts, FLAGS.min_count,
    #                                 FLAGS.num_iterations)

    # encoder.build_to_target_size(32000 , token_counts, FLAGS.min_count, 5 * FLAGS.min_count)
    # encoder.build_to_target_size(32000, token_counts, 2, 10000)

    encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
        32000, token_counts, 1, 1000)

    encoder.store_to_file(FLAGS.output_filename)
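
Once any of the builders above has produced an encoder, a quick encode/decode round trip is a useful sanity check. The methods used here are the standard SubwordTextEncoder API; the sample sentence is only illustrative.

# Quick sanity check on the encoder built above.
ids = encoder.encode('the quick brown fox')
print('subtoken ids:', ids)
print('round trip  :', encoder.decode(ids))
print('vocab size  :', encoder.vocab_size)
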
Example 4
def __init__(self, work_dir, rawdata_dir, rawvocabsize, max_seq_length):
    json_path = work_dir + '/compressed'
    if os.path.exists(json_path):
        # load data from json
        print('loading saved json data from %s' % json_path)
        with open(json_path, 'r') as fin:
            gdict = json.load(fin)
            for name, val in gdict.items():
                setattr(self, name, val)
        # setup encoder from vocabulary file
        vocabFile = work_dir + '/vocabulary.txt'
        if os.path.exists(vocabFile):
            print("Loading supplied vocabulary file: %s" % vocabFile)
            encoder = text_encoder.SubwordTextEncoder(filename=vocabFile)
            print("Total vocab size is: %d" % encoder.vocab_size)
        else:
            print(
                "No supplied vocabulary file found. Building new vocabulary based on training data ..."
            )
            token_counts = tokenizer.corpus_token_counts(
                work_dir + '/*.Corpus', 2000000, split_on_newlines=True)
            encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
                rawvocabsize, token_counts, 2, 1000)
            encoder.store_to_file(vocabFile)
            print("New vocabulary constructed.")
        self.encoder = encoder
        self.max_seq_length = int(self.max_seq_length)
        self.vocab_size = encoder.vocab_size
        print('-')
        print('Vocab size:', self.vocab_size, 'unique words')
        print('-')
        print('Max allowed sequence length:', self.max_seq_length)
        print('-')
    else:
        print('generating data from data path: %s' % rawdata_dir)
        encoder, trainCorpus, evalCorpus, encodedFullTargetSpace, tgtIdNameMap = data_utils.prepare_raw_data(
            rawdata_dir, work_dir, rawvocabsize, max_seq_length)
        self.encoder = encoder
        self.rawTrainPosCorpus = trainCorpus
        self.rawEvalCorpus = evalCorpus
        self.max_seq_length = max_seq_length
        self.encodedFullTargetSpace = encodedFullTargetSpace
        self.tgtIdNameMap = tgtIdNameMap
        self.vocab_size = encoder.vocab_size
        self.fullSetTargetIds = list(encodedFullTargetSpace.keys())
        self.rawnegSetLen = len(self.fullSetTargetIds)
        print('-')
        print('Vocab size:', self.vocab_size, 'unique words')
        print('-')
        print('Max allowed sequence length:', self.max_seq_length)
        print('-')
        gdict = {}
        for name, attr in self.__dict__.items():
            if not name.startswith("__") and name != 'encoder':
                if not callable(attr) and type(attr) is not staticmethod:
                    gdict[name] = attr
        with open(json_path, 'w') as fout:
            json.dump(gdict, fout)
        print('Processed data dumped')
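
The constructor above caches every attribute except the encoder into work_dir + '/compressed', so later runs skip the raw-data processing. A hedged usage sketch follows; the class name SSEData and the paths are assumptions, since the snippet only shows __init__.

# Hypothetical usage; class name and paths are placeholders.
data = SSEData(work_dir='/tmp/sse_work',
               rawdata_dir='/tmp/sse_rawdata',
               rawvocabsize=32000,
               max_seq_length=50)
print(data.vocab_size, 'subwords, max length', data.max_seq_length)
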
Example 5
def main(unused_argv):
    if FLAGS.corpus_filepattern and FLAGS.vocab_filepattern:
        raise ValueError(
            'Must only provide one of --corpus_filepattern or --vocab_filepattern'
        )

    elif FLAGS.corpus_filepattern:
        token_counts = tokenizer.corpus_token_counts(
            FLAGS.corpus_filepattern,
            FLAGS.corpus_max_lines,
            split_on_newlines=FLAGS.split_on_newlines,
            additional_chars=FLAGS.additional_chars,
            do_lower_case=FLAGS.do_lower_case)

    elif FLAGS.vocab_filepattern:
        token_counts = tokenizer.vocab_token_counts(FLAGS.vocab_filepattern,
                                                    FLAGS.corpus_max_lines,
                                                    FLAGS.do_lower_case)

    else:
        raise ValueError(
            'Must provide one of --corpus_filepattern or --vocab_filepattern')
    reserved_tokens = None
    if FLAGS.raw_vocab:
        with open(FLAGS.raw_vocab, 'r', encoding='utf-8') as fin:
            reserved_tokens = [s.strip() for s in fin if s.strip()]

    print('token types in corpus:', len(token_counts))
    print('reserved tokens:', len(reserved_tokens) if reserved_tokens else 0)
    target_size = FLAGS.vocab_size
    if reserved_tokens and target_size <= len(reserved_tokens):
        raise ValueError(
            "The vocab_size must be larger than the original vocab's size")
    if target_size >= len(token_counts):
        raise ValueError(
            "The vocab_size is too large. Please set it smaller or prepare more corpus."
        )
    min_val = 1
    max_val = len(token_counts) // (target_size**0.5)
    fd, temp_path = tempfile.mkstemp()
    encoder = SubwordTextEncoder.build_to_target_size(
        target_size,
        token_counts,
        min_val,
        max_val,
        num_iterations=FLAGS.num_iterations,
        reserved_tokens=reserved_tokens,
        max_subtoken_length=FLAGS.max_subtoken_length)
    # encoder = SubwordTextEncoder()
    # encoder.build_from_token_counts(token_counts, FLAGS.min_count,
    #                                 FLAGS.num_iterations, reserved_tokens=reserved_tokens, max_subtoken_length=FLAGS.max_subtoken_length)
    encoder.store_to_file(temp_path, add_single_quotes=False)
    merge_output_file_with_bert_vocab(FLAGS.output_filename, FLAGS.raw_vocab,
                                      temp_path)
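
merge_output_file_with_bert_vocab is project-specific and not shown in this snippet. Below is a minimal sketch of what such a merge step might do, assuming it keeps the original BERT vocabulary order and appends only the newly learned subwords; the real helper may differ.

# Hypothetical helper; the real merge_output_file_with_bert_vocab may differ.
def merge_output_file_with_bert_vocab(output_filename, bert_vocab_path, new_vocab_path):
    merged = []
    seen = set()
    for path in (bert_vocab_path, new_vocab_path):
        with open(path, 'r', encoding='utf-8') as fin:
            for line in fin:
                token = line.strip()
                if token and token not in seen:
                    merged.append(token)
                    seen.add(token)
    with open(output_filename, 'w', encoding='utf-8') as fout:
        fout.write('\n'.join(merged) + '\n')
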
def prepare_raw_data(raw_data_dir, processed_data_dir, vocabulary_size, neg_samples, max_seq_length):
  """
  Get SSE training, and Evaluation related data, create tokenizer and vocabulary.

  :param raw_data_dir:
  :param processed_data_dir:
  :param vocabulary_size:
  :param neg_samples:
  :param max_seq_length:
  :return:
  """
  # unzip corpus to the specified processed directory.
  get_data_set(raw_data_dir, processed_data_dir)

  # generate vocab file if not available, otherwise, use supplied vocab file for encoder
  vocabFile = processed_data_dir + '/vocabulary.txt'
  if gfile.Exists(vocabFile):
    print("Loading supplied vocabluary file: %s" % vocabFile)
    encoder = text_encoder.SubwordTextEncoder(filename=vocabFile)
    print("Total vocab size is: %d" % encoder.vocab_size)
  else:
    print("No supplied vocabulary file found. Build new vocabulary based on training data ....")
    token_counts = tokenizer.corpus_token_counts(processed_data_dir + '/*.Corpus', 1000000, split_on_newlines=True)
    encoder = text_encoder.SubwordTextEncoder.build_to_target_size(vocabulary_size, token_counts, 2, 1000)
    encoder.store_to_file(vocabFile)
    print("New vocabulary constructed.")

  # create encoded TargetSpace Data
  encodedFullTargetSpace = {}
  tgtIdNameMap = {}
  encodedFullTargetFile = codecs.open(os.path.join(processed_data_dir, "encoded.FullTargetSpace"), 'w', 'utf-8')
  for line in codecs.open(os.path.join(processed_data_dir, "targetIDs"), 'r', 'utf-8'):
    tgtSeq, id = line.strip().split('\t')
    token_ids = encoder.encode(tgtSeq.lower())
    seqlen = len(token_ids)
    if seqlen > max_seq_length - 1:
      print(
        'Error detected!\n Target:\n %s \n Its sequence length is %d, which is longer than MAX_SEQ_LENGTH of %d. Try increasing the limit.' % (
          tgtSeq, seqlen, max_seq_length))
      continue
    token_ids = token_ids + [text_encoder.EOS_ID] + [text_encoder.PAD_ID] * (max_seq_length - seqlen - 1)
    encodedFullTargetSpace[id] = token_ids
    tgtIdNameMap[id] = tgtSeq
    encodedFullTargetFile.write(id + '\t' + tgtSeq.strip() + '\t' + ','.join([str(i) for i in token_ids]) + '\n')
  encodedFullTargetFile.close()

  # create positive evaluation corpus: (source_tokens, verifiedTgtIds)
  evalCorpus = gen_postive_corpus(os.path.join(processed_data_dir, "EvalPairs"), encodedFullTargetSpace, encoder,
                                  max_seq_length)

  # create positive training corpus: (source_tokens, verifiedTgtIds)
  trainCorpus = gen_postive_corpus(os.path.join(processed_data_dir, "TrainPairs"), encodedFullTargetSpace,
                                         encoder, max_seq_length)
  return encoder, trainCorpus, evalCorpus, encodedFullTargetSpace, tgtIdNameMap
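
The target-encoding loop above appends an EOS token and pads every sequence to max_seq_length. The same logic as a small standalone helper, for illustration only; EOS_ID and PAD_ID come from text_encoder, and the helper name encode_and_pad is not part of the original module.

# Illustrative helper mirroring the encode-and-pad logic above.
def encode_and_pad(encoder, text, max_seq_length):
    token_ids = encoder.encode(text.lower())
    if len(token_ids) > max_seq_length - 1:
        raise ValueError('sequence of length %d exceeds max_seq_length - 1' % len(token_ids))
    padding = [text_encoder.PAD_ID] * (max_seq_length - len(token_ids) - 1)
    return token_ids + [text_encoder.EOS_ID] + padding
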
Example 7
def prepare_raw_data(raw_data_dir, processed_data_dir, vocabulary_size, task_type, max_seq_length):
  """Get SSE training/evaluation data into data_dir, create vocabularies and tokenized data.

  Args:
    raw_data_dir: directory containing the raw zipped dataset.
    processed_data_dir: directory in which the processed data sets will be stored.
    vocabulary_size: size of the vocabulary to create and use if no vocabulary file is found in the raw data; otherwise the supplied vocabulary file is used.
    task_type: different task_types have slightly different raw-data formats and need different treatment:
               the classification task usually has TrainPairs, EvalPairs and a targetSpaceID file;
               the ranking (search), cross-lingual search and question-answer (qna) tasks use their own corpus layouts (see get_search_corpus and get_questionAnswer_corpus).
    max_seq_length: maximum number of tokens in a single source/target sequence.
  Returns:
    A tuple of 5 elements:
      (1) the SubwordTextEncoder used for tokenization,
      (2) the encoded training corpus,
      (3) the encoded evaluation corpus,
      (4) the encoded full target space: targetID -> sequence of target token IDs,
      (5) a mapping from targetID to target name.
  """
  # extract corpus to the specified processed directory.
  get_data_set(raw_data_dir, processed_data_dir)

  # generate vocab file if not available, otherwise, use supplied vocab file for encoder
  vocabFile = processed_data_dir + '/vocabulary.txt'
  if gfile.Exists(vocabFile):
    print("Loading supplied vocabulary file: %s" % vocabFile)
    encoder = text_encoder.SubwordTextEncoder(filename=vocabFile)
    print("Total vocab size is: %d" % encoder.vocab_size)
  else:
    print("No supplied vocabulary file found. Build new vocabulary based on training data ....")
    token_counts = tokenizer.corpus_token_counts( processed_data_dir + '/*.Corpus', 1000000, split_on_newlines=True)
    encoder = text_encoder.SubwordTextEncoder.build_to_target_size( vocabulary_size, token_counts, 2, 1000 )
    encoder.store_to_file(vocabFile)
    print("New vocabulary constructed.")

  # create training corpus and evaluation corpus per task_type
  if task_type.lower().strip() == "classification":
    train_corpus, dev_corpus, encodedTgtSpace, tgtIdNameMap = get_classification_corpus(processed_data_dir, encoder, max_seq_length)
  elif task_type.lower().strip() in ["ranking", "crosslingual"]:
    train_corpus, dev_corpus, encodedTgtSpace, tgtIdNameMap = get_search_corpus(processed_data_dir, encoder, max_seq_length)
  elif task_type.lower().strip() == "qna":
    train_corpus, dev_corpus, encodedTgtSpace, tgtIdNameMap = get_questionAnswer_corpus(processed_data_dir, encoder, max_seq_length)
  else:
    raise ValueError("Unsupported task_type. Please use one of: classification, ranking, crosslingual, qna")

  return encoder, train_corpus, dev_corpus, encodedTgtSpace, tgtIdNameMap
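
A hedged usage sketch of the function above; the paths, vocabulary size and task_type are placeholders.

# Example call; all argument values are illustrative.
encoder, train_corpus, dev_corpus, encodedTgtSpace, tgtIdNameMap = prepare_raw_data(
    raw_data_dir='rawdata', processed_data_dir='processed',
    vocabulary_size=32000, task_type='classification', max_seq_length=50)
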
Example 8
def vocab_extend(corpus,
                 raw_vocab,
                 output_filename,
                 interval=10000,
                 threshold=0.01):
    """
    @description  : The function to get the incremental vocabulary for 
    
    @param  :
    
    @Returns  :
    
    """

    documents = []
    with open(corpus, "r", encoding='utf-8') as fin:
        for line in fin:
            line = line.strip('\n')
            if len(line) < 5:
                continue
            documents.append(line)
    print("documents: " + str(len(documents)))
    token_counts = tokenizer.corpus_token_counts(corpus,
                                                 corpus_max_lines=4400000,
                                                 split_on_newlines=True,
                                                 additional_chars="",
                                                 do_lower_case=True)
    with open(raw_vocab, 'r', encoding='utf-8') as fin:
        reserved_tokens = [s.strip() for s in fin if s.strip()]
    random.shuffle(documents)
    origin_size = (len(reserved_tokens) // interval) * interval
    pre_lm = compute_language_model(documents, raw_vocab)
    print("origin_size: " + str(origin_size))
    print("pre_lm: " + str(pre_lm))
    target_size = origin_size
    while True:
        target_size = target_size + interval
        _, temp_vocab = build_target_size_vocab(token_counts, reserved_tokens,
                                                target_size)
        now_lm = compute_language_model(documents, temp_vocab)
        print('now_lm: ' + str(now_lm))
        delta = (pre_lm - now_lm) / pre_lm
        print('delta: ' + str(delta))
        if delta <= threshold:
            merge_output_file_with_bert_vocab(output_filename, raw_vocab,
                                              temp_vocab)
            break
        pre_lm = now_lm
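
compute_language_model and build_target_size_vocab are used above but not defined in this snippet. Below is a rough stand-in for the language-model score, assuming it is meant to be the average per-token negative log-likelihood of the corpus under an add-one-smoothed unigram model over the candidate subword vocabulary; this is an assumption about the intended metric, not the project's actual implementation. It reuses the text_encoder module already imported by the surrounding code.

import math

# Hypothetical stand-in for compute_language_model.
def compute_language_model(documents, vocab_file):
    encoder = text_encoder.SubwordTextEncoder(filename=vocab_file)
    counts = {}
    total = 0
    for doc in documents:
        for token_id in encoder.encode(doc):
            counts[token_id] = counts.get(token_id, 0) + 1
            total += 1
    nll = 0.0
    for count in counts.values():
        prob = (count + 1.0) / (total + encoder.vocab_size)
        nll -= count * math.log(prob)
    return nll / max(total, 1)
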