def main(unused_argv):
  if FLAGS.corpus_filepattern and FLAGS.vocab_filepattern:
    raise ValueError(
        'Must only provide one of --corpus_filepattern or --vocab_filepattern')
  elif FLAGS.corpus_filepattern:
    token_counts = tokenizer.corpus_token_counts(
        FLAGS.corpus_filepattern,
        FLAGS.corpus_max_lines,
        FLAGS.do_lower,
        split_on_newlines=FLAGS.split_on_newlines)
  elif FLAGS.vocab_filepattern:
    token_counts = tokenizer.vocab_token_counts(FLAGS.vocab_filepattern,
                                                FLAGS.do_lower,
                                                FLAGS.corpus_max_lines)
  else:
    raise ValueError(
        'Must provide one of --corpus_filepattern or --vocab_filepattern')

  encoder = text_encoder.SubwordTextEncoder()
  encoder.build_from_token_counts(token_counts, FLAGS.min_count,
                                  FLAGS.num_iterations)
  encoder.store_to_file(FLAGS.output_filename, add_single_quotes=False)
def main(unused_argv):
  if FLAGS.log_level not in ['DEBUG', 'INFO', 'ERROR']:
    raise ValueError('Set verbosity among "DEBUG", "INFO", "ERROR"')
  tf.logging.set_verbosity(FLAGS.log_level)

  if FLAGS.corpus_filepattern and FLAGS.vocab_filepattern:
    raise ValueError(
        'Must only provide one of --corpus_filepattern or --vocab_filepattern')
  elif FLAGS.corpus_filepattern:
    token_counts = tokenizer.corpus_token_counts(
        FLAGS.corpus_filepattern,
        FLAGS.corpus_max_lines,
        split_on_newlines=FLAGS.split_on_newlines,
        additional_chars=FLAGS.additional_chars)
  elif FLAGS.vocab_filepattern:
    token_counts = tokenizer.vocab_token_counts(FLAGS.vocab_filepattern,
                                                FLAGS.corpus_max_lines)
  else:
    raise ValueError(
        'Must provide one of --corpus_filepattern or --vocab_filepattern')

  encoder = text_encoder.SubwordTextEncoder()
  encoder.build_from_token_counts(
      token_counts,
      FLAGS.min_count,
      FLAGS.num_iterations,
      max_subtoken_length=FLAGS.max_subtoken_length,
      backward=FLAGS.backward)
  encoder.store_to_file(FLAGS.output_filename, add_single_quotes=False)
def main(unused_argv):
  if FLAGS.corpus_filepattern and FLAGS.vocab_filepattern:
    raise ValueError(
        'Must only provide one of --corpus_filepattern or --vocab_filepattern')
  elif FLAGS.corpus_filepattern:
    token_counts = tokenizer.corpus_token_counts(
        FLAGS.corpus_filepattern,
        FLAGS.corpus_max_lines,
        split_on_newlines=FLAGS.split_on_newlines)
  elif FLAGS.vocab_filepattern:
    token_counts = tokenizer.vocab_token_counts(FLAGS.vocab_filepattern,
                                                FLAGS.corpus_max_lines)
  else:
    raise ValueError(
        'Must provide one of --corpus_filepattern or --vocab_filepattern')

  # encoder = text_encoder.SubwordTextEncoder()
  # encoder.build_from_token_counts(token_counts, FLAGS.min_count,
  #                                 FLAGS.num_iterations)
  # encoder.build_to_target_size(32000, token_counts, FLAGS.min_count,
  #                              5 * FLAGS.min_count)
  # encoder.build_to_target_size(32000, token_counts, 2, 10000)
  encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
      32000, token_counts, 1, 1000)
  encoder.store_to_file(FLAGS.output_filename)
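# Minimal usage sketch (not part of the snippets above): once the vocabulary
# has been stored, it can be reloaded and used to round-trip text through
# subword ids. The vocabulary path below is a placeholder.
from tensor2tensor.data_generators import text_encoder

encoder = text_encoder.SubwordTextEncoder(filename='/tmp/my_vocab.subwords')
ids = encoder.encode('Hello subword world')   # list of int subtoken ids
text = encoder.decode(ids)                    # reconstructs the original string
print(encoder.vocab_size, ids, text)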
def __init__(self, work_dir, rawdata_dir, rawvocabsize, max_seq_length):
    json_path = work_dir + '/compressed'
    if os.path.exists(json_path):
        # load data from json
        print('loading saved json data from %s' % json_path)
        with open(json_path, 'r') as fin:
            gdict = json.load(fin)
        for name, val in gdict.items():
            setattr(self, name, val)
        # setup encoder from vocabulary file
        vocabFile = work_dir + '/vocabulary.txt'
        if os.path.exists(vocabFile):
            print("Loading supplied vocabulary file: %s" % vocabFile)
            encoder = text_encoder.SubwordTextEncoder(filename=vocabFile)
            print("Total vocab size is: %d" % encoder.vocab_size)
        else:
            print("No supplied vocabulary file found. "
                  "Building a new vocabulary from the training data ...")
            token_counts = tokenizer.corpus_token_counts(
                work_dir + '/*.Corpus', 2000000, split_on_newlines=True)
            encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
                rawvocabsize, token_counts, 2, 1000)
            encoder.store_to_file(vocabFile)
            print("New vocabulary constructed.")
        self.encoder = encoder
        self.max_seq_length = int(self.max_seq_length)
        self.vocab_size = encoder.vocab_size
        print('-')
        print('Vocab size:', self.vocab_size, 'unique words')
        print('-')
        print('Max allowed sequence length:', self.max_seq_length)
        print('-')
    else:
        print('generating data from data path: %s' % rawdata_dir)
        encoder, trainCorpus, evalCorpus, encodedFullTargetSpace, tgtIdNameMap = \
            data_utils.prepare_raw_data(
                rawdata_dir, work_dir, rawvocabsize, max_seq_length)
        self.encoder = encoder
        self.rawTrainPosCorpus = trainCorpus
        self.rawEvalCorpus = evalCorpus
        self.max_seq_length = max_seq_length
        self.encodedFullTargetSpace = encodedFullTargetSpace
        self.tgtIdNameMap = tgtIdNameMap
        self.vocab_size = encoder.vocab_size
        self.fullSetTargetIds = list(encodedFullTargetSpace.keys())
        self.rawnegSetLen = len(self.fullSetTargetIds)
        print('-')
        print('Vocab size:', self.vocab_size, 'unique words')
        print('-')
        print('Max allowed sequence length:', self.max_seq_length)
        print('-')
        # cache all plain (non-callable, non-encoder) attributes to json
        gdict = {}
        for name, attr in self.__dict__.items():
            if not name.startswith("__") and name != 'encoder':
                if not callable(attr) and not isinstance(attr, staticmethod):
                    gdict[name] = attr
        with open(json_path, 'w') as fout:
            json.dump(gdict, fout)
        print('Processed data dumped')
def main(unused_argv):
  if FLAGS.corpus_filepattern and FLAGS.vocab_filepattern:
    raise ValueError(
        'Must only provide one of --corpus_filepattern or --vocab_filepattern')
  elif FLAGS.corpus_filepattern:
    token_counts = tokenizer.corpus_token_counts(
        FLAGS.corpus_filepattern,
        FLAGS.corpus_max_lines,
        split_on_newlines=FLAGS.split_on_newlines,
        additional_chars=FLAGS.additional_chars,
        do_lower_case=FLAGS.do_lower_case)
  elif FLAGS.vocab_filepattern:
    token_counts = tokenizer.vocab_token_counts(FLAGS.vocab_filepattern,
                                                FLAGS.corpus_max_lines,
                                                FLAGS.do_lower_case)
  else:
    raise ValueError(
        'Must provide one of --corpus_filepattern or --vocab_filepattern')

  # The remainder of the script assumes --raw_vocab is provided: the original
  # vocabulary is kept as reserved tokens and later merged into the output.
  reserved_tokens = None
  if FLAGS.raw_vocab:
    with open(FLAGS.raw_vocab, 'r', encoding='utf-8') as fin:
      lines = fin.readlines()
    reserved_tokens = [s.strip() for s in lines if len(s) > 0]

  print(len(token_counts))
  print(len(reserved_tokens))

  target_size = FLAGS.vocab_size
  if target_size <= len(reserved_tokens):
    raise ValueError(
        "The vocab_size must be larger than the origin vocab's size")
  if target_size >= len(token_counts):
    raise ValueError(
        "The vocab_size is too large. Please set it smaller or prepare more corpus.")

  min_val = 1
  max_val = len(token_counts) // (target_size**0.5)
  fd, temp_path = tempfile.mkstemp()
  encoder = SubwordTextEncoder.build_to_target_size(
      target_size,
      token_counts,
      min_val,
      max_val,
      num_iterations=FLAGS.num_iterations,
      reserved_tokens=reserved_tokens,
      max_subtoken_length=FLAGS.max_subtoken_length)
  # encoder = SubwordTextEncoder()
  # encoder.build_from_token_counts(token_counts, FLAGS.min_count,
  #                                 FLAGS.num_iterations,
  #                                 reserved_tokens=reserved_tokens,
  #                                 max_subtoken_length=FLAGS.max_subtoken_length)
  encoder.store_to_file(temp_path, add_single_quotes=False)
  merge_output_file_with_bert_vocab(FLAGS.output_filename, FLAGS.raw_vocab,
                                    temp_path)
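# Example invocation for the snippet above (a sketch; the script name
# "subword_builder.py" is a placeholder, and only flags that appear in the
# snippet are used):
#
#   python subword_builder.py \
#     --corpus_filepattern='data/*.txt' \
#     --raw_vocab=bert_base_vocab.txt \
#     --vocab_size=50000 \
#     --output_filename=merged_vocab.txt
#
# The subword vocabulary is first written to a temp file and then merged with
# the original BERT vocabulary into --output_filename.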
def prepare_raw_data(raw_data_dir, processed_data_dir, vocabulary_size,
                     neg_samples, max_seq_length):
    """Get SSE training and evaluation data, and create the tokenizer and vocabulary.

    :param raw_data_dir: directory containing the raw zipped dataset.
    :param processed_data_dir: directory in which the processed data sets are stored.
    :param vocabulary_size: target size when a new vocabulary has to be built.
    :param neg_samples: number of negative samples.
    :param max_seq_length: max number of tokens in a single source/target sequence.
    :return: (encoder, trainCorpus, evalCorpus, encodedFullTargetSpace, tgtIdNameMap)
    """
    # unzip corpus to the specified processed directory.
    get_data_set(raw_data_dir, processed_data_dir)

    # generate vocab file if not available, otherwise use the supplied vocab file for the encoder
    vocabFile = processed_data_dir + '/vocabulary.txt'
    if gfile.Exists(vocabFile):
        print("Loading supplied vocabulary file: %s" % vocabFile)
        encoder = text_encoder.SubwordTextEncoder(filename=vocabFile)
        print("Total vocab size is: %d" % encoder.vocab_size)
    else:
        print("No supplied vocabulary file found. "
              "Building a new vocabulary from the training data ...")
        token_counts = tokenizer.corpus_token_counts(
            processed_data_dir + '/*.Corpus', 1000000, split_on_newlines=True)
        encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
            vocabulary_size, token_counts, 2, 1000)
        encoder.store_to_file(vocabFile)
        print("New vocabulary constructed.")

    # create encoded TargetSpace data
    encodedFullTargetSpace = {}
    tgtIdNameMap = {}
    encodedFullTargetFile = codecs.open(
        os.path.join(processed_data_dir, "encoded.FullTargetSpace"), 'w', 'utf-8')
    for line in codecs.open(os.path.join(processed_data_dir, "targetIDs"), 'r', 'utf-8'):
        tgtSeq, id = line.strip().split('\t')
        token_ids = encoder.encode(tgtSeq.lower())
        seqlen = len(token_ids)
        if seqlen > max_seq_length - 1:
            print('Error detected! Target:\n %s \nIts sequence length is %d, '
                  'which is longer than MAX_SEQ_LENGTH of %d. Try to increase the limit.'
                  % (tgtSeq, seqlen, max_seq_length))
            continue
        token_ids = token_ids + [text_encoder.EOS_ID] + \
            [text_encoder.PAD_ID] * (max_seq_length - seqlen - 1)
        encodedFullTargetSpace[id] = token_ids
        tgtIdNameMap[id] = tgtSeq
        encodedFullTargetFile.write(
            id + '\t' + tgtSeq.strip() + '\t' +
            ','.join([str(i) for i in token_ids]) + '\n')
    encodedFullTargetFile.close()

    # create positive evaluation corpus: (source_tokens, verifiedTgtIds)
    evalCorpus = gen_postive_corpus(
        os.path.join(processed_data_dir, "EvalPairs"), encodedFullTargetSpace,
        encoder, max_seq_length)

    # create positive training corpus: (source_tokens, verifiedTgtIds)
    trainCorpus = gen_postive_corpus(
        os.path.join(processed_data_dir, "TrainPairs"), encodedFullTargetSpace,
        encoder, max_seq_length)

    return encoder, trainCorpus, evalCorpus, encodedFullTargetSpace, tgtIdNameMap
def prepare_raw_data(raw_data_dir, processed_data_dir, vocabulary_size,
                     task_type, max_seq_length):
    """Get SSE training/evaluation data into data_dir, create vocabularies and tokenized data.

    Args:
      raw_data_dir: directory containing the raw zipped dataset.
      processed_data_dir: directory in which the processed data sets will be stored.
      vocabulary_size: size of the vocabulary to create and use if no vocabulary
        file is found in rawdata. Otherwise, the supplied vocabulary file is used.
      task_type: each task_type has a slightly different raw data format and needs
        different treatment. A classification task usually has TrainPairs,
        EvalPairs and a targetSpaceID file; the same applies to search,
        cross-lingual search and question-answer tasks.
      max_seq_length: max number of tokens in a single source/target sequence.

    Returns:
      A tuple of 5 elements:
        (1) the SubwordTextEncoder built from (or loaded for) the vocabulary,
        (2) the encoded training corpus,
        (3) the encoded evaluation (dev) corpus,
        (4) the encoded full target space: targetID -> sequence of target token IDs,
        (5) a map from targetID to target name.
    """
    # extract corpus to the specified processed directory.
    get_data_set(raw_data_dir, processed_data_dir)

    # generate vocab file if not available, otherwise use the supplied vocab file for the encoder
    vocabFile = processed_data_dir + '/vocabulary.txt'
    if gfile.Exists(vocabFile):
        print("Loading supplied vocabulary file: %s" % vocabFile)
        encoder = text_encoder.SubwordTextEncoder(filename=vocabFile)
        print("Total vocab size is: %d" % encoder.vocab_size)
    else:
        print("No supplied vocabulary file found. "
              "Building a new vocabulary from the training data ...")
        token_counts = tokenizer.corpus_token_counts(
            processed_data_dir + '/*.Corpus', 1000000, split_on_newlines=True)
        encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
            vocabulary_size, token_counts, 2, 1000)
        encoder.store_to_file(vocabFile)
        print("New vocabulary constructed.")

    # create training corpus and evaluation corpus per task_type
    task = task_type.lower().strip()
    if task == "classification":
        train_corpus, dev_corpus, encodedTgtSpace, tgtIdNameMap = get_classification_corpus(
            processed_data_dir, encoder, max_seq_length)
    elif task in ["ranking", "crosslingual"]:
        train_corpus, dev_corpus, encodedTgtSpace, tgtIdNameMap = get_search_corpus(
            processed_data_dir, encoder, max_seq_length)
    elif task == "qna":
        train_corpus, dev_corpus, encodedTgtSpace, tgtIdNameMap = get_questionAnswer_corpus(
            processed_data_dir, encoder, max_seq_length)
    else:
        raise ValueError("Unsupported task_type. Please use one of: "
                         "classification, ranking, crosslingual, qna")

    return encoder, train_corpus, dev_corpus, encodedTgtSpace, tgtIdNameMap
def vocab_extend(corpus, raw_vocab, output_filename, interval=10000, threshold=0.01):
    """
    @description: Incrementally grow the subword vocabulary, `interval` entries
        at a time, until the relative language-model improvement drops below
        `threshold`, then merge the result with the original vocabulary.
    @param corpus: path of the corpus file used for token counting and for
        evaluating the language model.
    @param raw_vocab: path of the original (e.g. BERT) vocabulary to preserve.
    @param output_filename: path of the merged output vocabulary.
    @param interval: number of subwords added per extension step.
    @param threshold: stopping criterion on the relative language-model gain.
    @Returns: None. The merged vocabulary is written to `output_filename`.
    """
    documents = []
    for line in open(corpus, "r", encoding='utf-8'):
        line = line.replace('\n', '')
        if len(line) < 5:
            continue
        documents.append(line)
    print("documents: " + str(len(documents)))

    token_counts = tokenizer.corpus_token_counts(
        corpus,
        corpus_max_lines=4400000,
        split_on_newlines=True,
        additional_chars="",
        do_lower_case=True)

    lines = open(raw_vocab, 'r', encoding='utf-8').readlines()
    reserved_tokens = [s.strip() for s in lines if len(s) > 0]

    random.shuffle(documents)
    origin_size = (len(reserved_tokens) // interval) * interval
    pre_lm = compute_language_model(documents, raw_vocab)
    print("origin_size: " + str(origin_size))
    print("pre_lm: " + str(pre_lm))

    target_size = origin_size
    while True:
        target_size = target_size + interval
        _, temp_vocab = build_target_size_vocab(token_counts, reserved_tokens, target_size)
        now_lm = compute_language_model(documents, temp_vocab)
        print('now_lm: ' + str(now_lm))
        delta = (pre_lm - now_lm) / pre_lm
        print('delta: ' + str(delta))
        if delta <= threshold:
            merge_output_file_with_bert_vocab(output_filename, raw_vocab, temp_vocab)
            break
        pre_lm = now_lm
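# The helpers compute_language_model, build_target_size_vocab and
# merge_output_file_with_bert_vocab are defined elsewhere in this project.
# Below is only a rough sketch of one plausible compute_language_model: a
# unigram subword model scored as average negative log2-likelihood per
# character, which typically decreases as the vocabulary grows. The actual
# implementation may differ.
import math
from collections import Counter

from tensor2tensor.data_generators import text_encoder


def compute_language_model_sketch(documents, vocab_file):
    encoder = text_encoder.SubwordTextEncoder(filename=vocab_file)
    counts = Counter()
    total_chars = 0
    for doc in documents:
        counts.update(encoder.encode(doc))
        total_chars += len(doc)
    total_tokens = sum(counts.values())
    # negative log2-likelihood of the corpus under the unigram model,
    # normalized by character count so scores are comparable across vocabularies
    nll = -sum(c * math.log2(c / total_tokens) for c in counts.values())
    return nll / max(total_chars, 1)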