def process_text(self, text):
  """Preprocess a raw text line via SentencePiece or plain unicode conversion."""
  if self.use_spm:
    return tokenization.preprocess_text(text, lower=self.do_lower_case)
  else:
    return tokenization.convert_to_unicode(text)
def create_training_instances(
    input_files,
    tokenizer,
    max_seq_length,
    dupe_factor,
    short_seq_prob,
    masked_lm_prob,
    max_predictions_per_seq,
    rng,
):
  """Create `TrainingInstance`s from raw text."""
  all_documents = [[]]

  # Input file format:
  # (1) One sentence per line. These should ideally be actual sentences, not
  # entire paragraphs or arbitrary spans of text. (Because we use the
  # sentence boundaries for the "next sentence prediction" task).
  # (2) Blank lines between documents. Document boundaries are needed so
  # that the "next sentence prediction" task doesn't span between documents.
  for input_file in input_files:
    with tf.gfile.GFile(input_file, FLAGS.input_file_mode) as reader:
      while True:
        line = reader.readline()
        if not FLAGS.spm_model_file:
          line = tokenization.convert_to_unicode(line)
        if not line:
          break
        if FLAGS.spm_model_file:
          line = tokenization.preprocess_text(
              line, lower=FLAGS.do_lower_case)
        else:
          line = line.strip()

        # Empty lines are used as document delimiters.
        if not line:
          all_documents.append([])
        tokens = tokenizer.tokenize(line)
        if tokens:
          all_documents[-1].append(tokens)

  # Remove empty documents.
  all_documents = [x for x in all_documents if x]
  rng.shuffle(all_documents)

  vocab_words = list(tokenizer.vocab.keys())
  instances = []
  for _ in range(dupe_factor):
    for document_index in range(len(all_documents)):
      instances.extend(
          create_instances_from_document(
              all_documents,
              document_index,
              max_seq_length,
              short_seq_prob,
              masked_lm_prob,
              max_predictions_per_seq,
              vocab_words,
              rng,
          ))

  rng.shuffle(instances)
  return instances
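# Illustrative usage sketch (not part of this module's original code). The
# file names, seed, and hyperparameter values below are hypothetical
# placeholders, and it is assumed that FLAGS has already been parsed and that
# tokenization.FullTokenizer accepts vocab_file/do_lower_case/spm_model_file
# arguments as elsewhere in this repository.
#
#   import random
#   import tokenization
#
#   tokenizer = tokenization.FullTokenizer(
#       vocab_file="vocab.txt",            # hypothetical vocab path
#       do_lower_case=True,
#       spm_model_file=FLAGS.spm_model_file)
#   rng = random.Random(12345)             # fixed seed for reproducibility
#   instances = create_training_instances(
#       input_files=["corpus.txt"],        # one sentence per line,
#                                          # blank lines between documents
#       tokenizer=tokenizer,
#       max_seq_length=512,
#       dupe_factor=10,
#       short_seq_prob=0.1,
#       masked_lm_prob=0.15,
#       max_predictions_per_seq=20,
#       rng=rng,
#   )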