def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for i, line in enumerate(lines): guid = f'{set_type}-{i}' text_a = tokenization.convert_to_unicode( line[REQUIRED_COLUMNS.index('text')]) if set_type == 'test': label = '0' else: label = tokenization.convert_to_unicode( line[REQUIRED_COLUMNS.index('label')]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples
def _create_examples(self, lines, set_type): """Creates examples for the training/dev/test sets.""" examples = [] for i, line in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, 1) if set_type == "test": text_a = tokenization.convert_to_unicode(line[1]) text_b = tokenization.convert_to_unicode(line[2]) label = "entailment" else: text_a = tokenization.convert_to_unicode(line[1]) text_b = tokenization.convert_to_unicode(line[2]) label = tokenization.convert_to_unicode(line[-1]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, i) text_a = tokenization.convert_to_unicode(line[3]) text_b = tokenization.convert_to_unicode(line[4]) if set_type == "test": label = "0" else: label = tokenization.convert_to_unicode(line[0]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): # Only the test set has a header if set_type == "test" and i == 0: continue guid = "%s-%s" % (set_type, i) if set_type == "test": text_a = tokenization.convert_to_unicode(line[1]) label = "0" else: text_a = tokenization.convert_to_unicode(line[3]) label = tokenization.convert_to_unicode(line[1]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples
def get_dev_examples(self, data_dir): """See base class.""" lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv")) examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "dev-%d" % (i) language = tokenization.convert_to_unicode(line[0]) if language != tokenization.convert_to_unicode(self.language): continue text_a = tokenization.convert_to_unicode(line[6]) text_b = tokenization.convert_to_unicode(line[7]) label = tokenization.convert_to_unicode(line[1]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
def _create_examples(self, lines, set_type): """Creates examples for the training/dev/test sets.""" examples = [] for i, line in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, i) text_a = tokenization.convert_to_unicode(line[7]) text_b = tokenization.convert_to_unicode(line[8]) if set_type == "test": label = 0.0 else: label = self.label_type( tokenization.convert_to_unicode(line[9])) examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
def get_train_examples(self, data_dir): """See base class.""" lines = self._read_tsv( os.path.join(data_dir, "multinli", "multinli.train.%s.tsv" % self.language)) examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "train-%d" % (i) text_a = tokenization.convert_to_unicode(line[0]) text_b = tokenization.convert_to_unicode(line[1]) label = tokenization.convert_to_unicode(line[2]) if label == tokenization.convert_to_unicode("contradictory"): label = tokenization.convert_to_unicode("contradiction") examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
def create_training_instances(input_files, tokenizer, max_seq_length,
                              dupe_factor, short_seq_prob, masked_lm_prob,
                              max_predictions_per_seq, rng,
                              do_whole_word_mask=False):
    """Create `TrainingInstance`s from raw text."""
    all_documents = [[]]

    # Input file format:
    # (1) One sentence per line. These should ideally be actual sentences, not
    # entire paragraphs or arbitrary spans of text. (Because we use the
    # sentence boundaries for the "next sentence prediction" task).
    # (2) Blank lines between documents. Document boundaries are needed so
    # that the "next sentence prediction" task doesn't span between documents.
    for input_file in input_files:
        with tf.io.gfile.GFile(input_file, "rb") as reader:
            while True:
                line = tokenization.convert_to_unicode(reader.readline())
                if not line:
                    break
                line = line.strip()

                # Empty lines are used as document delimiters.
                if not line:
                    all_documents.append([])
                tokens = tokenizer.tokenize(line)
                if tokens:
                    all_documents[-1].append(tokens)

    # Remove empty documents.
    all_documents = [x for x in all_documents if x]
    rng.shuffle(all_documents)

    vocab_words = list(tokenizer.vocab.keys())
    instances = []
    for _ in range(dupe_factor):
        for document_index in range(len(all_documents)):
            instances.extend(
                create_instances_from_document(
                    all_documents, document_index, max_seq_length,
                    short_seq_prob, masked_lm_prob, max_predictions_per_seq,
                    vocab_words, rng, do_whole_word_mask))

    rng.shuffle(instances)
    return instances
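# A minimal usage sketch for `create_training_instances`, assuming a
# BERT-style `tokenization.FullTokenizer` and hypothetical file paths
# ("vocab.txt", "corpus/*.txt"); adjust these to your own setup.
import random

import tensorflow as tf

tokenizer = tokenization.FullTokenizer(
    vocab_file="vocab.txt", do_lower_case=True)
rng = random.Random(12345)

instances = create_training_instances(
    input_files=tf.io.gfile.glob("corpus/*.txt"),
    tokenizer=tokenizer,
    max_seq_length=128,
    dupe_factor=10,
    short_seq_prob=0.1,
    masked_lm_prob=0.15,
    max_predictions_per_seq=20,
    rng=rng)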
def _tokenize(self, utterance): """ Tokenize the utterance using word-piece tokenization used by BERT. Args: utterance: A string containing the utterance to be tokenized. Returns: bert_tokens: A list of tokens obtained by word-piece tokenization of the utterance. alignments: A dict mapping indices of characters corresponding to start and end positions of words (not subwords) to corresponding indices in bert_tokens list. inverse_alignments: A list of size equal to bert_tokens. Each element is a tuple containing the index of the starting and inclusive ending character of the word corresponding to the subword. This list is used during inference to map word-piece indices to spans in the original utterance. """ utterance = tokenization.convert_to_unicode(utterance) # After _naive_tokenize, spaces and punctuation marks are all retained, i.e. # direct concatenation of all the tokens in the sequence will be the # original string. tokens = _naive_tokenize(utterance) # Filter out empty tokens and obtain aligned character index for each token. alignments = {} char_index = 0 bert_tokens = [] # These lists store inverse alignments to be used during inference. bert_tokens_start_chars = [] bert_tokens_end_chars = [] for token in tokens: if token.strip(): subwords = self._tokenizer.tokenize(token) # Store the alignment for the index of starting character and the # inclusive ending character of the token. alignments[char_index] = len(bert_tokens) bert_tokens_start_chars.extend([char_index] * len(subwords)) bert_tokens.extend(subwords) # The inclusive ending character index corresponding to the word. inclusive_char_end = char_index + len(token) - 1 alignments[inclusive_char_end] = len(bert_tokens) - 1 bert_tokens_end_chars.extend([inclusive_char_end] * len(subwords)) char_index += len(token) inverse_alignments = list( zip(bert_tokens_start_chars, bert_tokens_end_chars)) return bert_tokens, alignments, inverse_alignments
def create_training_instances(input_dir, tokenizer, max_seq_length,
                              max_gapped_tokens, dupe_factor, rng):
    """Create training instances from multiple documents.

    The data format is:
    (1) Each document is in its own file.
    (2) Each sentence is on its own line.
    """
    all_documents = []
    for file_name in os.listdir(input_dir):
        with open(os.path.join(input_dir, file_name), "r") as reader:
            all_documents.append([])
            for line in reader.readlines():
                line = tokenization.convert_to_unicode(line)
                line = line.strip()
                if not line:
                    continue
                tokens = tokenizer.tokenize(line)
                all_documents[-1].append(tokens)

    # Remove empty documents.
    all_documents = [x for x in all_documents if x]
    rng.shuffle(all_documents)

    instances = []
    for _ in range(dupe_factor):
        for doc in all_documents:
            instances.extend(
                create_instances_from_document(
                    doc, max_seq_length, max_gapped_tokens, rng))
    rng.shuffle(instances)
    return instances
def preprocess_and_tokenize_input_files(
        input_files: Iterable[str],
        tokenizer: tokenization.FullSentencePieceTokenizer,
        use_eod: bool = True,
        do_lower_case: bool = False,
        log_example_freq: int = 100000) -> List[Tuple[np.array, np.array]]:
    """Preprocesses and encodes raw text from input files.

    This function preprocesses raw text and encodes it into tokens using a
    `SentencePieceModel` tokenization method. It also provides the sentence
    indicator for each token.

    Args:
        input_files: The list of input file names.
        tokenizer: The SentencePiece tokenizer that has the attribute
            `sp_model`.
        use_eod: Whether or not to use an EOD indicator. If `False`, then EOD
            is not included.
        do_lower_case: Whether or not to apply lower casing during raw text
            preprocessing.
        log_example_freq: The optional field for how many lines to process
            before emitting an info log.

    Returns:
        The preprocessed list. Each entry in the list is a tuple consisting of
        the token IDs and the sentence IDs.
    """
    all_data = []
    eod_symbol = special_symbols["<eod>"]

    total_number_of_lines = 0

    # Input file format:
    # (1) One sentence per line. These should ideally be actual sentences, not
    # entire paragraphs or arbitrary spans of text. (Because we use the
    # sentence boundaries for the "next sentence prediction" task).
    # (2) Blank lines between documents. Document boundaries are needed so
    # that the "next sentence prediction" task doesn't span between documents.
    for input_file in input_files:
        line_count = 0
        logging.info("Preprocessing %s", input_file)

        all_tokens = []
        all_sentence_ids = []

        sentence_id = True

        with tf.io.gfile.GFile(input_file, "rb") as reader:
            while True:
                line = tokenization.convert_to_unicode(reader.readline())
                if not line:
                    break

                line_count += 1
                if line_count % log_example_freq == 0:
                    logging.info("Loading line %d", line_count)

                line = line.strip()

                if not line:
                    if use_eod:
                        token_ids = [eod_symbol]
                        sentence_id = not sentence_id
                    else:
                        continue
                else:
                    preprocessed_line = _preprocess_line(
                        line=line, do_lower_case=do_lower_case)
                    token_ids = tokenization.encode_ids(
                        sp_model=tokenizer.sp_model, text=preprocessed_line)

                all_tokens.extend(token_ids)
                all_sentence_ids.extend([sentence_id] * len(token_ids))
                sentence_id = not sentence_id

        logging.info("Finished processing %s. Number of lines: %d",
                     input_file, line_count)
        if line_count == 0:
            continue

        total_number_of_lines += line_count

        all_tokens = np.array(all_tokens, dtype=np.int64)
        # `np.bool` was removed in recent NumPy releases; use the builtin bool.
        all_sentence_ids = np.array(all_sentence_ids, dtype=bool)
        all_data.append((all_tokens, all_sentence_ids))

    logging.info("Completed text preprocessing. Total number of lines: %d",
                 total_number_of_lines)

    return all_data
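# A minimal usage sketch for `preprocess_and_tokenize_input_files`, assuming
# the `tokenization` module provides `FullSentencePieceTokenizer` and that a
# SentencePiece model exists at the hypothetical path "spiece.model".
tokenizer = tokenization.FullSentencePieceTokenizer("spiece.model")
data = preprocess_and_tokenize_input_files(
    input_files=["corpus/part-0.txt"],  # hypothetical input file
    tokenizer=tokenizer,
    use_eod=True,
    do_lower_case=False)
token_ids, sentence_ids = data[0]
print(token_ids.shape, sentence_ids.shape)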
def process(input_file, tokenizer, rng, args):
    # logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s [%(levelname)-5.5s] [%(name)-12.12s]: %(message)s')
    logger = logging.getLogger(__name__)

    # read & tokenize docs
    all_documents = [[]]
    logger.info('Tokenizing documents...')
    num_logged_examples = 0
    with tf.io.gfile.GFile(input_file, 'rb') as reader:
        # Count the lines once so the progress bar has a total.
        num_lines = sum(1 for _ in reader)
    pbar = tqdm(total=num_lines, desc='Tokenization')
    with tf.io.gfile.GFile(input_file, 'rb') as reader:
        while True:
            line = tokenization.convert_to_unicode(reader.readline())
            if not line:
                break
            line = line.strip()
            # Empty lines are used as document delimiters
            if not line:
                all_documents.append([])
            tokens = tokenizer.tokenize(line)
            if tokens:
                all_documents[-1].append(tokens)
                if num_logged_examples < args.num_logged_samples:
                    print('**** Tokenization example ****')
                    print(line)
                    print(tokens)
                    print('****')
                    num_logged_examples += 1
            # Advance the progress bar by one line per iteration.
            pbar.update(1)
    pbar.close()

    # shuffle
    logger.info('Shuffling documents...')
    all_documents = [x for x in all_documents if x]
    rng.shuffle(all_documents)
    num_documents = len(all_documents)
    logger.info(f'Tokenized a total of {num_documents:,} documents')

    # create instances
    logger.info('Creating instances...')
    vocab_words = list(tokenizer.vocab.keys())
    instances = []
    do_whole_word_masking = PRETRAINED_MODELS[
        args.model_class]['do_whole_word_masking']
    for _ in range(args.dupe_factor):
        for document_index in trange(
                len(all_documents), desc='Generating training instances'):
            instances.extend(
                create_instances_from_document(
                    all_documents, document_index, args.max_seq_length,
                    args.short_seq_prob, args.masked_lm_prob,
                    args.max_predictions_per_seq, vocab_words, rng,
                    do_whole_word_masking))
    all_documents = None  # free memory
    num_instances = len(instances)
    logger.info(f'Collected a total of {num_instances:,} training instances')
    logger.info('Shuffling training instances...')
    rng.shuffle(instances)

    # write tf records file
    _type = os.path.basename(os.path.dirname(input_file))
    if _type in ['train', 'dev', 'test']:
        output_folder = os.path.join(
            DATA_DIR, 'pretrain', args.run_name, 'tfrecords', _type)
    else:
        _type = 'default'
        output_folder = os.path.join(
            DATA_DIR, 'pretrain', args.run_name, 'tfrecords')
    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)
    input_file_name = os.path.basename(input_file)
    output_file = os.path.join(output_folder, f'{input_file_name}.tfrecords')
    logger.info(f'Writing to {output_file}...')
    write_instance_to_example_files(
        instances, tokenizer, args.max_seq_length,
        args.max_predictions_per_seq, [output_file], args.gzipped)
    return num_documents, num_instances, _type