# Shared imports for the excerpts below. This section collects fragments from
# several modules, so the exact import set of each source file may differ;
# `tokenization` is assumed to be the project-local module providing
# `printable_text`, `preprocess_text`, `encode_ids`, `encode_pieces` and
# `SPIECE_UNDERLINE`.
import collections
import copy

from absl import flags
from absl import logging
import numpy as np
import tensorflow as tf

import tokenization

FLAGS = flags.FLAGS  # The `gzip_compress` flag is defined by the enclosing binary.


def __repr__(self):
  s = ""
  s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
  s += ", question_text: %s" % (
      tokenization.printable_text(self.question_text))
  s += ", paragraph_text: [%s]" % (
      tokenization.printable_text(self.paragraph_text))
  if self.start_position is not None:
    s += ", start_position: %d" % (self.start_position)
  if self.end_position is not None:
    s += ", end_position: %d" % (self.end_position)
  if self.is_impossible is not None:
    s += ", is_impossible: %r" % (self.is_impossible)
  return s

def __str__(self):
  s = ""
  s += "tokens: %s\n" % (" ".join(
      [tokenization.printable_text(x) for x in self.tokens]))
  s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids]))
  s += "is_random_next: %s\n" % self.is_random_next
  s += "masked_lm_positions: %s\n" % (" ".join(
      [str(x) for x in self.masked_lm_positions]))
  s += "masked_lm_labels: %s\n" % (" ".join(
      [tokenization.printable_text(x) for x in self.masked_lm_labels]))
  s += "\n"
  return s

def convert_examples_to_features(examples, tokenizer, max_seq_length,
                                 doc_stride, max_query_length, is_training,
                                 output_fn, do_lower_case, batch_size=None):
  """Converts `examples` to `InputFeatures` and streams them to `output_fn`."""
  cnt_pos, cnt_neg = 0, 0
  base_id = 1000000000
  unique_id = base_id
  feature = None
  max_n, max_m = 1024, 1024
  f = np.zeros((max_n, max_m), dtype=np.float32)

  for (example_index, example) in enumerate(examples):

    if example_index % 100 == 0:
      logging.info("Converting %d/%d pos %d neg %d", example_index,
                   len(examples), cnt_pos, cnt_neg)

    query_tokens = tokenization.encode_ids(
        tokenizer.sp_model,
        tokenization.preprocess_text(example.question_text,
                                     lower=do_lower_case))

    if len(query_tokens) > max_query_length:
      query_tokens = query_tokens[0:max_query_length]

    paragraph_text = example.paragraph_text
    para_tokens = tokenization.encode_pieces(
        tokenizer.sp_model,
        tokenization.preprocess_text(example.paragraph_text,
                                     lower=do_lower_case))

    # Char-level maps between the concatenated sentence-piece text and the
    # piece index / piece boundaries.
    chartok_to_tok_index = []
    tok_start_to_chartok_index = []
    tok_end_to_chartok_index = []
    char_cnt = 0
    for i, token in enumerate(para_tokens):
      new_token = token.replace(tokenization.SPIECE_UNDERLINE, " ")
      chartok_to_tok_index.extend([i] * len(new_token))
      tok_start_to_chartok_index.append(char_cnt)
      char_cnt += len(new_token)
      tok_end_to_chartok_index.append(char_cnt - 1)

    tok_cat_text = "".join(para_tokens).replace(
        tokenization.SPIECE_UNDERLINE, " ")
    n, m = len(paragraph_text), len(tok_cat_text)

    if n > max_n or m > max_m:
      max_n = max(n, max_n)
      max_m = max(m, max_m)
      f = np.zeros((max_n, max_m), dtype=np.float32)

    g = {}

    # pylint: disable=cell-var-from-loop
    def _lcs_match(max_dist, n=n, m=m):
      """Longest common subsequence (LCS) dynamic program over a diagonal band."""
      f.fill(0)
      g.clear()

      # Longest common subsequence recurrence:
      # f[i, j] = max(f[i - 1, j], f[i, j - 1], f[i - 1, j - 1] + match(i, j))
      for i in range(n):
        # Unlike standard LCS, only a band of width `2 * max_dist` around the
        # diagonal is explored, because the mismatch between sentence pieces
        # and the original text is expected to be small.
        for j in range(i - max_dist, i + max_dist):
          if j >= m or j < 0:
            continue

          if i > 0:
            g[(i, j)] = 0
            f[i, j] = f[i - 1, j]

          if j > 0 and f[i, j - 1] > f[i, j]:
            g[(i, j)] = 1
            f[i, j] = f[i, j - 1]

          f_prev = f[i - 1, j - 1] if i > 0 and j > 0 else 0
          if (tokenization.preprocess_text(
              paragraph_text[i], lower=do_lower_case,
              remove_space=False) == tok_cat_text[j] and f_prev + 1 > f[i, j]):
            g[(i, j)] = 2
            f[i, j] = f_prev + 1
    # pylint: enable=cell-var-from-loop

    max_dist = abs(n - m) + 5
    for _ in range(2):
      _lcs_match(max_dist)
      if f[n - 1, m - 1] > 0.8 * n:
        break
      max_dist *= 2

    # Backtrace the LCS to align original characters with tokenized chars.
    orig_to_chartok_index = [None] * n
    chartok_to_orig_index = [None] * m
    i, j = n - 1, m - 1
    while i >= 0 and j >= 0:
      if (i, j) not in g:
        break
      if g[(i, j)] == 2:
        orig_to_chartok_index[i] = j
        chartok_to_orig_index[j] = i
        i, j = i - 1, j - 1
      elif g[(i, j)] == 1:
        j = j - 1
      else:
        i = i - 1

    if (all(v is None for v in orig_to_chartok_index) or
        f[n - 1, m - 1] < 0.8 * n):
      logging.info("MISMATCH DETECTED!")
      continue

    tok_start_to_orig_index = []
    tok_end_to_orig_index = []
    for i in range(len(para_tokens)):
      start_chartok_pos = tok_start_to_chartok_index[i]
      end_chartok_pos = tok_end_to_chartok_index[i]
      start_orig_pos = _convert_index(
          chartok_to_orig_index, start_chartok_pos, n, is_start=True)
      end_orig_pos = _convert_index(
          chartok_to_orig_index, end_chartok_pos, n, is_start=False)

      tok_start_to_orig_index.append(start_orig_pos)
      tok_end_to_orig_index.append(end_orig_pos)

    if not is_training:
      tok_start_position = tok_end_position = None

    if is_training and example.is_impossible:
      tok_start_position = 0
      tok_end_position = 0

    if is_training and not example.is_impossible:
      start_position = example.start_position
      end_position = start_position + len(example.orig_answer_text) - 1

      start_chartok_pos = _convert_index(
          orig_to_chartok_index, start_position, is_start=True)
      tok_start_position = chartok_to_tok_index[start_chartok_pos]

      end_chartok_pos = _convert_index(
          orig_to_chartok_index, end_position, is_start=False)
      tok_end_position = chartok_to_tok_index[end_chartok_pos]
      assert tok_start_position <= tok_end_position

    def _piece_to_id(x):
      return tokenizer.sp_model.PieceToId(x)

    all_doc_tokens = list(map(_piece_to_id, para_tokens))

    # The -3 accounts for [CLS], [SEP] and [SEP].
    max_tokens_for_doc = max_seq_length - len(query_tokens) - 3

    # We can have documents that are longer than the maximum sequence length.
    # To deal with this we do a sliding window approach, where we take chunks
    # of up to our max length with a stride of `doc_stride`.
    _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
        "DocSpan", ["start", "length"])
    doc_spans = []
    start_offset = 0
    while start_offset < len(all_doc_tokens):
      length = len(all_doc_tokens) - start_offset
      if length > max_tokens_for_doc:
        length = max_tokens_for_doc
      doc_spans.append(_DocSpan(start=start_offset, length=length))
      if start_offset + length == len(all_doc_tokens):
        break
      start_offset += min(length, doc_stride)

    for (doc_span_index, doc_span) in enumerate(doc_spans):
      tokens = []
      token_is_max_context = {}
      segment_ids = []

      cur_tok_start_to_orig_index = []
      cur_tok_end_to_orig_index = []

      tokens.append(tokenizer.sp_model.PieceToId("[CLS]"))
      segment_ids.append(0)
      for token in query_tokens:
        tokens.append(token)
        segment_ids.append(0)
      tokens.append(tokenizer.sp_model.PieceToId("[SEP]"))
      segment_ids.append(0)

      for i in range(doc_span.length):
        split_token_index = doc_span.start + i

        cur_tok_start_to_orig_index.append(
            tok_start_to_orig_index[split_token_index])
        cur_tok_end_to_orig_index.append(
            tok_end_to_orig_index[split_token_index])

        is_max_context = _check_is_max_context(doc_spans, doc_span_index,
                                               split_token_index)
        token_is_max_context[len(tokens)] = is_max_context
        tokens.append(all_doc_tokens[split_token_index])
        segment_ids.append(1)
      tokens.append(tokenizer.sp_model.PieceToId("[SEP]"))
      segment_ids.append(1)

      paragraph_len = len(tokens)
      input_ids = tokens

      # The mask has 1 for real tokens and 0 for padding tokens. Only real
      # tokens are attended to.
      input_mask = [1] * len(input_ids)

      # Zero-pad up to the sequence length.
      while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)

      assert len(input_ids) == max_seq_length
      assert len(input_mask) == max_seq_length
      assert len(segment_ids) == max_seq_length

      span_is_impossible = example.is_impossible
      start_position = None
      end_position = None
      if is_training and not span_is_impossible:
        doc_start = doc_span.start
        doc_end = doc_span.start + doc_span.length - 1
        out_of_span = False
        if not (tok_start_position >= doc_start and
                tok_end_position <= doc_end):
          out_of_span = True
        if out_of_span:
          # If this document chunk does not contain the annotation, keep the
          # span but mark it as impossible instead of skipping it.
          start_position = 0
          end_position = 0
          span_is_impossible = True
        else:
          doc_offset = len(query_tokens) + 2
          start_position = tok_start_position - doc_start + doc_offset
          end_position = tok_end_position - doc_start + doc_offset

      if is_training and span_is_impossible:
        start_position = 0
        end_position = 0

      if example_index < 20:
        logging.info("*** Example ***")
        logging.info("unique_id: %s", unique_id)
        logging.info("example_index: %s", example_index)
        logging.info("doc_span_index: %s", doc_span_index)
        logging.info("tok_start_to_orig_index: %s",
                     " ".join([str(x) for x in cur_tok_start_to_orig_index]))
        logging.info("tok_end_to_orig_index: %s",
                     " ".join([str(x) for x in cur_tok_end_to_orig_index]))
        logging.info(
            "token_is_max_context: %s", " ".join(
                ["%d:%s" % (x, y) for (x, y) in token_is_max_context.items()]))
        logging.info(
            "input_pieces: %s",
            " ".join([tokenizer.sp_model.IdToPiece(x) for x in tokens]))
        logging.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
        logging.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
        logging.info("segment_ids: %s",
                     " ".join([str(x) for x in segment_ids]))

        if is_training and span_is_impossible:
          logging.info("impossible example span")

        if is_training and not span_is_impossible:
          pieces = [
              tokenizer.sp_model.IdToPiece(token)
              for token in tokens[start_position:(end_position + 1)]
          ]
          answer_text = tokenizer.sp_model.DecodePieces(pieces)
          logging.info("start_position: %d", start_position)
          logging.info("end_position: %d", end_position)
          logging.info("answer: %s", tokenization.printable_text(answer_text))

      # With multiprocessing, `example_index` is actually the index within
      # the current process, so we use example_index=None to avoid it being
      # used downstream. The current code does not use the example_index of
      # training data.
      if is_training:
        feat_example_index = None
      else:
        feat_example_index = example_index

      feature = InputFeatures(
          unique_id=unique_id,
          example_index=feat_example_index,
          doc_span_index=doc_span_index,
          tok_start_to_orig_index=cur_tok_start_to_orig_index,
          tok_end_to_orig_index=cur_tok_end_to_orig_index,
          token_is_max_context=token_is_max_context,
          tokens=[tokenizer.sp_model.IdToPiece(x) for x in tokens],
          input_ids=input_ids,
          input_mask=input_mask,
          segment_ids=segment_ids,
          paragraph_len=paragraph_len,
          start_position=start_position,
          end_position=end_position,
          is_impossible=span_is_impossible)

      # Run callback.
      if is_training:
        output_fn(feature)
      else:
        output_fn(feature, is_padding=False)

      unique_id += 1
      if span_is_impossible:
        cnt_neg += 1
      else:
        cnt_pos += 1

  if not is_training and feature:
    assert batch_size
    num_padding = 0
    num_examples = unique_id - base_id
    if num_examples % batch_size != 0:
      num_padding = batch_size - (num_examples % batch_size)
    dummy_feature = copy.deepcopy(feature)
    for _ in range(num_padding):
      dummy_feature.unique_id = unique_id

      # Run callback.
      output_fn(dummy_feature, is_padding=True)
      unique_id += 1

  logging.info("Total number of instances: %d = pos %d neg %d",
               cnt_pos + cnt_neg, cnt_pos, cnt_neg)
  return unique_id - base_id

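# `_convert_index` is referenced above but not defined in this excerpt. The
# sketch below is a simplified reconstruction consistent with how it is
# called: map a position through a partial character-level alignment and
# bridge unaligned gaps via the nearest aligned neighbor. The production
# helper handles more corner cases, so treat the exact fallback rules here
# as assumptions.
def _convert_index(index, pos, m=None, is_start=True):
  """Maps `pos` through a partial alignment `index`, bridging None gaps."""
  if index[pos] is not None:
    return index[pos]
  # Scan outward for the nearest aligned neighbors on either side.
  front = pos
  while front > 0 and index[front] is None:
    front -= 1
  rear = pos
  while rear < len(index) - 1 and index[rear] is None:
    rear += 1
  if index[front] is not None:
    # Snap just past the preceding aligned position for a start index.
    return index[front] + 1 if is_start else index[front]
  if index[rear] is not None:
    return index[rear] if is_start else max(index[rear] - 1, 0)
  # Nothing aligned at all: clamp to the sequence bounds.
  return 0 if is_start else (m - 1 if m is not None else len(index) - 1)
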
def convert_examples_to_features(examples, tokenizer, max_seq_length,
                                 doc_stride, max_query_length, is_training,
                                 output_fn, batch_size=None):
  """Converts `examples` to `InputFeatures` and streams them to `output_fn`."""
  base_id = 1000000000
  unique_id = base_id
  feature = None
  for (example_index, example) in enumerate(examples):
    query_tokens = tokenizer.tokenize(example.question_text)

    if len(query_tokens) > max_query_length:
      query_tokens = query_tokens[0:max_query_length]

    tok_to_orig_index = []
    orig_to_tok_index = []
    all_doc_tokens = []
    for (i, token) in enumerate(example.doc_tokens):
      orig_to_tok_index.append(len(all_doc_tokens))
      sub_tokens = tokenizer.tokenize(token)
      for sub_token in sub_tokens:
        tok_to_orig_index.append(i)
        all_doc_tokens.append(sub_token)

    tok_start_position = None
    tok_end_position = None
    if is_training and example.is_impossible:
      tok_start_position = -1
      tok_end_position = -1
    if is_training and not example.is_impossible:
      tok_start_position = orig_to_tok_index[example.start_position]
      if example.end_position < len(example.doc_tokens) - 1:
        tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
      else:
        tok_end_position = len(all_doc_tokens) - 1
      (tok_start_position, tok_end_position) = _improve_answer_span(
          all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
          example.orig_answer_text)

    # The -3 accounts for [CLS], [SEP] and [SEP].
    max_tokens_for_doc = max_seq_length - len(query_tokens) - 3

    # We can have documents that are longer than the maximum sequence length.
    # To deal with this we do a sliding window approach, where we take chunks
    # of up to our max length with a stride of `doc_stride`.
    _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
        "DocSpan", ["start", "length"])
    doc_spans = []
    start_offset = 0
    while start_offset < len(all_doc_tokens):
      length = len(all_doc_tokens) - start_offset
      if length > max_tokens_for_doc:
        length = max_tokens_for_doc
      doc_spans.append(_DocSpan(start=start_offset, length=length))
      if start_offset + length == len(all_doc_tokens):
        break
      start_offset += min(length, doc_stride)

    for (doc_span_index, doc_span) in enumerate(doc_spans):
      tokens = []
      token_to_orig_map = {}
      token_is_max_context = {}
      segment_ids = []
      tokens.append("[CLS]")
      segment_ids.append(0)
      for token in query_tokens:
        tokens.append(token)
        segment_ids.append(0)
      tokens.append("[SEP]")
      segment_ids.append(0)

      for i in range(doc_span.length):
        split_token_index = doc_span.start + i
        token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]

        is_max_context = _check_is_max_context(doc_spans, doc_span_index,
                                               split_token_index)
        token_is_max_context[len(tokens)] = is_max_context
        tokens.append(all_doc_tokens[split_token_index])
        segment_ids.append(1)
      tokens.append("[SEP]")
      segment_ids.append(1)

      input_ids = tokenizer.convert_tokens_to_ids(tokens)

      # The mask has 1 for real tokens and 0 for padding tokens. Only real
      # tokens are attended to.
      input_mask = [1] * len(input_ids)

      # Zero-pad up to the sequence length.
      while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)

      assert len(input_ids) == max_seq_length
      assert len(input_mask) == max_seq_length
      assert len(segment_ids) == max_seq_length

      start_position = None
      end_position = None
      if is_training and not example.is_impossible:
        # For training, if our document chunk does not contain an annotation,
        # there is nothing to predict, so the span is zeroed out below.
        doc_start = doc_span.start
        doc_end = doc_span.start + doc_span.length - 1
        out_of_span = False
        if not (tok_start_position >= doc_start and
                tok_end_position <= doc_end):
          out_of_span = True
        if out_of_span:
          start_position = 0
          end_position = 0
        else:
          doc_offset = len(query_tokens) + 2
          start_position = tok_start_position - doc_start + doc_offset
          end_position = tok_end_position - doc_start + doc_offset

      if is_training and example.is_impossible:
        start_position = 0
        end_position = 0

      if example_index < 20:
        logging.info("*** Example ***")
        logging.info("unique_id: %s", unique_id)
        logging.info("example_index: %s", example_index)
        logging.info("doc_span_index: %s", doc_span_index)
        logging.info(
            "tokens: %s",
            " ".join([tokenization.printable_text(x) for x in tokens]))
        logging.info(
            "token_to_orig_map: %s", " ".join(
                ["%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()]))
        logging.info(
            "token_is_max_context: %s", " ".join(
                ["%d:%s" % (x, y) for (x, y) in token_is_max_context.items()]))
        logging.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
        logging.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
        logging.info("segment_ids: %s",
                     " ".join([str(x) for x in segment_ids]))
        if is_training and example.is_impossible:
          logging.info("impossible example")
        if is_training and not example.is_impossible:
          answer_text = " ".join(tokens[start_position:(end_position + 1)])
          logging.info("start_position: %d", start_position)
          logging.info("end_position: %d", end_position)
          logging.info("answer: %s", tokenization.printable_text(answer_text))

      feature = InputFeatures(
          unique_id=unique_id,
          example_index=example_index,
          doc_span_index=doc_span_index,
          tokens=tokens,
          token_to_orig_map=token_to_orig_map,
          token_is_max_context=token_is_max_context,
          input_ids=input_ids,
          input_mask=input_mask,
          segment_ids=segment_ids,
          start_position=start_position,
          end_position=end_position,
          is_impossible=example.is_impossible)

      # Run callback.
      if is_training:
        output_fn(feature)
      else:
        output_fn(feature, is_padding=False)

      unique_id += 1

  if not is_training and feature:
    assert batch_size
    num_padding = 0
    num_examples = unique_id - base_id
    if num_examples % batch_size != 0:
      num_padding = batch_size - (num_examples % batch_size)
    logging.info("Adding padding examples to make sure no partial batch.")
    logging.info("Adds %d padding examples for inference.", num_padding)
    dummy_feature = copy.deepcopy(feature)
    for _ in range(num_padding):
      dummy_feature.unique_id = unique_id

      # Run callback.
      output_fn(dummy_feature, is_padding=True)
      unique_id += 1
  return unique_id - base_id

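# The two helpers referenced by both `convert_examples_to_features` variants
# above are not defined in this excerpt. The sketches below follow the
# original BERT reference implementations of these heuristics.
def _check_is_max_context(doc_spans, cur_span_index, position):
  """Checks whether `cur_span_index` gives `position` its maximum context.

  With a sliding window, a token can appear in several spans; the span where
  it has the most surrounding context (with a slight preference for longer
  spans) is the one whose prediction for that token is kept.
  """
  best_score = None
  best_span_index = None
  for (span_index, doc_span) in enumerate(doc_spans):
    end = doc_span.start + doc_span.length - 1
    if position < doc_span.start:
      continue
    if position > end:
      continue
    num_left_context = position - doc_span.start
    num_right_context = end - position
    score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
    if best_score is None or score > best_score:
      best_score = score
      best_span_index = span_index
  return cur_span_index == best_span_index


def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
                         orig_answer_text):
  """Returns a tokenized answer span that better matches the annotation."""
  # The projected span sometimes includes surrounding sub-tokens; re-tokenizing
  # the annotated answer text lets us tighten the span when an exact
  # sub-token match exists inside the projected span.
  tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
  for new_start in range(input_start, input_end + 1):
    for new_end in range(input_end, new_start - 1, -1):
      text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
      if text_span == tok_answer_text:
        return (new_start, new_end)
  return (input_start, input_end)
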
def write_instance_to_example_files(instances, tokenizer, max_seq_length,
                                    max_predictions_per_seq, output_files):
  """Creates TF example files from `TrainingInstance`s."""
  writers = []
  for output_file in output_files:
    writers.append(
        tf.io.TFRecordWriter(
            output_file, options="GZIP" if FLAGS.gzip_compress else ""))

  writer_index = 0
  total_written = 0
  for (inst_index, instance) in enumerate(instances):
    input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
    input_mask = [1] * len(input_ids)
    segment_ids = list(instance.segment_ids)
    assert len(input_ids) <= max_seq_length

    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
      input_ids.append(0)
      input_mask.append(0)
      segment_ids.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    masked_lm_positions = list(instance.masked_lm_positions)
    masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels)
    masked_lm_weights = [1.0] * len(masked_lm_ids)

    while len(masked_lm_positions) < max_predictions_per_seq:
      masked_lm_positions.append(0)
      masked_lm_ids.append(0)
      masked_lm_weights.append(0.0)

    next_sentence_label = 1 if instance.is_random_next else 0

    features = collections.OrderedDict()
    features["input_ids"] = create_int_feature(input_ids)
    features["input_mask"] = create_int_feature(input_mask)
    features["segment_ids"] = create_int_feature(segment_ids)
    features["masked_lm_positions"] = create_int_feature(masked_lm_positions)
    features["masked_lm_ids"] = create_int_feature(masked_lm_ids)
    features["masked_lm_weights"] = create_float_feature(masked_lm_weights)
    features["next_sentence_labels"] = create_int_feature(
        [next_sentence_label])

    tf_example = tf.train.Example(
        features=tf.train.Features(feature=features))

    writers[writer_index].write(tf_example.SerializeToString())
    writer_index = (writer_index + 1) % len(writers)

    total_written += 1

    if inst_index < 20:
      logging.info("*** Example ***")
      logging.info(
          "tokens: %s",
          " ".join([tokenization.printable_text(x) for x in instance.tokens]))

      for feature_name in features.keys():
        feature = features[feature_name]
        values = []
        if feature.int64_list.value:
          values = feature.int64_list.value
        elif feature.float_list.value:
          values = feature.float_list.value
        logging.info("%s: %s", feature_name,
                     " ".join([str(x) for x in values]))

  for writer in writers:
    writer.close()

  logging.info("Wrote %d total instances", total_written)

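# `create_int_feature` and `create_float_feature` are used above but not
# defined in this excerpt. Minimal definitions consistent with that usage
# (they match the conventional BERT data-prep helpers):
def create_int_feature(values):
  """Wraps a list of ints in a `tf.train.Feature`."""
  return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))


def create_float_feature(values):
  """Wraps a list of floats in a `tf.train.Feature`."""
  return tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
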
def convert_single_example(ex_index, example, label_list, max_seq_length,
                           tokenizer):
  """Converts a single `InputExample` into a single `InputFeatures`."""
  label_map = {}
  for (i, label) in enumerate(label_list):
    label_map[label] = i

  tokens_a = tokenizer.tokenize(example.text_a)
  tokens_b = None
  if example.text_b:
    tokens_b = tokenizer.tokenize(example.text_b)

  if tokens_b:
    # Modifies `tokens_a` and `tokens_b` in place so that the total
    # length is less than the specified length.
    # Account for [CLS], [SEP], [SEP] with "- 3".
    _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
  else:
    # Account for [CLS] and [SEP] with "- 2".
    if len(tokens_a) > max_seq_length - 2:
      tokens_a = tokens_a[0:(max_seq_length - 2)]

  # The convention in BERT is:
  # (a) For sequence pairs:
  #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
  #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
  # (b) For single sequences:
  #  tokens:   [CLS] the dog is hairy . [SEP]
  #  type_ids: 0     0   0   0  0     0 0
  #
  # Where "type_ids" are used to indicate whether this is the first
  # sequence or the second sequence. The embedding vectors for `type=0` and
  # `type=1` were learned during pre-training and are added to the wordpiece
  # embedding vector (and position vector). This is not *strictly* necessary
  # since the [SEP] token unambiguously separates the sequences, but it makes
  # it easier for the model to learn the concept of sequences.
  #
  # For classification tasks, the first vector (corresponding to [CLS]) is
  # used as the "sentence vector". Note that this only makes sense because
  # the entire model is fine-tuned.
  tokens = []
  segment_ids = []
  tokens.append("[CLS]")
  segment_ids.append(0)
  for token in tokens_a:
    tokens.append(token)
    segment_ids.append(0)
  tokens.append("[SEP]")
  segment_ids.append(0)

  if tokens_b:
    for token in tokens_b:
      tokens.append(token)
      segment_ids.append(1)
    tokens.append("[SEP]")
    segment_ids.append(1)

  input_ids = tokenizer.convert_tokens_to_ids(tokens)

  # The mask has 1 for real tokens and 0 for padding tokens. Only real
  # tokens are attended to.
  input_mask = [1] * len(input_ids)

  # Zero-pad up to the sequence length.
  while len(input_ids) < max_seq_length:
    input_ids.append(0)
    input_mask.append(0)
    segment_ids.append(0)

  assert len(input_ids) == max_seq_length
  assert len(input_mask) == max_seq_length
  assert len(segment_ids) == max_seq_length

  label_id = label_map[example.label]
  if ex_index < 5:
    logging.info("*** Example ***")
    logging.info("guid: %s", example.guid)
    logging.info(
        "tokens: %s",
        " ".join([tokenization.printable_text(x) for x in tokens]))
    logging.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
    logging.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
    logging.info("segment_ids: %s", " ".join([str(x) for x in segment_ids]))
    logging.info("label: %s (id = %d)", example.label, label_id)

  feature = InputFeatures(
      input_ids=input_ids,
      input_mask=input_mask,
      segment_ids=segment_ids,
      label_id=label_id,
      is_real_example=True)
  return feature

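# `_truncate_seq_pair` is referenced above but not defined in this excerpt.
# The sketch below follows the original BERT implementation: trim the longer
# sequence one token at a time so both sides keep as much content as possible.
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
  """Truncates a sequence pair in place to at most `max_length` tokens."""
  # This is a simple heuristic which always truncates the longer sequence
  # one token at a time. This makes more sense than truncating an equal
  # percent of tokens from each, since if one sequence is very short then
  # each truncated token likely carries more information than one from a
  # longer sequence.
  while True:
    total_length = len(tokens_a) + len(tokens_b)
    if total_length <= max_length:
      break
    if len(tokens_a) > len(tokens_b):
      tokens_a.pop()
    else:
      tokens_b.pop()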