def __str__(self): s = "" s += "tokens: %s\n" % (" ".join( [tokenization.printable_text(x) for x in self.tokens])) s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids])) s += "token_boundary: %s\n" % (" ".join( [str(x) for x in self.token_boundary])) s += "is_random_next: %s\n" % self.is_random_next s += "masked_lm_positions: %s\n" % (" ".join( [str(x) for x in self.masked_lm_positions])) s += "masked_lm_labels: %s\n" % (" ".join( [tokenization.printable_text(x) for x in self.masked_lm_labels])) s += "\n" return s
def example_to_token_ids_segment_ids_label_ids( ex_index, example, max_seq_length, tokenizer): """Converts an ``InputExample`` to token ids and segment ids.""" if ex_index < 5: tf.logging.info("*** Example {ex_index} ***") tf.logging.info("qid: %s" % (example.qid)) question_tokens = tokenizer.tokenize(example.question) answers_tokens = map(tokenizer.tokenize, example.answers) token_ids = [] segment_ids = [] for choice_idx, answer_tokens in enumerate(answers_tokens): truncated_question_tokens = question_tokens[ :max((max_seq_length - 3)//2, max_seq_length - (len(answer_tokens) + 3))] truncated_answer_tokens = answer_tokens[ :max((max_seq_length - 3)//2, max_seq_length - (len(question_tokens) + 3))] choice_tokens = [] choice_segment_ids = [] choice_tokens.append("[CLS]") choice_segment_ids.append(0) for question_token in truncated_question_tokens: choice_tokens.append(question_token) choice_segment_ids.append(0) choice_tokens.append("[SEP]") choice_segment_ids.append(0) for answer_token in truncated_answer_tokens: choice_tokens.append(answer_token) choice_segment_ids.append(1) choice_tokens.append("[SEP]") choice_segment_ids.append(1) choice_token_ids = tokenizer.convert_tokens_to_ids(choice_tokens) token_ids.append(choice_token_ids) segment_ids.append(choice_segment_ids) if ex_index < 5: tf.logging.info("choice %s" % choice_idx) tf.logging.info("tokens: %s" % " ".join( [tokenization.printable_text(t) for t in choice_tokens])) tf.logging.info("token ids: %s" % " ".join( [str(x) for x in choice_token_ids])) tf.logging.info("segment ids: %s" % " ".join( [str(x) for x in choice_segment_ids])) label_ids = [example.label] if ex_index < 5: tf.logging.info("label: %s (id = %d)" % (example.label, label_ids[0])) return token_ids, segment_ids, label_ids
def write_instance_to_example_files(instances, tokenizer, max_seq_length, max_predictions_per_seq, output_files): """Create TF example files from `TrainingInstance`s.""" writers = [] for output_file in output_files: writers.append(tf.python_io.TFRecordWriter(output_file)) writer_index = 0 total_written = 0 for (inst_index, instance) in enumerate(instances): input_ids = tokenizer.convert_tokens_to_ids(instance.tokens) input_mask = [1] * len(input_ids) segment_ids = list(instance.segment_ids) token_boundary = list(instance.token_boundary) assert len(input_ids) <= max_seq_length while len(input_ids) < max_seq_length: input_ids.append(0) input_mask.append(0) segment_ids.append(0) token_boundary.append(0) assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length assert len(segment_ids) == max_seq_length masked_lm_positions = list(instance.masked_lm_positions) masked_lm_ids = tokenizer.convert_tokens_to_ids( instance.masked_lm_labels) masked_lm_weights = [1.0] * len(masked_lm_ids) multiplier = 1 + int(FLAGS.do_permutation) while len(masked_lm_positions) < max_predictions_per_seq * multiplier: masked_lm_positions.append(0) masked_lm_ids.append(0) masked_lm_weights.append(0.0) sentence_order_label = 1 if instance.is_random_next else 0 features = collections.OrderedDict() features["input_ids"] = create_int_feature(input_ids) features["input_mask"] = create_int_feature(input_mask) features["segment_ids"] = create_int_feature(segment_ids) features["token_boundary"] = create_int_feature(token_boundary) features["masked_lm_positions"] = create_int_feature( masked_lm_positions) features["masked_lm_ids"] = create_int_feature(masked_lm_ids) features["masked_lm_weights"] = create_float_feature(masked_lm_weights) # Note: We keep this feature name `next_sentence_labels` to be compatible # with the original data created by lanzhzh@. However, in the ALBERT case # it does contain sentence_order_label. features["next_sentence_labels"] = create_int_feature( [sentence_order_label]) tf_example = tf.train.Example(features=tf.train.Features( feature=features)) writers[writer_index].write(tf_example.SerializeToString()) writer_index = (writer_index + 1) % len(writers) total_written += 1 if inst_index < 20: tf.logging.info("*** Example ***") tf.logging.info("tokens: %s" % " ".join( [tokenization.printable_text(x) for x in instance.tokens])) for feature_name in features.keys(): feature = features[feature_name] values = [] if feature.int64_list.value: values = feature.int64_list.value elif feature.float_list.value: values = feature.float_list.value tf.logging.info( "%s: %s" % (feature_name, " ".join([str(x) for x in values]))) for writer in writers: writer.close() tf.logging.info("Wrote %d total instances", total_written)
def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer, task_name): """Converts a single `InputExample` into a single `InputFeatures`.""" if isinstance(example, PaddingInputExample): return InputFeatures(input_ids=[0] * max_seq_length, input_mask=[0] * max_seq_length, segment_ids=[0] * max_seq_length, label_id=0, is_real_example=False) if task_name != "sts-b": label_map = {} for (i, label) in enumerate(label_list): label_map[label] = i tokens_a = tokenizer.tokenize(example.text_a) tokens_b = None if example.text_b: tokens_b = tokenizer.tokenize(example.text_b) if tokens_b: # Modifies `tokens_a` and `tokens_b` in place so that the total # length is less than the specified length. # Account for [CLS], [SEP], [SEP] with "- 3" _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) else: # Account for [CLS] and [SEP] with "- 2" if len(tokens_a) > max_seq_length - 2: tokens_a = tokens_a[0:(max_seq_length - 2)] # The convention in ALBERT is: # (a) For sequence pairs: # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 # (b) For single sequences: # tokens: [CLS] the dog is hairy . [SEP] # type_ids: 0 0 0 0 0 0 0 # # Where "type_ids" are used to indicate whether this is the first # sequence or the second sequence. The embedding vectors for `type=0` and # `type=1` were learned during pre-training and are added to the # embedding vector (and position vector). This is not *strictly* necessary # since the [SEP] token unambiguously separates the sequences, but it makes # it easier for the model to learn the concept of sequences. # # For classification tasks, the first vector (corresponding to [CLS]) is # used as the "sentence vector". Note that this only makes sense because # the entire model is fine-tuned. tokens = [] segment_ids = [] tokens.append("[CLS]") segment_ids.append(0) for token in tokens_a: tokens.append(token) segment_ids.append(0) tokens.append("[SEP]") segment_ids.append(0) if tokens_b: for token in tokens_b: tokens.append(token) segment_ids.append(1) tokens.append("[SEP]") segment_ids.append(1) input_ids = tokenizer.convert_tokens_to_ids(tokens) # The mask has 1 for real tokens and 0 for padding tokens. Only real # tokens are attended to. input_mask = [1] * len(input_ids) # Zero-pad up to the sequence length. while len(input_ids) < max_seq_length: input_ids.append(0) input_mask.append(0) segment_ids.append(0) assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length assert len(segment_ids) == max_seq_length if task_name != "sts-b": label_id = label_map[example.label] else: label_id = example.label if ex_index < 5: tf.logging.info("*** Example ***") tf.logging.info("guid: %s" % (example.guid)) tf.logging.info( "tokens: %s" % " ".join([tokenization.printable_text(x) for x in tokens])) tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) tf.logging.info("label: %s (id = %d)" % (example.label, label_id)) feature = InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_id=label_id, is_real_example=True) return feature
def convert_examples_to_features(self, seq_length, tokenizer): """Loads a data file into a list of `InputBatch`s.""" # features = [] input_masks = [] examples = self._to_example(self.input_queue.get()) for (ex_index, example) in enumerate(examples): tokens_a = tokenizer.tokenize(example.text_a) # if the sentences's length is more than seq_length, only use sentence's left part if len(tokens_a) > seq_length - 2: tokens_a = tokens_a[0:(seq_length - 2)] # The convention in BERT is: # (a) For sequence pairs: # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 # (b) For single sequences: # tokens: [CLS] the dog is hairy . [SEP] # type_ids: 0 0 0 0 0 0 0 # # Where "type_ids" are used to indicate whether this is the first # sequence or the second sequence. The embedding vectors for `type=0` and # `type=1` were learned during pre-training and are added to the wordpiece # embedding vector (and position vector). This is not *strictly* necessary # since the [SEP] token unambiguously separates the sequences, but it makes # it easier for the model to learn the concept of sequences. # # For classification tasks, the first vector (corresponding to [CLS]) is # used as as the "sentence vector". Note that this only makes sense because # the entire model is fine-tuned. tokens = [] segment_ids = [] tokens.append("[CLS]") segment_ids.append(0) for token in tokens_a: tokens.append(token) segment_ids.append(0) tokens.append("[SEP]") segment_ids.append(0) # Where "input_ids" are tokens's index in vocabulary input_ids = tokenizer.convert_tokens_to_ids(tokens) # The mask has 1 for real tokens and 0 for padding tokens. Only real # tokens are attended to. input_mask = [1] * len(input_ids) input_masks.append(input_mask) # Zero-pad up to the sequence length. while len(input_ids) < seq_length: input_ids.append(0) input_mask.append(0) segment_ids.append(0) assert len(input_ids) == seq_length assert len(input_mask) == seq_length assert len(segment_ids) == seq_length if ex_index < 5: tf.logging.info("*** Example ***") tf.logging.info("guid: %s" % (example.guid)) tf.logging.info( "tokens: %s" % " ".join([tokenization.printable_text(x) for x in tokens])) tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) yield InputFeatures(guid=example.guid, input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids)