def __str__(self):
    s = ""
    s += "tokens: %s\n" % (" ".join(
        [tokenization.printable_text(x) for x in self.tokens]))
    s += "segment_ids: %s\n" % (" ".join(
        [str(x) for x in self.segment_ids]))
    s += "is_random_next: %s\n" % self.is_random_next
    s += "masked_lm_positions: %s\n" % (" ".join(
        [str(x) for x in self.masked_lm_positions]))
    s += "masked_lm_labels: %s\n" % (" ".join(
        [tokenization.printable_text(x) for x in self.masked_lm_labels]))
    s += "\n"
    return s
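# The `__str__` above belongs to a training-instance container that is not
# defined in this section. A minimal sketch of that container, assuming only
# the attribute names the method reads (it mirrors the reference BERT
# pre-training data structures; the exact class is an assumption, and the
# pseudo-masked writer further below expects additional attributes such as
# `output_tokens_positions` and `pseudo_masked_lm_positions`):
class TrainingInstance(object):
    """A single training instance (sentence pair) for masked LM / NSP."""

    def __init__(self, tokens, segment_ids, masked_lm_positions,
                 masked_lm_labels, is_random_next):
        self.tokens = tokens
        self.segment_ids = segment_ids
        self.is_random_next = is_random_next
        self.masked_lm_positions = masked_lm_positions
        self.masked_lm_labels = masked_lm_labels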
def write_instance_to_example_files(instances, tokenizer, max_seq_length,
                                    max_predictions_per_seq, output_files):
    """Create TF example files from `TrainingInstance`s."""
    writers = []
    for output_file in output_files:
        writers.append(tf.io.TFRecordWriter(output_file))
    writer_index = 0

    total_written = 0
    for (inst_index, instance) in enumerate(instances):
        input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
        input_mask = [1] * len(input_ids)
        segment_ids = list(instance.segment_ids)
        assert len(input_ids) <= max_seq_length

        # Zero-pad up to the sequence length.
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        masked_lm_positions = list(instance.masked_lm_positions)
        masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels)
        masked_lm_weights = [1.0] * len(masked_lm_ids)

        # Pad the masked LM features up to max_predictions_per_seq.
        while len(masked_lm_positions) < max_predictions_per_seq:
            masked_lm_positions.append(0)
            masked_lm_ids.append(0)
            masked_lm_weights.append(0.0)

        next_sentence_label = 1 if instance.is_random_next else 0

        features = collections.OrderedDict()
        features["input_ids"] = create_int_feature(input_ids)
        features["input_mask"] = create_int_feature(input_mask)
        features["segment_ids"] = create_int_feature(segment_ids)
        features["masked_lm_positions"] = create_int_feature(masked_lm_positions)
        features["masked_lm_ids"] = create_int_feature(masked_lm_ids)
        features["masked_lm_weights"] = create_float_feature(masked_lm_weights)
        features["next_sentence_labels"] = create_int_feature([next_sentence_label])

        tf_example = tf.train.Example(features=tf.train.Features(feature=features))

        # Round-robin the serialized examples across the output files.
        writers[writer_index].write(tf_example.SerializeToString())
        writer_index = (writer_index + 1) % len(writers)
        total_written += 1

        if inst_index < 20:
            tf.get_logger().info("*** Example ***")
            tf.get_logger().info("tokens: %s" % " ".join(
                [tokenization.printable_text(x) for x in instance.tokens]))
            for feature_name in features.keys():
                feature = features[feature_name]
                values = []
                if feature.int64_list.value:
                    values = feature.int64_list.value
                elif feature.float_list.value:
                    values = feature.float_list.value
                tf.get_logger().info(
                    "%s: %s" % (feature_name, " ".join([str(x) for x in values])))

    for writer in writers:
        writer.close()

    tf.get_logger().info("Wrote %d total instances", total_written)
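# `create_int_feature` and `create_float_feature` are used above but not shown
# in this section. A minimal sketch of what they are assumed to do, using the
# standard tf.train.Feature wrappers for TFRecord writing (tensorflow is
# assumed to be imported as `tf` at module level, as the surrounding code does):
def create_int_feature(values):
    """Wraps a list of ints in an int64_list tf.train.Feature."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))


def create_float_feature(values):
    """Wraps a list of floats in a float_list tf.train.Feature."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))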
def convert_examples_to_features(examples, seq_length, tokenizer):
    """Loads a data file into a list of `InputBatch`s."""
    features = []
    for (ex_index, example) in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.text_a)

        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)

        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > seq_length - 2:
                tokens_a = tokens_a[0:(seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0     0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0`
        # and `type=1` were learned during pre-training and are added to the
        # wordpiece embedding vector (and position vector). This is not
        # *strictly* necessary since the [SEP] token unambiguously separates
        # the sequences, but it makes it easier for the model to learn the
        # concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS])
        # is used as the "sentence vector". Note that this only makes sense
        # because the entire model is fine-tuned.
        tokens = []
        input_type_ids = []
        tokens.append("[CLS]")
        input_type_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            input_type_ids.append(0)
        tokens.append("[SEP]")
        input_type_ids.append(0)

        if tokens_b:
            for token in tokens_b:
                tokens.append(token)
                input_type_ids.append(1)
            tokens.append("[SEP]")
            input_type_ids.append(1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < seq_length:
            input_ids.append(0)
            input_mask.append(0)
            input_type_ids.append(0)

        assert len(input_ids) == seq_length
        assert len(input_mask) == seq_length
        assert len(input_type_ids) == seq_length

        if ex_index < 5:
            absl.logging.info("*** Example ***")
            absl.logging.info("unique_id: %s" % (example.unique_id))
            absl.logging.info("tokens: %s" % " ".join(
                [tokenization.printable_text(x) for x in tokens]))
            absl.logging.info("input_ids: %s" % " ".join(
                [str(x) for x in input_ids]))
            absl.logging.info("input_mask: %s" % " ".join(
                [str(x) for x in input_mask]))
            absl.logging.info("input_type_ids: %s" % " ".join(
                [str(x) for x in input_type_ids]))

        features.append(
            InputFeatures(unique_id=example.unique_id,
                          tokens=tokens,
                          input_ids=input_ids,
                          input_mask=input_mask,
                          input_type_ids=input_type_ids))
    return features
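# `InputFeatures` is constructed above but not defined in this section. A
# minimal sketch of the container, assuming exactly the keyword arguments used
# by `convert_examples_to_features` (note that `convert_single_example` further
# below constructs a different `InputFeatures` variant with classification
# fields; that one is not shown here):
class InputFeatures(object):
    """Holds the padded feature arrays for a single example."""

    def __init__(self, unique_id, tokens, input_ids, input_mask,
                 input_type_ids):
        self.unique_id = unique_id
        self.tokens = tokens
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.input_type_ids = input_type_ids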
def _convert_single_example(self, ex_index, example):
    label_map = {}
    for (i, label) in enumerate(self.get_labels()):
        label_map[label] = i

    tokens_a = self.tokenizer.tokenize(example.text_a)
    tokens_b = self.tokenizer.tokenize(example.text_b)

    if tokens_a:
        # Account for [CLS] and [SEP] with "- 2"
        tokens_a = tokens_a[0:(self.max_seq_length - 2)]
    if tokens_b:
        # Account for [CLS] and [SEP] with "- 2"
        tokens_b = tokens_b[0:(self.max_seq_length - 2)]

    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0     0   0   0  0     0 0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0`
    # and `type=1` were learned during pre-training and are added to the
    # wordpiece embedding vector (and position vector). This is not
    # *strictly* necessary since the [SEP] token unambiguously separates
    # the sequences, but it makes it easier for the model to learn the
    # concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS])
    # is used as the "sentence vector". Note that this only makes sense
    # because the entire model is fine-tuned.

    # Build the first sequence of the pair.
    tokens = []
    tokens.append("[CLS]")
    for token in tokens_a:
        tokens.append(token)
    tokens.append("[SEP]")
    input_ids_a = self.tokenizer.convert_tokens_to_ids(tokens)
    input_mask_a = [1] * len(input_ids_a)

    # Build the second sequence of the pair.
    tokens = []
    tokens.append("[CLS]")
    if tokens_b:
        for token in tokens_b:
            tokens.append(token)
    tokens.append("[SEP]")
    input_ids_b = self.tokenizer.convert_tokens_to_ids(tokens)
    input_mask_b = [1] * len(input_ids_b)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    # Zero-pad up to the sequence length.
    while len(input_ids_a) < self.max_seq_length:
        input_ids_a.append(0)
        input_mask_a.append(0)
    while len(input_ids_b) < self.max_seq_length:
        input_ids_b.append(0)
        input_mask_b.append(0)

    assert len(input_ids_a) == self.max_seq_length
    assert len(input_ids_b) == self.max_seq_length

    label_id = label_map[example.label]
    if ex_index < 5:
        tf.get_logger().info("*** Example ***")
        tf.get_logger().info("guid: %s" % (example.guid))
        tf.get_logger().info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in tokens]))
        tf.get_logger().info("input_ids_a: %s" % " ".join(
            [str(x) for x in input_ids_a]))
        tf.get_logger().info("input_mask_a: %s" % " ".join(
            [str(x) for x in input_mask_a]))
        tf.get_logger().info("input_ids_b: %s" % " ".join(
            [str(x) for x in input_ids_b]))
        tf.get_logger().info("input_mask_b: %s" % " ".join(
            [str(x) for x in input_mask_b]))
        tf.get_logger().info("label: %s (id = %d)" % (example.label, label_id))

    feature = SiameseFeatures(
        input_ids_a=input_ids_a,
        input_mask_a=input_mask_a,
        input_ids_b=input_ids_b,
        input_mask_b=input_mask_b,
        label_id=label_id)
    return feature
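# `SiameseFeatures` is returned above but not defined in this section. A
# minimal sketch assuming the keyword arguments used by
# `_convert_single_example` (name and layout are taken from the call site,
# not from a documented API):
class SiameseFeatures(object):
    """Padded feature arrays for one text pair in the siamese setup."""

    def __init__(self, input_ids_a, input_mask_a, input_ids_b, input_mask_b,
                 label_id):
        self.input_ids_a = input_ids_a
        self.input_mask_a = input_mask_a
        self.input_ids_b = input_ids_b
        self.input_mask_b = input_mask_b
        self.label_id = label_id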
def convert_single_example(ex_index, example, label_list, max_seq_length,
                           tokenizer):
    """Converts a single `InputExample` into a single `InputFeatures`."""
    if isinstance(example, PaddingInputExample):
        return InputFeatures(
            input_ids=[0] * max_seq_length,
            input_mask=[0] * max_seq_length,
            segment_ids=[0] * max_seq_length,
            label_id=0,
            is_real_example=False)

    label_map = {}
    for (i, label) in enumerate(label_list):
        label_map[label] = i

    tokens_a = tokenizer.tokenize(example.text_a)
    tokens_b = None
    if example.text_b:
        tokens_b = tokenizer.tokenize(example.text_b)

    if tokens_b:
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]

    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0     0   0   0  0     0 0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0`
    # and `type=1` were learned during pre-training and are added to the
    # wordpiece embedding vector (and position vector). This is not
    # *strictly* necessary since the [SEP] token unambiguously separates
    # the sequences, but it makes it easier for the model to learn the
    # concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS])
    # is used as the "sentence vector". Note that this only makes sense
    # because the entire model is fine-tuned.
    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)

    if tokens_b:
        for token in tokens_b:
            tokens.append(token)
            segment_ids.append(1)
        tokens.append("[SEP]")
        segment_ids.append(1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    label_id = label_map[example.label]
    if ex_index < 5:
        logging.info("*** Example ***")
        logging.info("guid: %s" % (example.guid))
        logging.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in tokens]))
        logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        logging.info("label: %s (id = %d)" % (example.label, label_id))

    feature = InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_id=label_id,
        is_real_example=True)
    return feature
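# `_truncate_seq_pair` is called above but not shown in this section. A sketch
# of the truncation strategy it is assumed to implement, following the
# reference BERT code: trim one token at a time from whichever sequence is
# currently longer, so a long passage cannot crowd out a short one.
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a pair of token lists in place to a maximum total length."""
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()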
def write_instance_to_example_files(self, instances, tokenizer, max_seq_length,
                                    max_predictions_per_seq, output_files):
    """Create TF example files from `TrainingInstance`s."""
    writers = []
    for output_file in output_files:
        writers.append(tf.io.TFRecordWriter(output_file))
    writer_index = 0

    total_written = 0
    for (inst_index, instance) in enumerate(instances):
        input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
        segment_ids = list(instance.segment_ids)
        input_mask = [1] * len(instance.tokens)
        # The token sequence holds two extra slots per masked position, so the
        # original (unexpanded) length excludes them.
        origin_input_ids_length = len(input_ids) - 2 * len(
            instance.masked_lm_positions)
        output_tokens_positions = instance.output_tokens_positions
        assert origin_input_ids_length <= max_seq_length

        # Zero-pad up to the expanded sequence length.
        # while origin_input_ids_length < max_seq_length:
        while len(input_ids) < (self.max_seq_length +
                                2 * self.max_predictions_per_seq):
            input_ids.append(0)
            input_mask.append(0)
            output_tokens_positions.append(0)
            segment_ids.append(0)

        origin_input_ids_length = len(input_ids) - 2 * self.max_predictions_per_seq
        assert origin_input_ids_length == max_seq_length
        assert len(input_mask) - (2 * self.max_predictions_per_seq) == max_seq_length
        assert len(segment_ids) - (2 * self.max_predictions_per_seq) == max_seq_length

        masked_lm_positions = list(instance.masked_lm_positions)
        masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels)
        masked_lm_weights = [1.0] * len(masked_lm_ids)

        pseudo_masked_lm_positions = list(instance.pseudo_masked_lm_positions)
        pseudo_masked_lm_ids = [
            tokenizer.convert_tokens_to_ids(_)
            for _ in instance.pseudo_masked_lm_labels
        ]
        pseudo_masked_index = instance.pseudo_index
        masked_index = instance.masked_index
        pseudo_masked_sub_list_len = [
            len(sub_list) for sub_list in pseudo_masked_lm_positions
        ]

        # Pad the masked LM features up to max_predictions_per_seq.
        while len(masked_lm_positions) < max_predictions_per_seq:
            masked_lm_positions.append(0)
            masked_lm_ids.append(0)
            masked_lm_weights.append(0.0)
            pseudo_masked_lm_positions.append([0])
            pseudo_masked_index.append([0])
            masked_index.append(0)
            pseudo_masked_lm_ids.append([0])
        while len(pseudo_masked_sub_list_len) < max_predictions_per_seq:
            pseudo_masked_sub_list_len.append(0)

        next_sentence_label = 1 if instance.is_random_next else 0

        # Wrap each list in a tf.train feature proto.
        features = collections.OrderedDict()
        features["input_ids"] = about_tfrecord.create_int_feature(input_ids)
        features["input_mask"] = about_tfrecord.create_int_feature(input_mask)

        flatten_pseudo_masked_lm_positions = [
            _ for sub_list in pseudo_masked_lm_positions for _ in sub_list
        ]
        # features["pseudo_masked_lm_positions"] = about_tfrecord.create_int_feature(
        #     flatten_pseudo_masked_lm_positions)
        features["pseudo_masked_sub_list_len"] = about_tfrecord.create_int_feature(
            pseudo_masked_sub_list_len)

        flatten_pseudo_masked_index = [
            _ for sub_list in pseudo_masked_index for _ in sub_list
        ]
        features["pseudo_masked_index"] = about_tfrecord.create_int_feature(
            flatten_pseudo_masked_index)
        features["masked_index"] = about_tfrecord.create_int_feature(masked_index)

        flatten_pseudo_masked_lm_ids = [
            _ for sub_list in pseudo_masked_lm_ids for _ in sub_list
        ]
        features["pseudo_masked_lm_ids"] = about_tfrecord.create_int_feature(
            flatten_pseudo_masked_lm_ids)
        features["output_tokens_positions"] = about_tfrecord.create_int_feature(
            output_tokens_positions)
        features["segment_ids"] = about_tfrecord.create_int_feature(segment_ids)
        # features["masked_lm_positions"] = about_tfrecord.create_int_feature(
        #     masked_lm_positions)
        features["masked_lm_ids"] = about_tfrecord.create_int_feature(masked_lm_ids)
        features["masked_lm_weights"] = about_tfrecord.create_float_feature(
            masked_lm_weights)
        features["next_sentence_labels"] = about_tfrecord.create_int_feature(
            [next_sentence_label])

        tf_example = tf.train.Example(features=tf.train.Features(feature=features))

        # Round-robin the serialized examples across the output files.
        writers[writer_index].write(tf_example.SerializeToString())
        writer_index = (writer_index + 1) % len(writers)
        total_written += 1

        if inst_index < 20:
            tf.get_logger().info("*** Example ***")
            tf.get_logger().info("tokens: %s" % " ".join(
                [tokenization.printable_text(x) for x in instance.tokens]))
            for feature_name in features.keys():
                feature = features[feature_name]
                values = []
                if feature.int64_list.value:
                    values = feature.int64_list.value
                elif feature.float_list.value:
                    values = feature.float_list.value
                tf.get_logger().info(
                    "%s: %s" % (feature_name, " ".join([str(x) for x in values])))

    for writer in writers:
        writer.close()

    tf.get_logger().info("Wrote %d total instances", total_written)
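# Hedged usage sketch: reading one of the written files back to spot-check its
# contents. The feature names come from the writer above; exact lengths depend
# on its padding logic (e.g. `input_ids` is padded to
# max_seq_length + 2 * max_predictions_per_seq), so variable-length parsing is
# used here to avoid hard-coding sizes. The helper name, file path, and the
# subset of features are illustrative assumptions, not part of the original code.
def _inspect_tfrecord(path, num_records=1):
    """Prints a few parsed records from `path` for manual inspection."""
    name_to_features = {
        "input_ids": tf.io.VarLenFeature(tf.int64),
        "masked_lm_ids": tf.io.VarLenFeature(tf.int64),
        "next_sentence_labels": tf.io.VarLenFeature(tf.int64),
    }
    for raw_record in tf.data.TFRecordDataset(path).take(num_records):
        parsed = tf.io.parse_single_example(raw_record, name_to_features)
        for name, value in parsed.items():
            print(name, tf.sparse.to_dense(value).numpy())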