def __str__(self):
    s = ""
    s += "tokens: %s\n" % (" ".join(
        [tokenization.printable_text(x) for x in self.tokens]))
    s += "segment_ids: %s\n" % (" ".join(
        [str(x) for x in self.segment_ids]))
    s += "is_random_next: %s\n" % self.is_random_next
    s += "masked_lm_positions: %s\n" % (" ".join(
        [str(x) for x in self.masked_lm_positions]))
    s += "masked_lm_labels: %s\n" % (" ".join(
        [tokenization.printable_text(x) for x in self.masked_lm_labels]))
    s += "\n"
    return s
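
# The __str__ above reads from a TrainingInstance container that is not shown
# in this listing. A minimal sketch consistent with the attributes it accesses
# (field names are inferred from the code above, not confirmed by the source):
class TrainingInstance(object):
    """A single training instance (sentence pair) for masked-LM pre-training."""

    def __init__(self, tokens, segment_ids, masked_lm_positions,
                 masked_lm_labels, is_random_next):
        self.tokens = tokens
        self.segment_ids = segment_ids
        self.is_random_next = is_random_next
        self.masked_lm_positions = masked_lm_positions
        self.masked_lm_labels = masked_lm_labels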
def write_instance_to_example_files(instances, tokenizer, max_seq_length,
                                    max_predictions_per_seq, output_files):
    """Create TF example files from `TrainingInstance`s."""
    writers = []
    for output_file in output_files:
        writers.append(tf.io.TFRecordWriter(output_file))

    writer_index = 0

    total_written = 0
    for (inst_index, instance) in enumerate(instances):
        input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
        input_mask = [1] * len(input_ids)
        segment_ids = list(instance.segment_ids)
        assert len(input_ids) <= max_seq_length

        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        masked_lm_positions = list(instance.masked_lm_positions)
        masked_lm_ids = tokenizer.convert_tokens_to_ids(
            instance.masked_lm_labels)
        masked_lm_weights = [1.0] * len(masked_lm_ids)

        while len(masked_lm_positions) < max_predictions_per_seq:
            masked_lm_positions.append(0)
            masked_lm_ids.append(0)
            masked_lm_weights.append(0.0)

        next_sentence_label = 1 if instance.is_random_next else 0

        features = collections.OrderedDict()
        features["input_ids"] = create_int_feature(input_ids)
        features["input_mask"] = create_int_feature(input_mask)
        features["segment_ids"] = create_int_feature(segment_ids)
        features["masked_lm_positions"] = create_int_feature(
            masked_lm_positions)
        features["masked_lm_ids"] = create_int_feature(masked_lm_ids)
        features["masked_lm_weights"] = create_float_feature(masked_lm_weights)
        features["next_sentence_labels"] = create_int_feature(
            [next_sentence_label])

        tf_example = tf.train.Example(features=tf.train.Features(
            feature=features))

        writers[writer_index].write(tf_example.SerializeToString())
        writer_index = (writer_index + 1) % len(writers)

        total_written += 1

        if inst_index < 20:
            tf.get_logger().info("*** Example ***")
            tf.get_logger().info("tokens: %s" % " ".join(
                [tokenization.printable_text(x) for x in instance.tokens]))

            for feature_name in features.keys():
                feature = features[feature_name]
                values = []
                if feature.int64_list.value:
                    values = feature.int64_list.value
                elif feature.float_list.value:
                    values = feature.float_list.value
                tf.get_logger().info(
                    "%s: %s" %
                    (feature_name, " ".join([str(x) for x in values])))

    for writer in writers:
        writer.close()

    tf.get_logger().info("Wrote %d total instances", total_written)
def convert_examples_to_features(examples, seq_length, tokenizer):
    """Loads a data file into a list of `InputBatch`s."""

    features = []
    for (ex_index, example) in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.text_a)

        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)

        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > seq_length - 2:
                tokens_a = tokens_a[0:(seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0     0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = []
        input_type_ids = []
        tokens.append("[CLS]")
        input_type_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            input_type_ids.append(0)
        tokens.append("[SEP]")
        input_type_ids.append(0)

        if tokens_b:
            for token in tokens_b:
                tokens.append(token)
                input_type_ids.append(1)
            tokens.append("[SEP]")
            input_type_ids.append(1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < seq_length:
            input_ids.append(0)
            input_mask.append(0)
            input_type_ids.append(0)

        assert len(input_ids) == seq_length
        assert len(input_mask) == seq_length
        assert len(input_type_ids) == seq_length

        if ex_index < 5:
            absl.logging.info("*** Example ***")
            absl.logging.info("unique_id: %s" % (example.unique_id))
            absl.logging.info(
                "tokens: %s" %
                " ".join([tokenization.printable_text(x) for x in tokens]))
            absl.logging.info("input_ids: %s" %
                              " ".join([str(x) for x in input_ids]))
            absl.logging.info("input_mask: %s" %
                              " ".join([str(x) for x in input_mask]))
            absl.logging.info("input_type_ids: %s" %
                              " ".join([str(x) for x in input_type_ids]))

        features.append(
            InputFeatures(unique_id=example.unique_id,
                          tokens=tokens,
                          input_ids=input_ids,
                          input_mask=input_mask,
                          input_type_ids=input_type_ids))
    return features
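
# _truncate_seq_pair is called above but not included in this listing. A
# minimal sketch of the usual strategy: pop one token at a time from the
# longer segment so both segments share the length budget roughly equally.
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to a maximum total length."""
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()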
# Example 4
    def _convert_single_example(self, ex_index, example):
        label_map = {}
        for (i, label) in enumerate(self.get_labels()):
            label_map[label] = i

        tokens_a = self.tokenizer.tokenize(example.text_a)
        tokens_b = self.tokenizer.tokenize(example.text_b)
        if tokens_a:
            # Account for [CLS] and [SEP] with "- 2"
            tokens_a = tokens_a[0:(self.max_seq_length - 2)]

        if tokens_b:
            # Account for [CLS] and [SEP] with "- 2"
            tokens_b = tokens_b[0:(self.max_seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0     0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = []
        tokens.append("[CLS]")
        for token in tokens_a:
            tokens.append(token)
        tokens.append("[SEP]")
        input_ids_a = self.tokenizer.convert_tokens_to_ids(tokens)
        input_mask_a = [1] * len(input_ids_a)

        tokens = []
        tokens.append("[CLS]")
        if tokens_b:
            for token in tokens_b:
                tokens.append(token)
            tokens.append("[SEP]")

        input_ids_b = self.tokenizer.convert_tokens_to_ids(tokens)
        input_mask_b = [1] * len(input_ids_b)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.

        # Zero-pad up to the sequence length.
        while len(input_ids_a) < self.max_seq_length:
            input_ids_a.append(0)
            input_mask_a.append(0)

        while len(input_ids_b) < self.max_seq_length:
            input_ids_b.append(0)
            input_mask_b.append(0)

        assert len(input_ids_a) == self.max_seq_length
        assert len(input_ids_b) == self.max_seq_length

        label_id = label_map[example.label]
        if ex_index < 5:
            tf.get_logger().info("*** Example ***")
            tf.get_logger().info("guid: %s" % (example.guid))
            tf.get_logger().info("tokens: %s" % " ".join(
                [tokenization.printable_text(x) for x in tokens]))
            tf.get_logger().info("input_ids_a: %s" % " ".join([str(x) for x in input_ids_a]))
            tf.get_logger().info("input_mask_a: %s" % " ".join([str(x) for x in input_mask_a]))
            tf.get_logger().info("input_ids_b: %s" % " ".join([str(x) for x in input_ids_b]))
            tf.get_logger().info("input_mask_b: %s" % " ".join([str(x) for x in input_mask_b]))
            tf.get_logger().info("label: %s (id = %d)" % (example.label, label_id))

        feature = SiameseFeatures(
            input_ids_a=input_ids_a,
            input_mask_a=input_mask_a,
            input_ids_b=input_ids_b,
            input_mask_b=input_mask_b,
            label_id=label_id)
        return feature
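
# SiameseFeatures is not defined in this listing. A minimal sketch that matches
# the constructor call above: a plain per-example container whose field names
# simply mirror the keyword arguments (an assumption, not the original class).
class SiameseFeatures(object):
    def __init__(self, input_ids_a, input_mask_a, input_ids_b, input_mask_b,
                 label_id):
        self.input_ids_a = input_ids_a
        self.input_mask_a = input_mask_a
        self.input_ids_b = input_ids_b
        self.input_mask_b = input_mask_b
        self.label_id = label_id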
def convert_single_example(ex_index, example, label_list, max_seq_length,
                           tokenizer):
    """Converts a single `InputExample` into a single `InputFeatures`."""

    if isinstance(example, PaddingInputExample):
        return InputFeatures(
            input_ids=[0] * max_seq_length,
            input_mask=[0] * max_seq_length,
            segment_ids=[0] * max_seq_length,
            label_id=0,
            is_real_example=False)

    label_map = {}
    for (i, label) in enumerate(label_list):
        label_map[label] = i

    tokens_a = tokenizer.tokenize(example.text_a)
    tokens_b = None
    if example.text_b:
        tokens_b = tokenizer.tokenize(example.text_b)

    if tokens_b:
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]

    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0     0   0   0  0     0 0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)

    if tokens_b:
        for token in tokens_b:
            tokens.append(token)
            segment_ids.append(1)
        tokens.append("[SEP]")
        segment_ids.append(1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    label_id = label_map[example.label]
    if ex_index < 5:
        logging.info("*** Example ***")
        logging.info("guid: %s" % (example.guid))
        logging.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in tokens]))
        logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        logging.info("label: %s (id = %d)" % (example.label, label_id))

    feature = InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_id=label_id,
        is_real_example=True)
    return feature
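
# InputFeatures and PaddingInputExample are used by convert_single_example but
# not shown here. A minimal sketch consistent with those calls (note that
# convert_examples_to_features earlier uses a differently-shaped InputFeatures
# with unique_id/tokens/input_type_ids fields):
class PaddingInputExample(object):
    """Fake example so batches contain an exact multiple of the batch size."""


class InputFeatures(object):
    """A single set of features for one classification example."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id,
                 is_real_example=True):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id
        self.is_real_example = is_real_example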
# Example 6
    def write_instance_to_example_files(self, instances, tokenizer,
                                        max_seq_length,
                                        max_predictions_per_seq, output_files):
        """Create TF example files from `TrainingInstance`s."""
        writers = []
        for output_file in output_files:
            writers.append(tf.io.TFRecordWriter(output_file))

        writer_index = 0

        total_written = 0
        for (inst_index, instance) in enumerate(instances):
            input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
            segment_ids = list(instance.segment_ids)
            input_mask = [1] * len(instance.tokens)

            origin_input_ids_length = len(input_ids) - 2 * len(
                instance.masked_lm_positions)
            output_tokens_positions = instance.output_tokens_positions

            assert origin_input_ids_length <= max_seq_length

            #while origin_input_ids_length < max_seq_length:
            while len(input_ids) < (self.max_seq_length +
                                    2 * self.max_predictions_per_seq):
                input_ids.append(0)
                input_mask.append(0)
                output_tokens_positions.append(0)
                segment_ids.append(0)
            origin_input_ids_length = len(
                input_ids) - 2 * self.max_predictions_per_seq

            assert origin_input_ids_length == max_seq_length
            assert len(input_mask) - (
                2 * self.max_predictions_per_seq) == max_seq_length
            assert len(segment_ids) - (
                2 * self.max_predictions_per_seq) == max_seq_length
            masked_lm_positions = list(instance.masked_lm_positions)
            masked_lm_ids = tokenizer.convert_tokens_to_ids(
                instance.masked_lm_labels)
            masked_lm_weights = [1.0] * len(masked_lm_ids)
            pseudo_masked_lm_positions = list(
                instance.pseudo_masked_lm_positions)
            pseudo_masked_lm_ids = [
                tokenizer.convert_tokens_to_ids(_)
                for _ in instance.pseudo_masked_lm_labels
            ]
            pseudo_masked_index = instance.pseudo_index
            masked_index = instance.masked_index
            pseudo_masked_sub_list_len = [
                len(sub_list) for sub_list in pseudo_masked_lm_positions
            ]

            while len(masked_lm_positions) < max_predictions_per_seq:
                masked_lm_positions.append(0)
                masked_lm_ids.append(0)
                masked_lm_weights.append(0.0)
                pseudo_masked_lm_positions.append([0])
                pseudo_masked_index.append([0])
                masked_index.append(0)
                pseudo_masked_lm_ids.append([0])

            while len(pseudo_masked_sub_list_len) < max_predictions_per_seq:
                pseudo_masked_sub_list_len.append(0)

            next_sentence_label = 1 if instance.is_random_next else 0

            features = collections.OrderedDict()
            features["input_ids"] = about_tfrecord.create_int_feature(
                input_ids)
            # input_mask is written as an int64 feature, like input_ids
            features["input_mask"] = about_tfrecord.create_int_feature(
                input_mask)

            flatten_pseudo_masked_lm_positions = [
                _ for sub_list in pseudo_masked_lm_positions for _ in sub_list
            ]
            #features["pseudo_masked_lm_positions"] = about_tfrecord.create_int_feature(flatten_pseudo_masked_lm_positions)
            features[
                "pseudo_masked_sub_list_len"] = about_tfrecord.create_int_feature(
                    pseudo_masked_sub_list_len)
            flatten_pseudo_masked_index = [
                _ for sub_list in pseudo_masked_index for _ in sub_list
            ]
            features[
                "pseudo_masked_index"] = about_tfrecord.create_int_feature(
                    flatten_pseudo_masked_index)
            features["masked_index"] = about_tfrecord.create_int_feature(
                masked_index)
            flatten_pseudo_masked_lm_ids = [
                _ for sub_list in pseudo_masked_lm_ids for _ in sub_list
            ]
            features[
                "pseudo_masked_lm_ids"] = about_tfrecord.create_int_feature(
                    flatten_pseudo_masked_lm_ids)
            features[
                "output_tokens_positions"] = about_tfrecord.create_int_feature(
                    output_tokens_positions)
            features["segment_ids"] = about_tfrecord.create_int_feature(
                segment_ids)
            #features["masked_lm_positions"] = about_tfrecord.create_int_feature(masked_lm_positions)
            features["masked_lm_ids"] = about_tfrecord.create_int_feature(
                masked_lm_ids)
            features[
                "masked_lm_weights"] = about_tfrecord.create_float_feature(
                    masked_lm_weights)
            features[
                "next_sentence_labels"] = about_tfrecord.create_int_feature(
                    [next_sentence_label])

            tf_example = tf.train.Example(features=tf.train.Features(
                feature=features))

            writers[writer_index].write(tf_example.SerializeToString())
            writer_index = (writer_index + 1) % len(writers)

            total_written += 1

            if inst_index < 20:
                tf.get_logger().info("*** Example ***")
                tf.get_logger().info("tokens: %s" % " ".join(
                    [tokenization.printable_text(x) for x in instance.tokens]))

                for feature_name in features.keys():
                    feature = features[feature_name]
                    values = []
                    if feature.int64_list.value:
                        values = feature.int64_list.value
                    elif feature.float_list.value:
                        values = feature.float_list.value
                    tf.get_logger().info(
                        "%s: %s" %
                        (feature_name, " ".join([str(x) for x in values])))

        for writer in writers:
            writer.close()
        tf.get_logger().info("Wrote %d total instances", total_written)