Code example #1
def __str__(self):
  s = ""
  s += "tokens: %s\n" % (" ".join(
      [tokenization.printable_text(x) for x in self.tokens]))
  s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids]))
  s += "is_random_next: %s\n" % self.is_random_next
  s += "masked_lm_positions: %s\n" % (" ".join(
      [str(x) for x in self.masked_lm_positions]))
  s += "masked_lm_labels: %s\n" % (" ".join(
      [tokenization.printable_text(x) for x in self.masked_lm_labels]))
  s += "\n"
  return s
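
This __str__ belongs on a TrainingInstance-style container that stores the fields it prints. A minimal sketch of such a class, assuming the BERT-style constructor; the field names are taken from the method above:

class TrainingInstance(object):
  """Container for a single pre-training instance; only the printed fields are stored."""

  def __init__(self, tokens, segment_ids, masked_lm_positions,
               masked_lm_labels, is_random_next):
    self.tokens = tokens
    self.segment_ids = segment_ids
    self.is_random_next = is_random_next
    self.masked_lm_positions = masked_lm_positions
    self.masked_lm_labels = masked_lm_labels

  # With the __str__ above added as a method of this class, print(instance)
  # emits one "name: values" line per field.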
Code example #2
File: data_process.py  Project: tpoljak/BERT_RESSEL
def convert_analysis_example(example, max_seq_length=320, tokenizer=None):

    tokens_a = tokenizer.tokenize(example.text_a)  # 280
    print(tokens_a)

    tokens = []
    segment_ids = []
    input_mask = []
    position_ids = []
    zero_position_id = 0

    # tokens.append("[CLS]")
    # segment_ids.append(0)
    # input_mask.append(1)
    # position_ids.append(zero_position_id)

    # text_a
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
        input_mask.append(1)
        position_ids.append(zero_position_id)

    # tokens.append("[SEP]")
    # segment_ids.append(0)
    # input_mask.append(1)
    # position_ids.append(zero_position_id)
    print(len(tokens))

    # text_a padding
    while len(tokens) < max_seq_length:
        tokens.append("[PAD]")
        segment_ids.append(0)
        input_mask.append(0)
        position_ids.append(zero_position_id)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(position_ids) == max_seq_length

    tf.logging.info("*** Example ***")
    tf.logging.info("guid: %s" % (example.guid))
    tf.logging.info("tokens: %s" %
                    " ".join([tokenization.printable_text(x) for x in tokens]))

    feature = InputFeatures(input_ids=input_ids,
                            input_mask=input_mask,
                            segment_ids=segment_ids,
                            label_id=0,
                            input_length=len(tokens_a),
                            position_ids=position_ids,
                            is_real_example=True)

    return feature
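
The InputFeatures container returned here is not shown in these listings. A minimal sketch that accepts every keyword argument used in the data_process.py examples; the defaults for input_length, position_ids, and is_real_example are assumptions, chosen so that later examples which omit them still work:

class InputFeatures(object):
    """A single set of features; attribute names mirror the keyword arguments above."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id,
                 input_length=0, position_ids=None, is_real_example=True):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id
        self.input_length = input_length
        self.position_ids = position_ids
        self.is_real_example = is_real_example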
Code example #3
def write_instance_to_example_files(instances, tokenizer, max_seq_length,
                                    max_predictions_per_seq, output_files):
  """Create TF example files from `TrainingInstance`s."""
  writers = []
  for output_file in output_files:
    writers.append(tf.python_io.TFRecordWriter(output_file))

  writer_index = 0

  total_written = 0
  tf.logging.info("Total %d instances exist" % len(instances))
  for (inst_index, instance) in enumerate(instances):
    input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
    input_mask = [1] * len(input_ids)
    segment_ids = list(instance.segment_ids)
    assert len(input_ids) <= max_seq_length

    while len(input_ids) < max_seq_length:
      input_ids.append(0)
      input_mask.append(0)
      segment_ids.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    masked_lm_positions = list(instance.masked_lm_positions)
    masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels)
    masked_lm_weights = [1.0] * len(masked_lm_ids)

    while len(masked_lm_positions) < max_predictions_per_seq:
      masked_lm_positions.append(0)
      masked_lm_ids.append(0)
      masked_lm_weights.append(0.0)

    next_sentence_label = 1 if instance.is_random_next else 0

    features = collections.OrderedDict()
    features["input_ids"] = create_int_feature(input_ids)
    features["input_mask"] = create_int_feature(input_mask)
    features["segment_ids"] = create_int_feature(segment_ids)
    features["masked_lm_positions"] = create_int_feature(masked_lm_positions)
    features["masked_lm_ids"] = create_int_feature(masked_lm_ids)
    features["masked_lm_weights"] = create_float_feature(masked_lm_weights)
    features["next_sentence_labels"] = create_int_feature([next_sentence_label])

    tf_example = tf.train.Example(features=tf.train.Features(feature=features))

    writers[writer_index].write(tf_example.SerializeToString())
    writer_index = (writer_index + 1) % len(writers)

    total_written += 1

    if inst_index % 100000 == 0:
      tf.logging.info("Now %d data has been saved as tfrecord" % inst_index)

    if inst_index < 20:
      tf.logging.info("*** Example %s***" % inst_index)
      tf.logging.info("tokens: %s" % " ".join(
          [tokenization.printable_text(x) for x in instance.tokens]))

      for feature_name in features.keys():
        feature = features[feature_name]
        values = []
        if feature.int64_list.value:
          values = feature.int64_list.value
        elif feature.float_list.value:
          values = feature.float_list.value
        tf.logging.info(
            "%s: %s" % (feature_name, " ".join([str(x) for x in values])))

  for writer in writers:
    writer.close()

  tf.logging.info("Wrote %d total instances", total_written)
Code example #4
    def convert_single_example(self, ex_index, example, label_list,
                               max_seq_length, tokenizer):
        """Converts a single `InputExample` into a single `InputFeatures`."""
        label_map = {}
        for (i, label) in enumerate(label_list):
            label_map[label] = i

        tokens_a = tokenizer.tokenize(example.text_a)
        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)

        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[0:(max_seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0     0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = []
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)

        if tokens_b:
            for token in tokens_b:
                tokens.append(token)
                segment_ids.append(1)
            tokens.append("[SEP]")
            segment_ids.append(1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        label_id = label_map[example.label]
        if ex_index < 5:
            # tf.compat.v1.logging.info("*** Example ***")
            # tf.compat.v1.logging.info("guid: %s" % (example.guid))
            # tf.compat.v1.logging.info("tokens: %s" % " ".join(
            #     [tokenization.printable_text(x) for x in tokens]))
            # tf.compat.v1.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            # tf.compat.v1.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            # tf.compat.v1.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            # tf.compat.v1.logging.info("label: %s (id = %d)" % (example.label, label_id))
            tf.logging.info("*** Example ***")
            tf.logging.info("guid: %s" % (example.guid))
            tf.logging.info(
                "tokens: %s" %
                " ".join([tokenization.printable_text(x) for x in tokens]))
            tf.logging.info("input_ids: %s" %
                            " ".join([str(x) for x in input_ids]))
            tf.logging.info("input_mask: %s" %
                            " ".join([str(x) for x in input_mask]))
            tf.logging.info("segment_ids: %s" %
                            " ".join([str(x) for x in segment_ids]))
            tf.logging.info("label: %s (id = %d)" % (example.label, label_id))

        feature = InputFeatures(input_ids=input_ids,
                                input_mask=input_mask,
                                segment_ids=segment_ids,
                                label_id=label_id)
        return feature
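
self._truncate_seq_pair is called above but not shown. A short sketch matching the reference BERT implementation, which pops tokens from whichever sequence is currently longer until the pair fits the budget:

    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
        """Truncates a sequence pair in place to the maximum length."""
        while True:
            total_length = len(tokens_a) + len(tokens_b)
            if total_length <= max_length:
                break
            # Trim the longer sequence so truncation is spread across both sides.
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()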
Code example #5
File: data_process.py  Project: tpoljak/BERT_RESSEL
def convert_separate_example(ex_index,
                             example,
                             label_list,
                             max_seq_length,
                             max_seq_a,
                             max_seq_b,
                             tokenizer,
                             sentence_type="dialog"):
    if isinstance(example, PaddingInputExample):
        return InputFeatures(input_ids=[0] * max_seq_length,
                             input_mask=[0] * max_seq_length,
                             segment_ids=[0] * max_seq_length,
                             label_id=0,
                             input_length=0,
                             position_ids=[100] * max_seq_length,
                             is_real_example=False)

    label_map = {}
    for (i, label) in enumerate(label_list):
        label_map[label] = i

    tokens_a = tokenizer.tokenize(example.text_a)  # dialog context, e.g. up to 280 tokens (max_seq_a)
    tokens_b = tokenizer.tokenize(example.text_b)  # response, e.g. up to 40 tokens (max_seq_b)

    # #TODO:tokens_a : how many turns in a dialog
    # from collections import Counter
    # dialog_counter = Counter(tokens_a)
    # print(dialog_counter["[EOT]"])

    # 278 + [CLS] [SEP] : 280
    while len(tokens_a) + 2 > max_seq_a:
        if sentence_type == "dialog":
            tokens_a.pop(0)
        else:
            tokens_a.pop()

    # 39 + [SEP] : 40
    while len(tokens_b) + 1 > max_seq_b:
        tokens_b.pop()

    tokens = []
    segment_ids = []
    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = []
    input_lengths = []
    position_ids = []
    zero_position_id = 200

    tokens.append("[CLS]")
    segment_ids.append(0)
    input_mask.append(1)
    position_ids.append(zero_position_id)

    # position_ids = dialog_position_id(tokens_a, position_ids, reverse=True)
    # last_position_id = position_ids[-1] + 1

    # text_a
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
        input_mask.append(1)
        position_ids.append(zero_position_id)

    # assert len(position_ids) == len(tokens)

    tokens.append("[SEP]")
    segment_ids.append(0)
    input_mask.append(1)
    input_lengths.append(len(tokens))
    position_ids.append(zero_position_id)

    # text_a padding
    while len(tokens) < max_seq_a:
        tokens.append("[PAD]")
        segment_ids.append(0)
        input_mask.append(0)
        position_ids.append(zero_position_id)

    total_tokens_a = len(tokens)
    # text_b
    for token in tokens_b:
        tokens.append(token)
        segment_ids.append(1)
        input_mask.append(1)
        # for response position(should be the last position in a dialog context)
        # position_ids.append(last_position_id)
        position_ids.append(0)

    tokens.append("[SEP]")
    segment_ids.append(1)
    input_mask.append(1)
    input_lengths.append(len(tokens) - total_tokens_a)
    position_ids.append(zero_position_id)

    # text_b padding
    while len(tokens) < max_seq_length:
        tokens.append("[PAD]")
        segment_ids.append(1)
        input_mask.append(0)
        position_ids.append(zero_position_id)
    # print(tokens)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(position_ids) == max_seq_length

    label_id = label_map[example.label]
    if ex_index < 1:
        print("*** Example ***")
        print("guid: %s" % (example.guid))
        print("tokens: %s" %
              " ".join([tokenization.printable_text(x) for x in tokens]))
        print("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        print("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        print("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        print("label: %s (id = %d)" % (example.label, label_id))

    feature = InputFeatures(input_ids=input_ids,
                            input_mask=input_mask,
                            segment_ids=segment_ids,
                            label_id=label_id,
                            input_length=input_lengths,
                            position_ids=position_ids,
                            is_real_example=True)

    return feature
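
A hedged usage sketch for convert_separate_example, assuming BERT's standard tokenization.FullTokenizer and InputExample classes; the vocab path, guid, texts, and label are placeholders, and the 280/40 split follows the comments in the function:

# Hypothetical call; vocab.txt and the example contents are placeholders.
tokenizer = tokenization.FullTokenizer(vocab_file="vocab.txt", do_lower_case=True)
example = InputExample(guid="train-0",
                       text_a="how do i mount a usb drive ? [EOT] try the disks utility [EOT]",
                       text_b="thanks , that worked",
                       label="1")
feature = convert_separate_example(0, example, label_list=["0", "1"],
                                   max_seq_length=320, max_seq_a=280, max_seq_b=40,
                                   tokenizer=tokenizer, sentence_type="dialog")
# text_a is padded out to max_seq_a, and text_b fills the remaining 40 positions.
assert len(feature.input_ids) == 320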
Code example #6
File: data_process.py  Project: tpoljak/BERT_RESSEL
def convert_single_example(ex_index,
                           example,
                           label_list,
                           max_seq_length,
                           tokenizer,
                           sentence_type="dialog"):
    """Converts a single `InputExample` into a single `InputFeatures`."""

    if isinstance(example, PaddingInputExample):
        return InputFeatures(input_ids=[0] * max_seq_length,
                             input_mask=[0] * max_seq_length,
                             segment_ids=[0] * max_seq_length,
                             label_id=0,
                             input_length=0,
                             is_real_example=False)

    label_id = None
    if label_list:
        label_map = {}
        for (i, label) in enumerate(label_list):
            label_map[label] = i
        label_id = label_map[example.label]

    input_length = 0
    tokens_a = tokenizer.tokenize(example.text_a)

    # if dataset == "ubuntu":
    #   tokens_a = ubuntu_sep_token_append(tokens_a)

    tokens_b = None
    if example.text_b:
        tokens_b = tokenizer.tokenize(example.text_b)

    if tokens_b:
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            input_length = max_seq_length
            if sentence_type == "dialog":
                tokens_a = tokens_a[-(max_seq_length - 2):]
            else:
                tokens_a = tokens_a[0:(max_seq_length - 2)]
        else:
            input_length = len(tokens_a) + 2

    tokens = []
    segment_ids = []

    tokens.append("[CLS]")
    segment_ids.append(0)

    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)

    tokens.append("[SEP]")
    segment_ids.append(0)

    if tokens_b:
        for token in tokens_b:
            tokens.append(token)
            segment_ids.append(1)

        tokens.append("[SEP]")
        segment_ids.append(1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    if ex_index < 3:
        print("*** Example ***")
        print("guid: %s" % (example.guid))
        print("tokens: %s" %
              " ".join([tokenization.printable_text(x) for x in tokens]))
        tf.logging.info("input_ids: %s" % " ".join([str(x)
                                                    for x in input_ids]))
        tf.logging.info("input_mask: %s" %
                        " ".join([str(x) for x in input_mask]))
        tf.logging.info("segment_ids: %s" %
                        " ".join([str(x) for x in segment_ids]))

    feature = InputFeatures(input_ids=input_ids,
                            input_mask=input_mask,
                            segment_ids=segment_ids,
                            label_id=label_id,
                            input_length=input_length,
                            is_real_example=True)

    return feature
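
For the single-sentence path above, sentence_type decides which end of an over-long text_a is dropped: "dialog" keeps the last max_seq_length - 2 tokens (the most recent turns), while the default keeps the first. A small illustration with placeholder tokens:

# Placeholder token list, just to illustrate the truncation rule above.
tokens_a = ["t1", "t2", "t3", "t4", "t5", "t6"]
max_seq_length = 5  # leaves max_seq_length - 2 = 3 slots beside [CLS] and [SEP]

assert tokens_a[-(max_seq_length - 2):] == ["t4", "t5", "t6"]   # sentence_type == "dialog"
assert tokens_a[0:(max_seq_length - 2)] == ["t1", "t2", "t3"]   # otherwise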