Example #1
 def _create_examples(lines, set_type):
     r"""Creates examples for the training and dev sets."""
     examples = []
     if set_type in ('train', 'dev'):
         for (i, line) in enumerate(lines):
             if i == 0:
                 continue
             guid = f"{set_type}-{i}"
             text_a = tokenization.convert_to_unicode(line[0])
             # Single sentence classification, text_b doesn't exist
             text_b = None
             label = tokenization.convert_to_unicode(line[1])
             examples.append(
                 InputExample(guid=guid,
                              text_a=text_a,
                              text_b=text_b,
                              label=label))
     if set_type == 'test':
         for (i, line) in enumerate(lines):
             if i == 0:
                 continue
             guid = f"{set_type}-{i}"
             text_a = tokenization.convert_to_unicode(line[1])
             # Single sentence classification, text_b doesn't exist
             text_b = None
             label = '0'  # arbitrarily set to '0'; the test set is unlabeled
             examples.append(
                 InputExample(guid=guid,
                              text_a=text_a,
                              text_b=text_b,
                              label=label))
     return examples
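
All of the examples on this page lean on two small helpers from the BERT-style codebase: `tokenization.convert_to_unicode` and the `InputExample` container. For reference, here is a minimal sketch of what they typically look like (an assumption based on the common BERT reference implementation, not necessarily the exact modules these snippets import):

def convert_to_unicode(text):
    """Returns `text` as a str, decoding utf-8 bytes and ignoring errors."""
    if isinstance(text, str):
        return text
    if isinstance(text, bytes):
        return text.decode("utf-8", "ignore")
    raise ValueError("Unsupported string type: %s" % type(text))


class InputExample(object):
    """A single train/dev/test example for sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        self.guid = guid      # unique id, e.g. "train-12"
        self.text_a = text_a  # first (or only) text sequence
        self.text_b = text_b  # optional second sequence for pair tasks
        self.label = label    # string label; a placeholder for test sets

Later examples use slight variants of this container (e.g. a `unique_id` instead of `guid`, or a `labels` field), but the pattern is the same.
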
Example #2
 def _create_example(self, lines, set_type):
     examples = []
     for (i, line) in enumerate(lines):
         guid = "%s-%s" % (set_type, i)
         text = tokenization.convert_to_unicode(line[1])
         labels = tokenization.convert_to_unicode(line[0])
         examples.append(InputExample(guid=guid, text=text, labels=labels))
     return examples
Example #3
 def _create_examples(self, lines, set_type):
     """create examples for training and test"""
     examples = []
     for (i, line) in enumerate(lines):
         guid = "%s-%s" % (set_type, i)
         if set_type != 'test':
             text_a = tokenization.convert_to_unicode(line[1])
             label = tokenization.convert_to_unicode(line[0])
         else:
             text_a = tokenization.convert_to_unicode(line[1])
             # The test set is unlabeled, so a fixed placeholder label is used.
             label = "health"
         examples.append(InputExample(guid=guid, text_a=text_a,
                                      label=label))
     return examples
Example #4
 def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         guid = "%s-%s" % (set_type,
                           tokenization.convert_to_unicode("0000"))
         text_a = tokenization.convert_to_unicode(line[1])
         text_b = tokenization.convert_to_unicode(line[2])
         if set_type == "test":
             label = "0"
         else:
             label = tokenization.convert_to_unicode(line[0])
         examples.append(
             InputExample(
                 guid=guid, text_a=text_a, text_b=text_b, label=label))
     return examples
Example #5
 def _create_examples(lines, set_type):
     r"""Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         if i == 0:
             continue
         guid = f"{set_type}-{i}"
         text_a = tokenization.convert_to_unicode(line[3])
         text_b = tokenization.convert_to_unicode(line[4])
         if set_type == "test":
             label = "0"
         else:
             label = tokenization.convert_to_unicode(line[0])
         examples.append(InputExample(guid=guid, text_a=text_a,
                                      text_b=text_b, label=label))
     return examples
Example #6
 def _create_examples(lines, set_type):
     r"""Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         # Only the test set has a header
         if set_type == "test" and i == 0:
             continue
         guid = f"{set_type}-{i}"
         if set_type == "test":
             text_a = tokenization.convert_to_unicode(line[1])
             label = "0"
         else:
             text_a = tokenization.convert_to_unicode(line[3])
             label = tokenization.convert_to_unicode(line[1])
         examples.append(InputExample(guid=guid, text_a=text_a,
                                      text_b=None, label=label))
     return examples
Example #7
 def get_dev_examples(self, data_dir):
     r"""See base class."""
     lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))
     examples = []
     for (i, line) in enumerate(lines):
         if i == 0:
             continue
         guid = f"dev-{i}"
         language = tokenization.convert_to_unicode(line[0])
         if language != tokenization.convert_to_unicode(self.language):
             continue
         text_a = tokenization.convert_to_unicode(line[6])
         text_b = tokenization.convert_to_unicode(line[7])
         label = tokenization.convert_to_unicode(line[1])
         examples.append(InputExample(guid=guid, text_a=text_a,
                                      text_b=text_b, label=label))
     return examples
Example #8
 def get_train_examples(self, data_dir):
     r"""See base class."""
     lines = self._read_tsv(
         os.path.join(data_dir, "multinli",
                      f"multinli.train.{self.language}.tsv"))
     examples = []
     for (i, line) in enumerate(lines):
         if i == 0:
             continue
         guid = f"train-{i}"
         text_a = tokenization.convert_to_unicode(line[0])
         text_b = tokenization.convert_to_unicode(line[1])
         label = tokenization.convert_to_unicode(line[2])
         if label == tokenization.convert_to_unicode("contradictory"):
             label = tokenization.convert_to_unicode("contradiction")
         examples.append(InputExample(guid=guid, text_a=text_a,
                                      text_b=text_b, label=label))
     return examples
Example #9
 def _read_numerical_file(self, fp, delimiter=";"):
     for i, line in enumerate(fp):
         cols = tokenization.convert_to_unicode(line).strip().split(delimiter)
         cols = [[int(x) for x in col.split(" ")] for col in cols]
         if len(cols) > self.num_numerical_fields:
             cols = cols[:self.num_numerical_fields]
         tgt_start_idx = cols[0].index(self.bos_id, 1)
         record = self.Record(*cols, tgt_start_idx=tgt_start_idx, data_id=i)
         yield record
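
Each line handled by `_read_numerical_file` is a `delimiter`-separated list of fields, where every field is itself a space-separated list of integer ids and the target span starts at the first `bos_id` after position 0. A standalone illustration of that parsing on a hypothetical line (the field meanings and `bos_id = 1` are assumptions made purely for this sketch):

line = "1 23 45 67 1 89 2;0 0 0 0 1 1 1;0 1 2 3 0 1 2"  # hypothetical record
bos_id = 1  # assumed begin-of-sequence id

cols = [[int(x) for x in field.split(" ")] for field in line.strip().split(";")]
tgt_start_idx = cols[0].index(bos_id, 1)  # first bos after the source part
print(tgt_start_idx)  # -> 4
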
Example #10
def read_examples(input_file):
    """Read a list of `InputExample`s from an input file."""
    examples = []
    unique_id = 0
    for sent in input_file:
        sent = ' '.join(sent)
        line = tokenization.convert_to_unicode(sent)
        line = line.strip()
        text_a = None
        text_b = None
        m = re.match(r"^(.*) \|\|\| (.*)$", line)
        if m is None:
            text_a = line
        else:
            text_a = m.group(1)
            text_b = m.group(2)
        examples.append(
            InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
        unique_id += 1

    return examples
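
The `|||` separator decides whether a line becomes a single-sentence or a sentence-pair example. A quick standalone check of the regex used above, on a hypothetical line:

import re

line = "it is raining outside ||| the weather is wet"  # hypothetical input
m = re.match(r"^(.*) \|\|\| (.*)$", line)
if m is None:
    text_a, text_b = line, None
else:
    text_a, text_b = m.group(1), m.group(2)
print(text_a)  # 'it is raining outside'
print(text_b)  # 'the weather is wet'
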
Example #11
                        default="",
                        metavar='STRING',
                        help='input file path')
    parser.add_argument('--vocab_file',
                        type=str,
                        default="",
                        metavar='STRING',
                        help='vocab file path')
    args = parser.parse_args()

    input_file = args.input_file
    max_seq_length = args.max_seq_length
    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=True)
    with tf.gfile.Open(input_file, "r") as f:
        reader = csv.reader(f, delimiter="\t")
        for line in reader:
            text_a = tokenization.convert_to_unicode(line[3])
            text_b = tokenization.convert_to_unicode(line[4])

            a_input_ids, a_input_mask, a_segment_ids = convert_single_example(
                text_a,
                None,
                max_seq_length=max_seq_length,
                tokenizer=tokenizer)
            b_input_ids, b_input_mask, b_segment_ids = convert_single_example(
                text_b,
                None,
                max_seq_length=max_seq_length,
                tokenizer=tokenizer)
Example #12
def convert_single_example_to_unicode(guid, single_example):
    text_a = tokenization.convert_to_unicode(single_example[0])
    text_b = tokenization.convert_to_unicode(single_example[1])
    label = tokenization.convert_to_unicode(single_example[2])
    return InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
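
Usage is straightforward: the helper expects a three-element row in `(text_a, text_b, label)` order. A hypothetical call:

row = ["The cat sat on the mat.", "A cat is sitting.", "entailment"]  # hypothetical TSV row
example = convert_single_example_to_unicode("train-0", row)
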
Example #13
def convert_examples_to_features(examples,
                                 label_list,
                                 seq_length,
                                 tokenizer,
                                 trunc_keep_right,
                                 data_stats=None,
                                 aug_ops=None):
    """convert examples to features."""

    label_map = {}
    for (i, label) in enumerate(label_list):
        label_map[label] = i

    tf.logging.info("number of examples to process: {}".format(len(examples)))

    features = []

    if aug_ops:
        tf.logging.info("building vocab")
        word_vocab = build_vocab(examples)
        examples = word_level_augment.word_level_augment(
            examples, aug_ops, word_vocab, data_stats)

    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            tf.logging.info("processing {:d}".format(ex_index))
        tokens_a = tokenizer.tokenize_to_wordpiece(example.word_list_a)
        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize_to_wordpiece(example.word_list_b)

        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            if trunc_keep_right:
                _truncate_seq_pair_keep_right(tokens_a, tokens_b,
                                              seq_length - 3)
            else:
                _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > seq_length - 2:
                if trunc_keep_right:
                    tokens_a = tokens_a[-(seq_length - 2):]
                else:
                    tokens_a = tokens_a[0:(seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0     0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = []
        input_type_ids = []
        tokens.append("[CLS]")
        input_type_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            input_type_ids.append(0)
        tokens.append("[SEP]")
        input_type_ids.append(0)

        if tokens_b:
            for token in tokens_b:
                tokens.append(token)
                input_type_ids.append(1)
            tokens.append("[SEP]")
            input_type_ids.append(1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < seq_length:
            input_ids.append(0)
            input_mask.append(0)
            input_type_ids.append(0)

        assert len(input_ids) == seq_length
        assert len(input_mask) == seq_length
        assert len(input_type_ids) == seq_length

        label_id = label_map[example.label]
        if ex_index < 1:
            tf.logging.info("*** Example ***")
            tf.logging.info("guid: %s" % (example.guid))
            # st = " ".join([str(x) for x in tokens])
            st = ""
            for x in tokens:
                st += tokenization.convert_to_unicode(x)
            tf.logging.info("tokens: %s" % st)
            tf.logging.info("input_ids: %s" %
                            " ".join([str(x) for x in input_ids]))
            tf.logging.info("input_mask: %s" %
                            " ".join([str(x) for x in input_mask]))
            tf.logging.info("input_type_ids: %s" %
                            " ".join([str(x) for x in input_type_ids]))
            tf.logging.info("label: %s (id = %d)" % (example.label, label_id))

        features.append(
            InputFeatures(input_ids=input_ids,
                          input_mask=input_mask,
                          input_type_ids=input_type_ids,
                          label_id=label_id))
    return features
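
The [CLS]/[SEP]/type-id/padding convention described in the comments can be made concrete with a tiny hand-built pair; the token strings and `seq_length` below are purely illustrative, and `input_ids` is omitted because it requires a real vocabulary:

seq_length = 16
tokens_a = ["is", "this", "jack", "##son", "##ville", "?"]
tokens_b = ["no", "it", "is", "not", "."]

tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
input_type_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)

# The mask marks real tokens with 1; everything is zero-padded to seq_length.
input_mask = [1] * len(tokens) + [0] * (seq_length - len(tokens))
input_type_ids += [0] * (seq_length - len(input_type_ids))

assert len(input_mask) == len(input_type_ids) == seq_length
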
Example #14
    def _convert_example_to_record(self, example, is_infer):
        # process src
        src_token_ids = []
        src_pos_ids = []
        if self.use_role:
            src_role_ids = []
            role_id_list = []

        # tokenize src
        s_token_ids_list = []
        for s in example.src.split("[SEP]"):
            s = tokenization.convert_to_unicode(s).strip()
            if self.use_role:
                s, role_id = s.split("\1")
                role_id = int(role_id)
                role_id_list.append(role_id)

            if self.data_format == "tokenized":
                s_tokens = s.split(" ")
            else:
                s_tokens = self.tokenizer.tokenize(s)

            s_token_ids = self.tokenizer.convert_tokens_to_ids(s_tokens) + [
                self.eos_id
            ]
            s_token_ids_list.append(s_token_ids)

        # trim src
        idx = len(s_token_ids_list) - 1
        total_token_num = 1
        while idx >= 0:
            total_token_num += len(s_token_ids_list[idx])
            if total_token_num > self.max_src_len:
                if self.truncate_first_turn and idx == 0:
                    truncated_ids = s_token_ids_list[idx][:self.max_src_len -
                                                          total_token_num]
                    if len(truncated_ids) > 1:
                        s_token_ids_list[idx] = truncated_ids[:-1] + [
                            self.eos_id
                        ]
                        idx -= 1
                break
            idx -= 1

        for i, s_token_ids in enumerate(s_token_ids_list[idx + 1:], idx + 1):
            src_token_ids += s_token_ids
            src_pos_ids += list(range(1, len(s_token_ids) + 1))
            if self.use_role:
                src_role_ids += [role_id_list[i]] * len(s_token_ids)

        src_token_ids = [self.bos_id] + src_token_ids
        src_type_ids = [0] * len(src_token_ids)
        src_pos_ids = [0] + src_pos_ids
        if self.use_role:
            src_role_ids = [0] + src_role_ids
        assert len(src_token_ids) == len(src_type_ids) == len(src_pos_ids), \
            "not len(src_token_ids) == len(src_type_ids) == len(src_pos_ids)"

        token_ids = src_token_ids
        type_ids = src_type_ids
        pos_ids = src_pos_ids
        if self.use_role:
            role_ids = src_role_ids
        tgt_start_idx = len(token_ids)

        if not is_infer:
            # process tgt
            # tokenize tgt
            tgt = tokenization.convert_to_unicode(example.tgt).strip()
            if self.data_format == "tokenized":
                tgt_tokens = tgt.split(" ")
            else:
                tgt_tokens = self.tokenizer.tokenize(tgt)

            tgt_token_ids = self.tokenizer.convert_tokens_to_ids(tgt_tokens)
            tgt_token_ids.append(self.eos_id)

            # trim tgt
            if len(tgt_token_ids) > self.max_tgt_len - 1:
                tgt_token_ids = tgt_token_ids[:self.max_tgt_len - 1]

            tgt_token_ids = [self.bos_id] + tgt_token_ids
            tgt_type_ids = [1] * len(tgt_token_ids)
            tgt_pos_ids = list(range(1, len(tgt_token_ids) + 1))
            if self.use_role:
                tgt_role_ids = [0] * len(tgt_token_ids)
            assert len(tgt_token_ids) == len(tgt_type_ids) == len(tgt_pos_ids), \
                "not len(tgt_token_ids) == len(tgt_type_ids) == len(tgt_pos_ids)"

            token_ids += tgt_token_ids
            type_ids += tgt_type_ids
            pos_ids += tgt_pos_ids
            if self.use_role:
                role_ids += tgt_role_ids

        assert len(token_ids) == len(type_ids) == len(pos_ids), \
            "not len(token_ids) == len(type_ids) == len(pos_ids)"

        if self.continuous_position:
            src_pos_ids = list(range(len(src_token_ids)))
            if not is_infer:
                tgt_pos_ids = list(range(len(tgt_token_ids)))
            pos_ids = list(range(len(token_ids)))

        # Because `token_ids`, `type_ids` and `pos_ids` were bound to the
        # `src_*` lists and extended in place with `+=`, the `src_*` names
        # below already hold the full (src + tgt) sequences.
        field_values = {
            "token_ids": src_token_ids,
            "type_ids": src_type_ids,
            "pos_ids": src_pos_ids
        }
        if self.use_role:
            field_values["role_ids"] = role_ids
        field_values["tgt_start_idx"] = tgt_start_idx
        field_values["data_id"] = example.data_id

        record = self.Record(**field_values)
        return record
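
The source side is expected to arrive as one string of `[SEP]`-joined turns, each turn optionally carrying a role id after a `"\1"` (i.e. `\x01`) control character when `use_role` is set. A hypothetical illustration of just that pre-parsing step (the turn texts and role ids are made up):

# Hypothetical multi-turn source; each turn is "<text>\x01<role_id>".
src = "how are you ?\x010 [SEP] fine , thanks !\x011"

turns, role_ids = [], []
for s in src.split("[SEP]"):
    text, role_id = s.strip().split("\1")  # "\1" is the same character as "\x01"
    turns.append(text)
    role_ids.append(int(role_id))
print(turns)     # ['how are you ?', 'fine , thanks !']
print(role_ids)  # [0, 1]
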
Example #15
    def _convert_example_to_record(self, example, max_seq_length, max_ent_cnt, tokenizer):
        input_tokens = []
        tok_to_sent = []
        tok_to_word = []
        for sent_idx, sent in enumerate(example.sents):
            for word_idx, word in enumerate(sent):
                word = tokenization.convert_to_unicode(word)
                tokens_tmp = tokenizer.tokenize(word)
                input_tokens += tokens_tmp
                tok_to_sent += [sent_idx] * len(tokens_tmp)
                tok_to_word += [word_idx] * len(tokens_tmp)

        if len(input_tokens) <= max_seq_length - 2:
            input_tokens = ['[CLS]'] + input_tokens + ['[SEP]']
            tok_to_sent = [None] + tok_to_sent + [None]
            tok_to_word = [None] + tok_to_word + [None]
            input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
            input_mask = [1] * len(input_ids)
            text_type_ids = [0] * len(input_ids)
            position_ids = list(range(len(input_ids)))
            # padding
            padding = [None] * (max_seq_length - len(input_ids))
            tok_to_sent += padding
            tok_to_word += padding
            padding = [0] * (max_seq_length - len(input_ids))
            input_mask += padding
            text_type_ids += padding
            padding = [0] * (max_seq_length - len(input_ids))
            input_ids += padding
            position_ids += padding

        else:
            input_tokens = input_tokens[:max_seq_length - 2]
            tok_to_sent = tok_to_sent[:max_seq_length - 2]
            tok_to_word = tok_to_word[:max_seq_length - 2]
            input_tokens = ['[CLS]'] + input_tokens + ['[SEP]']
            tok_to_sent = [None] + tok_to_sent + [None]
            tok_to_word = [None] + tok_to_word + [None]
            input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
            input_mask = [1] * len(input_ids)
            text_type_ids = [0] * len(input_ids)
            position_ids = list(range(len(input_ids)))

        # ent_mask & ner / coreference feature
        ent_mask = np.zeros((max_ent_cnt, max_seq_length), dtype='int64')
        ent_ner = [0] * max_seq_length
        ent_pos = [0] * max_seq_length
        tok_to_ent = [-1] * max_seq_length
        ents = example.vertexSet
        for ent_idx, ent in enumerate(ents):
            for mention in ent:
                for tok_idx in range(len(input_ids)):
                    if tok_to_sent[tok_idx] == mention['sent_id'] \
                            and mention['pos'][0] <= tok_to_word[tok_idx] < mention['pos'][1]:
                        ent_mask[ent_idx][tok_idx] = 1
                        ent_ner[tok_idx] = self.ner_map[ent[0]['type']]
                        ent_pos[tok_idx] = ent_idx + 1
                        tok_to_ent[tok_idx] = ent_idx

        # distance feature
        ent_first_appearance = [0] * max_ent_cnt
        ent_distance = np.zeros((max_ent_cnt, max_ent_cnt), dtype='int64')  # padding id is 10
        for i in range(len(ents)):
            if np.all(ent_mask[i] == 0):
                continue
            else:
                ent_first_appearance[i] = np.where(ent_mask[i] == 1)[0][0]
        for i in range(len(ents)):
            for j in range(len(ents)):
                if ent_first_appearance[i] != 0 and ent_first_appearance[j] != 0:
                    if ent_first_appearance[i] >= ent_first_appearance[j]:
                        ent_distance[i][j] = self.distance_buckets[ent_first_appearance[i] - ent_first_appearance[j]]
                    else:
                        ent_distance[i][j] = - self.distance_buckets[- ent_first_appearance[i] + ent_first_appearance[j]]
        ent_distance += 10  # norm from [-9, 9] to [1, 19]

        # structure prior for attentive bias
        # PRIOR DEFINITION  | share ent context |   diff ent context |    No ent
        # share sem context |    intra-coref    |    intra-relate    |    intra-NA
        # diff sem context  |    inter-coref    |    inter-relate    |
        structure_mask = np.zeros((5, max_seq_length, max_seq_length), dtype='float')
        for i in range(max_seq_length):
            if input_mask[i] == 0:
                break
            else:
                if tok_to_ent[i] != -1:
                    for j in range(max_seq_length):
                        if tok_to_sent[j] is None:
                            continue
                        #  intra
                        if tok_to_sent[j] == tok_to_sent[i]:
                            # intra-coref
                            if tok_to_ent[j] == tok_to_ent[i]:
                                structure_mask[0][i][j] = 1
                            # intra-relate
                            elif tok_to_ent[j] != -1:
                                structure_mask[1][i][j] = 1
                            # intra-NA
                            else:
                                structure_mask[2][i][j] = 1
                        else:
                            # inter-coref
                            if tok_to_ent[j] == tok_to_ent[i]:
                                structure_mask[3][i][j] = 1
                            # inter-relate
                            elif tok_to_ent[j] != -1:
                                structure_mask[4][i][j] = 1

        # label
        label_ids = np.zeros((max_ent_cnt, max_ent_cnt, len(self.label_map.keys())), dtype='int64')
        # test file does not have "labels"
        if example.labels is not None:
            labels = example.labels
            for label in labels:
                label_ids[label['h']][label['t']][self.label_map[label['r']]] = 1
        for h in range(len(ents)):
            for t in range(len(ents)):
                if np.all(label_ids[h][t] == 0):
                    label_ids[h][t][0] = 1

        label_mask = np.zeros((max_ent_cnt, max_ent_cnt), dtype='int64')
        label_mask[:len(ents), :len(ents)] = 1
        for ent in range(len(ents)):
            label_mask[ent][ent] = 0
        for ent in range(len(ents)):
            if np.all(ent_mask[ent] == 0):
                label_mask[ent, :] = 0
                label_mask[:, ent] = 0

        ent_mask = self.norm_mask(ent_mask)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(text_type_ids) == max_seq_length
        assert len(position_ids) == max_seq_length
        assert ent_mask.shape == (max_ent_cnt, max_seq_length)
        assert label_ids.shape == (max_ent_cnt, max_ent_cnt, len(self.label_map.keys()))
        assert label_mask.shape == (max_ent_cnt, max_ent_cnt)
        assert len(ent_ner) == max_seq_length
        assert len(ent_pos) == max_seq_length
        assert ent_distance.shape == (max_ent_cnt, max_ent_cnt)
        assert structure_mask.shape == (5, max_seq_length, max_seq_length)

        input_ids = np.expand_dims(input_ids, axis=-1).astype('int64')
        input_mask = np.expand_dims(input_mask, axis=-1).astype('int64')
        text_type_ids = np.expand_dims(text_type_ids, axis=-1).astype('int64')
        position_ids = np.expand_dims(position_ids, axis=-1).astype('int64')
        ent_ner = np.expand_dims(ent_ner, axis=-1).astype('int64')
        ent_pos = np.expand_dims(ent_pos, axis=-1).astype('int64')
        ent_distance = np.expand_dims(ent_distance, axis=-1).astype('int64')

        Record = namedtuple(
            'Record',
            ['token_ids', 'input_mask', 'text_type_ids', 'position_ids', 'ent_mask', 'label_ids',
             'label_mask', 'ent_ner', 'ent_pos', 'ent_distance', 'structure_mask'])
        record = Record(
            token_ids=input_ids,
            input_mask=input_mask,
            text_type_ids=text_type_ids,
            position_ids=position_ids,
            ent_mask=ent_mask,
            label_ids=label_ids,
            label_mask=label_mask,
            ent_ner=ent_ner,
            ent_pos=ent_pos,
            ent_distance=ent_distance,
            structure_mask=structure_mask)
        return record
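
`self.distance_buckets` is not shown in this snippet. A common choice that is consistent with the in-code comments (signed buckets spanning [-9, 9] before the +10 shift) is a log-spaced binning of the distance between first mentions; the exact cut points below are an assumption for illustration only:

import numpy as np

# Hypothetical log-spaced buckets: distance 0 -> 0, 1 -> 1, 2-3 -> 2, 4-7 -> 3, ...
# capped at bucket 9, so signed values fall in [-9, 9] before the +10 shift.
max_distance = 512
distance_buckets = np.zeros(max_distance, dtype='int64')
for bucket, lo in enumerate([1, 2, 4, 8, 16, 32, 64, 128, 256], start=1):
    distance_buckets[lo:] = bucket
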
Example #16
 def process_text(self, text):
   if self.use_spm:
     return tokenization.preprocess_text(text, lower=self.do_lower_case)
   else:
     return tokenization.convert_to_unicode(text)
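
This mirrors the ALBERT-style split between a SentencePiece pipeline and the WordPiece one: with a sentencepiece model the text is cleaned and optionally lowercased up front, otherwise only the unicode conversion is applied. A rough stand-in for what the SentencePiece-side helper is usually assumed to do (not the exact library code):

def preprocess_text(text, lower=False):
    """Rough stand-in for an ALBERT-style `tokenization.preprocess_text`."""
    out = " ".join(text.strip().split())             # collapse runs of whitespace
    out = out.replace("``", '"').replace("''", '"')  # normalize quote pairs
    if lower:
        out = out.lower()
    return out
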