Example #1
import numpy as np

def input_from_line(line, max_seq_length, tag_to_id):
    """
    Take sentence data and return an input for
    the training or the evaluation function.
    """
    string = [w[0].strip() for w in line]
    char_line = ' '.join(string)  # join the characters with spaces
    text = tokenization.convert_to_unicode(char_line)

    tags = ['O' for _ in string]

    labels = ' '.join(tags)  # join the tags with spaces
    labels = tokenization.convert_to_unicode(labels)

    ids, mask, segment_ids, label_ids = convert_single_example(char_line=text,
                                                               tag_to_id=tag_to_id,
                                                               max_seq_length=max_seq_length,
                                                               tokenizer=tokenizer,
                                                               label_line=labels)
    # Reshape each array into a batch of one example.
    segment_ids = np.reshape(segment_ids, (1, max_seq_length))
    ids = np.reshape(ids, (1, max_seq_length))
    mask = np.reshape(mask, (1, max_seq_length))
    label_ids = np.reshape(label_ids, (1, max_seq_length))
    return [string, segment_ids, ids, mask, label_ids]
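
A minimal usage sketch for input_from_line, assuming the surrounding module already provides tokenization, convert_single_example, and a module-level tokenizer (as in the BERT-based NER code this example is drawn from); the sentence and the tag_to_id map below are hypothetical:

# Hypothetical input in the per-character [char, tag] format used above.
line = [['我', 'O'], ['爱', 'O'], ['北', 'B-LOC'], ['京', 'I-LOC']]
tag_to_id = {'O': 0, 'B-LOC': 1, 'I-LOC': 2}  # assumed label map

string, segment_ids, ids, mask, label_ids = input_from_line(
    line, max_seq_length=128, tag_to_id=tag_to_id)
# string is the original character list; the four id/mask arrays
# each have shape (1, 128), i.e. a batch of one example.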
Example #2
def prepare_dataset(sentences, max_seq_length, tag_to_id, lower=False, train=True):
    """
    Prepare the dataset. Return a list of lists of dictionaries containing:
        - word indexes
        - word char indexes
        - tag indexes
    """
    def f(x):
        return x.lower() if lower else x
    data = []
    for s in sentences:
        string = [w[0].strip() for w in s]
        # chars = [char_to_id[f(w) if f(w) in char_to_id else '<UNK>']
        #          for w in string]
        char_line = ' '.join(string)  # join the characters with spaces
        text = tokenization.convert_to_unicode(char_line)

        if train:
            tags = [w[-1] for w in s]
        else:
            tags = ['O' for _ in string]

        labels = ' '.join(tags)  # join the tags with spaces
        labels = tokenization.convert_to_unicode(labels)

        ids, mask, segment_ids, label_ids = convert_single_example(char_line=text,
                                                                   tag_to_id=tag_to_id,
                                                                   max_seq_length=max_seq_length,
                                                                   tokenizer=tokenizer,
                                                                   label_line=labels)
        data.append([string, segment_ids, ids, mask, label_ids])

    return data
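
A usage sketch for prepare_dataset under the same assumptions as above; the sentences and label map here are hypothetical:

# Hypothetical training data: a list of sentences, each a list of
# [character, tag] pairs as read from a CoNLL-style file.
sentences = [
    [['北', 'B-LOC'], ['京', 'I-LOC'], ['很', 'O'], ['大', 'O']],
    [['你', 'O'], ['好', 'O']],
]
tag_to_id = {'O': 0, 'B-LOC': 1, 'I-LOC': 2}  # assumed label map

train_data = prepare_dataset(sentences, max_seq_length=128, tag_to_id=tag_to_id)
# Each entry is [string, segment_ids, ids, mask, label_ids] for one sentence.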
Example #3
 def get_test_examples(self, data_dir):
     """See base class."""
     lines = self._read_tsv(os.path.join(curr_path, data_dir))
     examples = []
     for (i, line) in enumerate(lines):
         if i == 0:
             continue
         guid = "dev-%d" % (i)
         text_a = tokenization.convert_to_unicode(line[2])
         text_b = tokenization.convert_to_unicode(line[3])
         label = tokenization.convert_to_unicode("0")
         examples.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=text_b,
                          label=label))
     return examples
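
A sketch of how this method is typically called, assuming it lives on a DataProcessor subclass whose base class supplies _read_tsv and whose module defines curr_path; the class name and file path below are hypothetical:

processor = MyProcessor()  # hypothetical DataProcessor subclass holding the method
examples = processor.get_test_examples("data/test.tsv")
for ex in examples[:3]:
    print(ex.guid, ex.text_a, ex.text_b, ex.label)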
Example #4
 def _to_example(sentences):
     """
     Convert sentences to InputExamples.
     :param sentences: list of strings
     :return: generator of InputExample
     """
     import re
     unique_id = 0
     for ss in sentences:
         line = tokenization.convert_to_unicode(ss)
         if not line:
             continue
         line = line.strip()
         text_a = None
         text_b = None
         m = re.match(r"^(.*) \|\|\| (.*)$", line)
         if m is None:
             text_a = line
         else:
             text_a = m.group(1)
             text_b = m.group(2)
         yield InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)
         unique_id += 1
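
Because _to_example is a generator, it can be consumed lazily; a line containing " ||| " is split into a sentence pair, otherwise text_b stays None. A usage sketch (the input strings are hypothetical):

sentences = [
    "how are you",                    # single sentence: text_b is None
    "first sentence ||| second one",  # pair split on ' ||| '
]
for example in _to_example(sentences):
    print(example.unique_id, example.text_a, example.text_b)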