Example #1
0
    def convert(self, data):
        """Convert a 3D list to a DataSet object.

        :param data: A 3D tensor.
            Example::
                [
                    [ [premise_word_11, premise_word_12, ...], [hypothesis_word_11, hypothesis_word_12, ...], [label_1] ],
                    [ [premise_word_21, premise_word_22, ...], [hypothesis_word_21, hypothesis_word_22, ...], [label_2] ],
                    ...
                ]

        :return: A DataSet object.
        """

        data_set = DataSet()

        for example in data:
            p, h, l = example
            # list, list, str
            instance = Instance()
            instance.add_field("premise", p)
            instance.add_field("hypothesis", h)
            instance.add_field("truth", l)
            data_set.append(instance)
        data_set.apply(lambda ins: len(ins["premise"]), new_field_name="premise_len")
        data_set.apply(lambda ins: len(ins["hypothesis"]), new_field_name="hypothesis_len")
        data_set.set_input("premise", "hypothesis", "premise_len", "hypothesis_len")
        data_set.set_target("truth")
        return data_set
Example #2
0
 def convert(self, data):
     data_set = DataSet()
     for item in data:
         sent_words, sent_pos_tag = item[0], item[1]
         data_set.append(Instance(words=sent_words, tags=sent_pos_tag))
     data_set.apply(lambda ins: len(ins), new_field_name="seq_len")
     data_set.set_target("tags")
     data_set.set_input("sent_words")
     data_set.set_input("seq_len")
     return data_set
Example #3
0
    def test_apply(self):
        ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})
        ds.apply(lambda ins: ins["x"][::-1], new_field_name="rx")
        self.assertTrue("rx" in ds.field_arrays)
        self.assertEqual(ds.field_arrays["rx"].content[0], [4, 3, 2, 1])

        ds.apply(lambda ins: len(ins["y"]), new_field_name="y")
        self.assertEqual(ds.field_arrays["y"].content[0], 2)

        res = ds.apply(lambda ins: len(ins["x"]))
        self.assertTrue(isinstance(res, list) and len(res) > 0)
        self.assertTrue(res[0], 4)
Example #4
0
    def predict(self, content):
        if not hasattr(self, 'pipeline'):
            raise ValueError("You have to load model first.")

        # 1. 利用POS得到分词和pos tagging结果
        pos_out = self.pos_tagger.predict(content)
        # pos_out = ['这里/NN 是/VB 分词/NN 结果/NN'.split()]

        # 2. 组建dataset
        dataset = DataSet()
        dataset.add_field('wp', pos_out)
        dataset.apply(lambda x: ['<BOS>'] + [w.split('/')[0] for w in x['wp']],
                      new_field_name='words')
        dataset.apply(lambda x: ['<BOS>'] + [w.split('/')[1] for w in x['wp']],
                      new_field_name='pos')
        dataset.rename_field("words", "raw_words")

        # 3. 使用pipeline
        self.pipeline(dataset)
        dataset.apply(lambda x: [str(arc) for arc in x['arc_pred']],
                      new_field_name='arc_pred')
        dataset.apply(lambda x: [
            arc + '/' + label
            for arc, label in zip(x['arc_pred'], x['label_pred_seq'])
        ][1:],
                      new_field_name='output')
        # output like: [['2/top', '0/root', '4/nn', '2/dep']]
        return dataset.field_arrays['output'].content
Example #5
0
 def convert(self, data):
     data_set = DataSet()
     for item in data:
         sent_words = item[0]
         if self.pos is True and self.ner is True:
             instance = Instance(words=sent_words,
                                 pos_tags=item[1],
                                 ner=item[2])
         elif self.pos is True:
             instance = Instance(words=sent_words, pos_tags=item[1])
         elif self.ner is True:
             instance = Instance(words=sent_words, ner=item[1])
         else:
             instance = Instance(words=sent_words)
         data_set.append(instance)
     data_set.apply(lambda ins: len(ins["words"]), new_field_name="seq_len")
     return data_set
Example #6
0
    def predict(self, content):
        """

        :param content: list of list of str. Each string is a token(word).
        :return answer: list of list of str. Each string is a tag.
        """
        if not hasattr(self, "pipeline"):
            raise ValueError("You have to load model first.")

        sentence_list = []
        # 1. 检查sentence的类型
        if isinstance(content, str):
            sentence_list.append(content)
        elif isinstance(content, list):
            sentence_list = content

        # 2. 组建dataset
        dataset = DataSet()
        dataset.add_field("words", sentence_list)

        # 3. 使用pipeline
        self.pipeline(dataset)

        def decode_tags(ins):
            pred_tags = ins["tag"]
            chars = ins["words"]
            words = []
            start_idx = 0
            for idx, tag in enumerate(pred_tags):
                if tag[0] == "S":
                    words.append(chars[start_idx:idx + 1] + "/" + tag[2:])
                    start_idx = idx + 1
                elif tag[0] == "E":
                    words.append("".join(chars[start_idx:idx + 1]) + "/" +
                                 tag[2:])
                    start_idx = idx + 1
            return words

        dataset.apply(decode_tags, new_field_name="tag_output")

        output = dataset.field_arrays["tag_output"].content
        if isinstance(content, str):
            return output[0]
        elif isinstance(content, list):
            return output
Example #7
0
 def test_BucketSampler(self):
     sampler = BucketSampler(num_buckets=3, batch_size=16, seq_lens_field_name="seq_len")
     data_set = DataSet({"x": [[0] * random.randint(1, 10)] * 10, "y": [[5, 6]] * 10})
     data_set.apply(lambda ins: len(ins["x"]), new_field_name="seq_len")
     indices = sampler(data_set)
     self.assertEqual(len(indices), 10)
Example #8
0
 def test_apply(self):
     ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})
     ds.apply(lambda ins: ins["x"][::-1], new_field_name="rx")
     self.assertTrue("rx" in ds.field_arrays)
     self.assertEqual(ds.field_arrays["rx"].content[0], [4, 3, 2, 1])