Example #1
0
    def convert(self, data):
        """Build a DataSet of premise/hypothesis/label instances.

        :param data: An iterable of 3-element examples, each unpacked as
            ``(premise, hypothesis, label)``.
            Example::
                [
                    [ [premise_word_11, premise_word_12, ...], [hypothesis_word_11, hypothesis_word_12, ...], [label_1] ],
                    [ [premise_word_21, premise_word_22, ...], [hypothesis_word_21, hypothesis_word_22, ...], [label_2] ],
                    ...
                ]

        :return: A DataSet with the fields ``premise``, ``hypothesis``,
            ``truth``, ``premise_len`` and ``hypothesis_len``. The two text
            fields and their lengths are marked as input, ``truth`` as target.
        """
        dataset = DataSet()

        for premise, hypothesis, label in data:
            sample = Instance()
            named_values = (
                ("premise", premise),
                ("hypothesis", hypothesis),
                ("truth", label),
            )
            for field_name, value in named_values:
                sample.add_field(field_name, value)
            dataset.append(sample)

        # Length fields are derived only after every instance has been added.
        dataset.apply(lambda ins: len(ins["premise"]), new_field_name="premise_len")
        dataset.apply(lambda ins: len(ins["hypothesis"]), new_field_name="hypothesis_len")

        input_fields = ("premise", "hypothesis", "premise_len", "hypothesis_len")
        dataset.set_input(*input_fields)
        dataset.set_target("truth")
        return dataset
Example #2
0
def generate_fake_dataset(num_samples=1000):
    """Create a DataSet filled with random variable-length integer sequences.

    The DataSet has four fields named '0', '1', '2' and '3'. Each field holds
    ``num_samples`` numpy integer arrays whose values are drawn from [0, 100)
    and whose lengths are drawn from [10, 50). Every field is then randomly
    marked as either input or target.

    :param num_samples: number of samples per field
    :return: the generated DataSet
    """
    min_len, max_len = 10, 50
    field_names = [str(idx) for idx in range(4)]

    fields = {}
    for name in field_names:
        # Draw all lengths for this field first, then one array per length.
        lengths = np.random.randint(min_len, max_len, size=num_samples)
        fields[name] = [np.random.randint(100, size=length) for length in lengths]

    dataset = DataSet(fields)

    for name in field_names:
        # Coin flip: 0 marks the field as input, 1 marks it as target.
        mark = dataset.set_input if np.random.randint(2) == 0 else dataset.set_target
        mark(name)
    return dataset
Example #3
0
 def test_list_of_numpy_to_tensor(self):
     """Iterate non-numpy batches built from Instances holding numpy arrays of two lengths."""
     short_rows = [Instance(x=np.array([1, 2]), y=np.array([3, 4])) for _ in range(2)]
     long_rows = [Instance(x=np.array([1, 2, 3, 4]), y=np.array([3, 4, 5, 6])) for _ in range(2)]
     dataset = DataSet(short_rows + long_rows)
     dataset.set_input("x")
     dataset.set_target("y")
     batches = Batch(dataset, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
     # Smoke test only: the batches are printed, nothing is asserted.
     for batch_x, batch_y in batches:
         print(batch_x, batch_y)
Example #4
0
 def test_numpy_padding(self):
     """Ragged numpy fields come back from numpy batches with a common (4, 4) shape."""
     # dtype=object is explicit because NumPy >= 1.24 raises ValueError when
     # asked to build an array from ragged nested lists; older versions
     # produced the same 1-D object array implicitly.
     ds = DataSet({"x": np.array([[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10, dtype=object),
                   "y": np.array([[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10, dtype=object)})
     ds.set_input("x")
     ds.set_target("y")
     batches = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
     for x, y in batches:
         self.assertEqual(x["x"].shape, (4, 4))
         self.assertEqual(y["y"].shape, (4, 4))
Example #5
0
 def convert(self, data):
     """Build a sequence-labelling DataSet from (words, tags) pairs.

     :param data: an iterable of items where ``item[0]`` is the word sequence
         and ``item[1]`` is the matching tag sequence.
     :return: a DataSet with the fields ``words``, ``tags`` and ``seq_len``;
         ``words`` and ``seq_len`` are marked as input, ``tags`` as target.
     """
     data_set = DataSet()
     for item in data:
         sent_words, sent_pos_tag = item[0], item[1]
         data_set.append(Instance(words=sent_words, tags=sent_pos_tag))
     # The sequence length is the number of words in the instance.
     data_set.apply(lambda ins: len(ins["words"]), new_field_name="seq_len")
     data_set.set_target("tags")
     # The field is stored under "words"; "sent_words" is only a local name
     # and was never a field of the DataSet.
     data_set.set_input("words")
     data_set.set_input("seq_len")
     return data_set
Example #6
0
 def test_dataset_batching(self):
     """Numpy batches of fixed-width fields hold 4 rows each and keep the row contents."""
     dataset = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})
     dataset.set_input("x")
     dataset.set_target("y")
     batches = Batch(dataset, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
     for batch_x, batch_y in batches:
         self.assertTrue(isinstance(batch_x["x"], np.ndarray) and isinstance(batch_y["y"], np.ndarray))
         # Every batch holds exactly batch_size rows.
         for field_batch in (batch_x["x"], batch_y["y"]):
             self.assertEqual(len(field_batch), 4)
         # The last row of each batch still carries the original values.
         self.assertListEqual(list(batch_x["x"][-1]), [1, 2, 3, 4])
         self.assertListEqual(list(batch_y["y"][-1]), [5, 6])
Example #7
0
 def test_numpy_to_tensor(self):
     """Ragged numpy fields come back as (4, 4) torch tensors when as_numpy=False."""
     # dtype=object is explicit because NumPy >= 1.24 raises ValueError when
     # asked to build an array from ragged nested lists; older versions
     # produced the same 1-D object array implicitly.
     ds = DataSet({"x": np.array([[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10, dtype=object),
                   "y": np.array([[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10, dtype=object)})
     ds.set_input("x")
     ds.set_target("y")
     batches = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
     for x, y in batches:
         self.assertTrue(isinstance(x["x"], torch.Tensor))
         self.assertEqual(tuple(x["x"].shape), (4, 4))
         self.assertTrue(isinstance(y["y"], torch.Tensor))
         self.assertEqual(tuple(y["y"].shape), (4, 4))
Example #8
0
 def test_list_of_list_to_tensor(self):
     """List fields of two different lengths come back as (4, 4) torch tensors."""
     short_rows = [Instance(x=[1, 2], y=[3, 4]) for _ in range(2)]
     long_rows = [Instance(x=[1, 2, 3, 4], y=[3, 4, 5, 6]) for _ in range(2)]
     dataset = DataSet(short_rows + long_rows)
     dataset.set_input("x")
     dataset.set_target("y")
     batches = Batch(dataset, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
     for batch_x, batch_y in batches:
         # Check the input field first, then the target field.
         self.assertTrue(isinstance(batch_x["x"], torch.Tensor))
         self.assertEqual(tuple(batch_x["x"].shape), (4, 4))
         self.assertTrue(isinstance(batch_y["y"], torch.Tensor))
         self.assertEqual(tuple(batch_y["y"].shape), (4, 4))
Example #9
0
 def test_ModelProcessor(self):
     """Running a ModelProcessor over a DataSet yields a DataSet containing "pred"."""
     from fastNLP.models.cnn_text_classification import CNNText

     def random_instance():
         # Draw the length first, then that many word ids.
         length = np.random.randint(5, 30)
         word_ids = [np.random.randint(0, 100) for _ in range(length)]
         return Instance(word_seq=word_ids, seq_lens=length)

     model = CNNText(100, 100, 5)
     samples = [random_instance() for _ in range(64)]
     dataset = DataSet(samples)
     dataset.set_input("word_seq", "seq_lens")
     processor = ModelProcessor(model)
     processed = processor(dataset)
     self.assertTrue("pred" in processed)
Example #10
0
    def test_input_target(self):
        """set_input/set_target flag existing fields; set_input on unknown names raises KeyError."""
        dataset = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
        dataset.set_input("x")
        dataset.set_target("y")
        self.assertTrue(dataset.field_arrays["x"].is_input)
        self.assertTrue(dataset.field_arrays["y"].is_target)

        # NOTE(review): both checks go through set_input; the second one may
        # have been meant to exercise set_target — confirm.
        for missing_field in ("xxx", "yyy"):
            with self.assertRaises(KeyError):
                dataset.set_input(missing_field)
class CustomizedNER(object):
    """Character-level NER helper built around a saved model and a vocabulary file.

    Usage, as far as this class shows: call ``formatRowString`` with a raw
    string, read the ``dataset`` property to obtain the indexed DataSet, then
    pass that DataSet to ``result`` to iterate over the extracted entities.
    """

    # Label suffixes that are reported as entity types.
    _ENTITY_SUFFIXES = ("LOC", "PER", "ORG")

    def __init__(self, modelFile, vocabFile, addTarget2Vocab=False):
        """Load the vocabularies and the model.

        :param modelFile: passed to ``ModelLoader().load_pytorch_model``.
        :param vocabFile: path of the vocabulary file read by ``_read_vocab``.
        :param addTarget2Vocab: when true, a target vocabulary is also built
            and the target field is added to the input fields.
        """
        # CHAR_INPUT = "chars"; that column is later converted to word indices.
        self._vocabFile = vocabFile
        self._addTarget2Vocab = addTarget2Vocab
        self._CONST_CHAR = Const.CHAR_INPUT
        self._CONST_WORDS = Const.INPUT
        self._CONST_TARGET = Const.TARGET
        self._input_fields = [self._CONST_WORDS, Const.INPUT_LEN]
        (
            self._word_counter,
            self._word_vocab,
            self._target_counter,
            self._target_vocab,
            self._target,
        ) = self._get_vocabs()
        self._vocab4word = Vocabulary()
        self._update_word()
        if self._addTarget2Vocab:
            self._vocab4target = Vocabulary(unknown=None, padding=None)
            self._input_fields.append(self._CONST_TARGET)
            self._update_target()
        self._model = Predictor(ModelLoader().load_pytorch_model(modelFile))

    def _target_token(self, word_token, cont, number="", word=""):
        """Group labelled tokens into spans.

        :param word_token: sequence of predicted label ids, aligned with ``cont``.
        :param cont: sequence of tokens (e.g. the characters of a string).
        :param number: initial value of the label-id accumulator.
        :param word: initial value of the text accumulator.
        :return: dict mapping the concatenated label ids of a span (as a
            string) to the set of texts collected under that key.
        """
        spans = dict()
        collecting = True
        last_idx = len(word_token) - 1
        for idx, (label, token) in enumerate(zip(word_token, cont)):
            if label not in self._target:
                continue
            if collecting:
                number += str(label)
                word += token
                # A falsy (0) label right after this one closes the span.
                if idx < last_idx and not word_token[idx + 1]:
                    collecting = False
            else:
                # First labelled token after a closed span: store the finished
                # span and start a new one with this token.
                # NOTE(review): the label id of this token is not recorded
                # (number restarts empty) — confirm whether that is intended.
                spans.setdefault(number, set()).add(word)
                number = ""
                word = token
                collecting = True
        if number:
            spans.setdefault(number, set()).add(word)
        return spans

    def _extract_ner(self, tokenNum, token, weighted=False):
        """Map one span to its entity type.

        :param tokenNum: string of concatenated label ids for the span.
        :param token: iterable of texts collected for the span.
        :param weighted: weighted voting is not implemented; when true no
            entity is reported.
        :return: ``{entity_type: [texts]}`` for a LOC/PER/ORG label, otherwise
            an empty dict (never None, so callers can iterate the result).
        """
        if weighted:
            return {}
        # NOTE(review): the vote is taken per character of the id string, so
        # label ids of 10 or more would be split into digits — confirm ids
        # stay below 10.
        majority = max(tokenNum, key=tokenNum.count)
        label = self._target.get(int(majority), "")
        for suffix in self._ENTITY_SUFFIXES:
            if label.endswith(suffix):
                return {suffix: list(token)}
        return {}

    def _get_ner(self, tokenNumber, tokenWord):
        """Collect the entities found in one labelled sequence.

        :param tokenNumber: sequence of predicted label ids.
        :param tokenWord: tokens aligned with ``tokenNumber``.
        :return: dict mapping entity type to a list of entity texts.
        """
        entities = dict()
        for label_ids, words in self._target_token(tokenNumber, tokenWord).items():
            # Spans whose id string has length 1 are skipped.
            if len(label_ids) == 1:
                continue
            for kind, found in self._extract_ner(label_ids, words).items():
                entities.setdefault(kind, list()).extend(found)
        return entities

    def _read_vocab(self):
        """Read the vocabulary file and return the evaluated Python object."""
        with open(self._vocabFile, "r", encoding="utf-8") as vocabIn:
            # SECURITY: eval() executes arbitrary code from the file. Only use
            # vocabulary files from a trusted source; consider
            # ast.literal_eval if the file is a plain literal.
            return eval(vocabIn.read())

    def _reverse_dict(self, dic):
        """Return a value -> key mapping; for duplicate values the first key wins."""
        reversed_map = dict()
        for key, value in dic.items():
            reversed_map.setdefault(value, key)
        return reversed_map

    def _tartget_label(self, dic):
        """Return the index -> label mapping of ``dic`` without index 0.

        A missing index 0 is tolerated, so an empty or partial target
        vocabulary does not raise KeyError.
        """
        labels = self._reverse_dict(dic)
        labels.pop(0, None)
        return labels

    def _get_vocabs(self):
        """Load the vocabulary file and split it into its parts.

        :return: tuple ``(word Counter, word vocab, target Counter,
            target vocab, index -> label mapping without index 0)``. Missing
            entries in the file default to empty dicts.
        """
        vocabs = self._read_vocab()
        word_count = vocabs.get("wordsWc", dict())
        wordsVocab = vocabs.get("wordsVocab", dict())
        target_count = vocabs.get("targetWc", dict())
        targetVocab = vocabs.get("targetVocab", dict())
        reverseTargetVocab = self._tartget_label(targetVocab)
        return (
            Counter(word_count),
            wordsVocab,
            Counter(target_count),
            targetVocab,
            reverseTargetVocab,
        )

    def _update_word(self):
        """Feed the loaded word vocabulary and counts into the word Vocabulary."""
        self._vocab4word.update(self._word_vocab)
        self._vocab4word.word_count = self._word_counter

    def _update_target(self):
        """Feed the loaded target vocabulary and counts into the target Vocabulary."""
        self._vocab4target.update(self._target_vocab)
        self._vocab4target.word_count = self._target_counter

    @property
    def model(self):
        """Return the loaded predictor.

        :raises RuntimeError: if no model is available.
        """
        if not self._model:
            # A bare ``raise`` here has no active exception to re-raise and
            # surfaces as an unhelpful RuntimeError; raise it explicitly.
            raise RuntimeError("CustomizedNER has no model loaded")
        return self._model

    def formatRowString(self, msg):
        """Start a fresh DataSet holding ``msg`` split into characters.

        :param msg: raw input string; surrounding whitespace is stripped.
        """
        msg = msg.strip()
        tokenized_char = list(msg)
        self._dataset = DataSet()
        if self._addTarget2Vocab:
            ins = Instance(chars=tokenized_char,
                           raw_chars=tokenized_char,
                           target=list(dict(self._target_vocab).keys()))
        else:
            ins = Instance(chars=tokenized_char, raw_chars=tokenized_char)
        self._dataset.append(ins)

    @property
    def dataset(self):
        """Index the current DataSet and return it.

        ``formatRowString`` must be called first, since that is where
        ``self._dataset`` is created.
        """
        # if input as dict format:
        # data = DataSet({"raw_chars":[msg], "words":[[x for x in msg]], "seq_len":[len(word_list)]})
        # Build the vocabulary from the chars column of this dataset.
        self._vocab4word.from_dataset(self._dataset,
                                      field_name=self._CONST_CHAR)
        # Use the vocabulary to convert the chars column to indices.
        self._vocab4word.index_dataset(self._dataset,
                                       field_name=self._CONST_CHAR,
                                       new_field_name=self._CONST_WORDS)
        if self._addTarget2Vocab:
            self._vocab4target.from_dataset(self._dataset,
                                            field_name=self._CONST_TARGET)
            self._vocab4target.index_dataset(self._dataset,
                                             field_name=self._CONST_TARGET)
        self._dataset.add_seq_len(self._CONST_CHAR)
        self._dataset.set_input(*self._input_fields)
        return self._dataset

    def _content(self):
        """Yield each row of the ``raw_chars`` field joined into one string."""
        for line in self._dataset["raw_chars"].content:
            yield "".join(line)

    def result(self, dataset):
        """Yield the entities predicted for each row of ``dataset``.

        :param dataset: DataSet passed to the model's ``predict``.
        :return: generator of dicts mapping entity type to entity texts.
        """
        predictions = self.model.predict(dataset)["pred"]
        for line, cont in zip(predictions, self._content()):
            # assumes each prediction row holds its label sequence at index 0 — TODO confirm
            yield self._get_ner(line[0].tolist(), cont)