Example #1
    def test_init(self):
        fields = {"x": [1, 2, 3], "y": [4, 5, 6]}
        ins = Instance(x=[1, 2, 3], y=[4, 5, 6])
        self.assertTrue(isinstance(ins.fields, dict))
        self.assertEqual(ins.fields, fields)

        ins = Instance(**fields)
        self.assertEqual(ins.fields, fields)
Example #2
 def convert_for_infer(self, data, vocabs):
     for word_seq in data:
         # list
         x = TextField(word_seq, is_target=False)
         instance = Instance()
         instance.add_field("word_seq", x)
         self.append(instance)
     self.index_field("word_seq", vocabs["word_vocab"])
Example #3
 def test_list_of_numpy_to_tensor(self):
     ds = DataSet([Instance(x=np.array([1, 2]), y=np.array([3, 4])) for _ in range(2)] +
                  [Instance(x=np.array([1, 2, 3, 4]), y=np.array([3, 4, 5, 6])) for _ in range(2)])
     ds.set_input("x")
     ds.set_target("y")
     batch_iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
     for x, y in batch_iter:
         print(x, y)
Example #4
 def formatRowString(self, msg):
     msg = msg.strip()
     tokenized_char = list(msg)  # split the message into characters
     self._dataset = DataSet()
     if self._addTarget2Vocab:
         ins = Instance(chars=tokenized_char,
                        raw_chars=tokenized_char,
                        target=list(dict(self._target_vocab).keys()))
     else:
         ins = Instance(chars=tokenized_char, raw_chars=tokenized_char)
     self._dataset.append(ins)
Example #5
 def test_list_of_list_to_tensor(self):
     ds = DataSet([Instance(x=[1, 2], y=[3, 4]) for _ in range(2)] +
                  [Instance(x=[1, 2, 3, 4], y=[3, 4, 5, 6]) for _ in range(2)])
     ds.set_input("x")
     ds.set_target("y")
     batch_iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
     for x, y in batch_iter:
         self.assertTrue(isinstance(x["x"], torch.Tensor))
         self.assertEqual(tuple(x["x"].shape), (4, 4))
         self.assertTrue(isinstance(y["y"], torch.Tensor))
         self.assertEqual(tuple(y["y"].shape), (4, 4))
Example #6
    def prepare_fake_dataset():
        mean = np.array([-3, -3])
        cov = np.array([[1, 0], [0, 1]])
        class_A = np.random.multivariate_normal(mean, cov, size=(1000,))

        mean = np.array([3, 3])
        cov = np.array([[1, 0], [0, 1]])
        class_B = np.random.multivariate_normal(mean, cov, size=(1000,))

        data_set = DataSet([Instance(x=[float(item[0]), float(item[1])], y=[0.0]) for item in class_A] +
                           [Instance(x=[float(item[0]), float(item[1])], y=[1.0]) for item in class_B])
        return data_set
Example #7
    def convert(self, data):
        """Convert a 3D list to a DataSet object.

        :param data: a 3D list.
            Example::
                [
                    [ [premise_word_11, premise_word_12, ...], [hypothesis_word_11, hypothesis_word_12, ...], [label_1] ],
                    [ [premise_word_21, premise_word_22, ...], [hypothesis_word_21, hypothesis_word_22, ...], [label_2] ],
                    ...
                ]

        :return: A DataSet object.
        """

        data_set = DataSet()

        for example in data:
            p, h, l = example
            # list, list, str
            instance = Instance()
            instance.add_field("premise", p)
            instance.add_field("hypothesis", h)
            instance.add_field("truth", l)
            data_set.append(instance)
        data_set.apply(lambda ins: len(ins["premise"]), new_field_name="premise_len")
        data_set.apply(lambda ins: len(ins["hypothesis"]), new_field_name="hypothesis_len")
        data_set.set_input("premise", "hypothesis", "premise_len", "hypothesis_len")
        data_set.set_target("truth")
        return data_set
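
A minimal usage sketch for the converter above, with toy data in the documented 3D-list layout (`reader` here is a hypothetical object that defines convert()):

    # Hypothetical toy input: [premise words, hypothesis words, label] per example.
    toy_data = [
        [["a", "man", "sleeps"], ["a", "person", "rests"], ["entailment"]],
        [["a", "dog", "runs"], ["a", "cat", "sits"], ["contradiction"]],
    ]
    ds = reader.convert(toy_data)
    print(len(ds))               # 2 instances
    print(ds[0]["premise_len"])  # 3, filled in by the apply() call above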
Example #8
    def test(self):
        data = DataSet()
        for text, label in zip(texts, labels):
            x = TextField(text, is_target=False)
            y = LabelField(label, is_target=True)
            ins = Instance(text=x, label=y)
            data.append(ins)

        # use vocabulary to index data
        data.index_field("text", vocab)

        # define naive sampler for batch class
        class SeqSampler:
            def __call__(self, dataset):
                return list(range(len(dataset)))

        # use batch to iterate dataset
        data_iterator = Batch(data, 2, SeqSampler(), False)
        total_data = 0
        for batch_x, batch_y in data_iterator:
            total_data += batch_x["text"].size(0)
            self.assertTrue(batch_x["text"].size(0) == 2
                            or total_data == len(raw_texts))
            self.assertTrue(isinstance(batch_x, dict))
            self.assertTrue(isinstance(batch_x["text"], torch.LongTensor))
            self.assertTrue(isinstance(batch_y, dict))
            self.assertTrue(isinstance(batch_y["label"], torch.LongTensor))
Example #9
    def test_case_1(self):
        args = {
            "epochs": 3,
            "batch_size": 2,
            "validate": False,
            "use_cuda": False,
            "pickle_path": "./save/",
            "save_best_dev": True,
            "model_name": "default_model_name.pkl",
            "loss": Loss("cross_entropy"),
            "optimizer": Optimizer("Adam", lr=0.001, weight_decay=0),
            "vocab_size": 10,
            "word_emb_dim": 100,
            "rnn_hidden_units": 100,
            "num_classes": 5,
            "evaluator": SeqLabelEvaluator()
        }
        trainer = SeqLabelTrainer(**args)

        train_data = [
            [['a', 'b', 'c', 'd', 'e'], ['a', '@', 'c', 'd', 'e']],
            [['a', '@', 'c', 'd', 'e'], ['a', '@', 'c', 'd', 'e']],
            [['a', 'b', '#', 'd', 'e'], ['a', '@', 'c', 'd', 'e']],
            [['a', 'b', 'c', '?', 'e'], ['a', '@', 'c', 'd', 'e']],
            [['a', 'b', 'c', 'd', '$'], ['a', '@', 'c', 'd', 'e']],
            [['!', 'b', 'c', 'd', 'e'], ['a', '@', 'c', 'd', 'e']],
        ]
        vocab = {
            'a': 0,
            'b': 1,
            'c': 2,
            'd': 3,
            'e': 4,
            '!': 5,
            '@': 6,
            '#': 7,
            '$': 8,
            '?': 9
        }
        label_vocab = {'a': 0, '@': 1, 'c': 2, 'd': 3, 'e': 4}

        data_set = DataSet()
        for example in train_data:
            text, label = example[0], example[1]
            x = TextField(text, False)
            x_len = LabelField(len(text), is_target=False)
            y = TextField(label, is_target=False)
            ins = Instance(word_seq=x, truth=y, word_seq_origin_len=x_len)
            data_set.append(ins)

        data_set.index_field("word_seq", vocab)
        data_set.index_field("truth", label_vocab)

        model = SeqLabeling(args)

        trainer.train(network=model, train_data=data_set, dev_data=data_set)
        # If this can run, everything is OK.

        os.system("rm -rf save")
        print("pickle path deleted")
Example #10
    def __getitem__(self, idx):
        """Fetch Instance(s) at the `idx` position(s) in the dataset.
        Notice: This method returns a copy of the actual instance(s). Any change to the returned value would not modify
        the origin instance(s) of the DataSet.
        If you want to make in-place changes to all Instances, use `apply` method.

        :param idx: can be int or slice.
        :return: If `idx` is int, return an Instance object.
                If `idx` is slice, return a DataSet object.
        """
        if isinstance(idx, int):
            return Instance(**{
                name: self.field_arrays[name][idx]
                for name in self.field_arrays
            })
        elif isinstance(idx, slice):
            if idx.start is not None and (idx.start >= len(self)
                                          or idx.start <= -len(self)):
                raise RuntimeError(
                    f"Start index {idx.start} out of range 0-{len(self)-1}")
            data_set = DataSet()
            for field in self.field_arrays.values():
                data_set.add_field(name=field.name,
                                   fields=field.content[idx],
                                   padding_val=field.padding_val,
                                   is_input=field.is_input,
                                   is_target=field.is_target)
            return data_set
        else:
            raise KeyError(
                "Unrecognized type {} for idx in __getitem__ method".format(
                    type(idx)))
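
A short sketch of the two indexing modes the docstring above describes, using toy data and the same DataSet/Instance API as the other examples:

    ds = DataSet([Instance(x=[1, 2], y=[3, 4]) for _ in range(5)])
    single = ds[0]    # int index -> one Instance (a copy, per the docstring)
    subset = ds[1:3]  # slice -> a new DataSet with two instances
    print(type(single).__name__, len(subset))  # Instance 2
    # ds[10:] would raise RuntimeError: start index out of range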
Example #11
    def load(self, path):
        datalist = []
        with open(path, 'r', encoding='utf-8') as f:
            sample = []
            for line in f:
                if line.startswith('\n'):
                    datalist.append(sample)
                    sample = []
                elif line.startswith('#'):
                    continue
                else:
                    sample.append(line.split('\t'))
            if len(sample) > 0:
                datalist.append(sample)

        data = [self.get_one(sample) for sample in datalist]
        data_list = list(filter(lambda x: x is not None, data))

        ds = DataSet()
        for example in data_list:
            ds.append(
                Instance(words=example[0],
                         pos_tags=example[1],
                         heads=example[2],
                         labels=example[3]))
        return ds
Example #12
    def _load(self, path):
        ds = DataSet()
        for idx, data in _read_conll(path,
                                     indexes=self.indexes,
                                     dropna=self.dropna):
            for i in range(len(self.headers)):
                if data[i][0].startswith('NE-'):
                    data[i] = data[i][1:]
                if 'TOKEN' in data[i][0]:
                    data[i] = data[i][1:]

            doc_start = False
            for i, h in enumerate(self.headers):
                field = data[i]
                if str(' '.join(list(field))).startswith(' #'):
                    continue
                if str(field[0]).startswith('-DOCSTART-'):
                    doc_start = True
                    break
            if doc_start:
                continue
            ins = {h: data[i] for i, h in enumerate(self.headers)}
            ds.append(Instance(**ins))
        if len(ds) == 0:
            raise RuntimeError("No data found {}.".format(path))
        return ds
Example #13
 def _load(self, path: str = None):
     logging.info(path)
     ds = DataSet()
     with open(path, 'r', encoding='utf-8') as f:
         for line in f:
             if not line.strip():
                 continue
             splits = line.strip().split('\t')
             if len(splits) == 4:
                 raw_targets = [int(i) for i in splits[3].strip().lstrip('[').rstrip(']').split(' ')]
             elif len(splits) == 3:
                 raw_targets = [0, 0, 0, 0, 0]
             else:
                 logging.error('data format error')
                 continue
             raw_query = splits[0]
             raw_entity = splits[1]
             left_context = raw_query[0:raw_query.find(raw_entity)]
             right_context = raw_query[raw_query.find(raw_entity) + len(raw_entity):]
             if left_context == '': left_context = '-'
             if right_context == '': right_context = '-'
             raw_entity_label = splits[2]
             if left_context and right_context and raw_entity and raw_entity_label:
                 ds.append(
                     Instance(left_context=tokenize(left_context),
                              right_context=tokenize(right_context),
                              raw_entity=tokenize(raw_entity),
                              raw_entity_label=entity_label_tokenize(raw_entity_label),
                              target=raw_targets))
     return ds
Example #14
 def load(self, filepath, in_word_splitter=None, cut_long_sent=False):
     if in_word_splitter is None:
         in_word_splitter = self.in_word_splitter
     dataset = DataSet()
     with open(filepath, 'r') as f:
         words = []
         for line in f:
             line = line.strip()
             if len(line) == 0:  # a blank line ends the current sentence
                 if len(words) == 0:  # skip consecutive blank lines
                     continue
                 line = ' '.join(words)
                 if cut_long_sent:
                     sents = cut_long_sentence(line)
                 else:
                     sents = [line]
                 for sent in sents:
                     instance = Instance(raw_sentence=sent)
                     dataset.append(instance)
                 words = []
             else:
                 line = line.split()[0]
                 if in_word_splitter is None:
                     words.append(line)
                 else:
                     words.append(line.split(in_word_splitter)[0])
     return dataset
Example #15
 def test_append(self):
     dd = DataSet()
     for _ in range(3):
         dd.append(Instance(x=[1, 2, 3, 4], y=[5, 6]))
     self.assertEqual(len(dd), 3)
     self.assertEqual(dd.field_arrays["x"].content, [[1, 2, 3, 4]] * 3)
     self.assertEqual(dd.field_arrays["y"].content, [[5, 6]] * 3)
Example #16
    def load(self, filepath, in_word_splitter=None, cut_long_sent=False):
        """
        Two input formats are accepted (fields separated by \t or whitespace
        by default)::

            这是 fastNLP , 一个 非常 good 的 包 .

        and::

            也/D  在/P  團員/Na  之中/Ng  ,/COMMACATEGORY

        If in_word_splitter is not None, the second format is assumed: each
        token such as "也/D" is split on the splitter and only the first part
        is kept, e.g. "也/D".split('/')[0].

        :param filepath: path to the corpus file.
        :param in_word_splitter: separator between word and tag, e.g. '/';
            defaults to self.in_word_splitter.
        :param cut_long_sent: whether to split long sentences with cut_long_sentence().
        :return: a DataSet with a `raw_sentence` field.
        """
        if in_word_splitter is None:
            in_word_splitter = self.in_word_splitter
        dataset = DataSet()
        with open(filepath, 'r') as f:
            for line in f:
                line = line.strip()
                if len(line.replace(' ', '')) == 0:  # skip blank lines
                    continue

                if in_word_splitter is not None:
                    words = []
                    for part in line.split():
                        word = part.split(in_word_splitter)[0]
                        words.append(word)
                    line = ' '.join(words)
                if cut_long_sent:
                    sents = cut_long_sentence(line)
                else:
                    sents = [line]
                for sent in sents:
                    instance = Instance(raw_sentence=sent)
                    dataset.append(instance)

        return dataset
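
The splitter behavior described in the docstring can be checked in isolation with plain Python (no fastNLP required):

    line = "也/D  在/P  團員/Na  之中/Ng  ,/COMMACATEGORY"
    words = [part.split('/')[0] for part in line.split()]
    print(' '.join(words))  # 也 在 團員 之中 ,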
Example #17
    def load(self, path, cut_long_sent=False):
        datalist = []
        with open(path, 'r', encoding='utf-8') as f:
            sample = []
            for line in f:
                if line.startswith('\n'):
                    datalist.append(sample)
                    sample = []
                elif line.startswith('#'):
                    continue
                else:
                    sample.append(line.split('\t'))
            if len(sample) > 0:
                datalist.append(sample)

        ds = DataSet()
        for sample in datalist:
            res = self.get_one(sample)
            if res is None:
                continue
            line = '  '.join(res)
            if cut_long_sent:
                sents = cut_long_sentence(line)
            else:
                sents = [line]
            for raw_sentence in sents:
                ds.append(Instance(raw_sentence=raw_sentence))

        return ds
Example #18
    def load(self, path):
        datalist = []
        with open(path, 'r', encoding='utf-8') as f:
            sample = []
            for line in f:
                if line.startswith('\n'):
                    datalist.append(sample)
                    sample = []
                elif line.startswith('#'):
                    continue
                else:
                    sample.append(line.split('\t'))
            if len(sample) > 0:
                datalist.append(sample)

        ds = DataSet(name='conll')
        for sample in datalist:
            res = self.get_one(sample)
            ds.append(
                Instance(word_seq=TextField(res[0], is_target=False),
                         pos_seq=TextField(res[1], is_target=False),
                         head_indices=SeqLabelField(res[2], is_target=True),
                         head_labels=TextField(res[3], is_target=True)))

        return ds
Example #19
 def convert(self, data):
     data_set = DataSet()
     for item in data:
         sent_words = item[0]
         if self.pos is True and self.ner is True:
             instance = Instance(words=sent_words,
                                 pos_tags=item[1],
                                 ner=item[2])
         elif self.pos is True:
             instance = Instance(words=sent_words, pos_tags=item[1])
         elif self.ner is True:
             instance = Instance(words=sent_words, ner=item[1])
         else:
             instance = Instance(words=sent_words)
         data_set.append(instance)
     data_set.apply(lambda ins: len(ins["words"]), new_field_name="seq_len")
     return data_set
Example #20
 def test(self):
     data = DataSet()
     for text in texts:
         x = TextField(text, is_target=False)
         ins = Instance(text=x)
         data.append(ins)
     data_set = create_dataset_from_lists(texts, vocab, has_target=False)
     self.assertTrue(type(data) == type(data_set))
Example #21
    def load(self, path):
        """
        Return a DataSet containing the following fields:
            words: list of str (characters),
            tag: list of str with BMES tags added; e.g. an original tag sequence
                ['VP', 'NN', 'NN', ...] becomes ["S-VP", "B-NN", "M-NN", ...].
        The input is assumed to be in CoNLL format, with sentences separated by
        blank lines and seven columns per line, i.e.
        ::

            1	编者按	编者按	NN	O	11	nmod:topic
            2	:	:	PU	O	11	punct
            3	7月	7月	NT	DATE	4	compound:nn
            4	12日	12日	NT	DATE	11	nmod:tmod
            5	,	,	PU	O	11	punct

            1	这	这	DT	O	3	det
            2	款	款	M	O	1	mark:clf
            3	飞行	飞行	NN	O	8	nsubj
            4	从	从	P	O	5	case
            5	外型	外型	NN	O	8	nmod:prep

        """
        datalist = []
        with open(path, 'r', encoding='utf-8') as f:
            sample = []
            for line in f:
                if line.startswith('\n'):
                    datalist.append(sample)
                    sample = []
                elif line.startswith('#'):
                    continue
                else:
                    sample.append(line.split('\t'))
            if len(sample) > 0:
                datalist.append(sample)

        ds = DataSet()
        for sample in datalist:
            res = self.get_one(sample)
            if res is None:
                continue
            char_seq = []
            pos_seq = []
            for word, tag in zip(res[0], res[1]):
                char_seq.extend(list(word))
                if len(word) == 1:
                    pos_seq.append('S-{}'.format(tag))
                elif len(word) > 1:
                    pos_seq.append('B-{}'.format(tag))
                    for _ in range(len(word) - 2):
                        pos_seq.append('M-{}'.format(tag))
                    pos_seq.append('E-{}'.format(tag))
                else:
                    raise ValueError("Zero length of word detected.")

            ds.append(Instance(words=char_seq, tag=pos_seq))

        return ds
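
The BMES expansion inside the loop above can be written as a small standalone helper; a sketch of the same logic on a single (word, tag) pair:

    def bmes_tags(word, tag):
        # One-character words get S-; longer words get B-, then len(word)-2
        # M- tags, then E-, mirroring the loop in load() above.
        if len(word) == 1:
            return ['S-{}'.format(tag)]
        return (['B-{}'.format(tag)]
                + ['M-{}'.format(tag)] * (len(word) - 2)
                + ['E-{}'.format(tag)])

    print(bmes_tags('编者按', 'NN'))  # ['B-NN', 'M-NN', 'E-NN']
    print(bmes_tags(':', 'PU'))      # ['S-PU']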
Example #22
 def test_init_v1(self):
     ds = DataSet([Instance(x=[1, 2, 3, 4], y=[5, 6])] * 40)
     self.assertTrue("x" in ds.field_arrays and "y" in ds.field_arrays)
     self.assertEqual(ds.field_arrays["x"].content, [
         [1, 2, 3, 4],
     ] * 40)
     self.assertEqual(ds.field_arrays["y"].content, [
         [5, 6],
     ] * 40)
Example #23
 def convert(self, data):
     data_set = DataSet()
     for item in data:
         sent_words, sent_pos_tag = item[0], item[1]
         data_set.append(Instance(words=sent_words, tags=sent_pos_tag))
     data_set.apply(lambda ins: len(ins["words"]), new_field_name="seq_len")
     data_set.set_target("tags")
     data_set.set_input("sent_words")
     data_set.set_input("seq_len")
     return data_set
Example #24
def convert_seq2seq_dataset(data):
    """Convert list of data into DataSet

    :param data: list of list of strings, [num_examples, *].
            ::
            [
                [ [word_11, word_12, ...], [label_11, label_12, ...] ],
                [ [word_21, word_22, ...], [label_21, label_22, ...] ],
                ...
            ]

    :return: a DataSet.
    """
    dataset = DataSet()
    for sample in data:
        word_seq, label_seq = sample[0], sample[1]
        ins = Instance()
        ins.add_field("word_seq", TextField(word_seq, is_target=False)) \
            .add_field("label_seq", TextField(label_seq, is_target=True))
        dataset.append(ins)
    return dataset
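
A usage sketch with toy data in the documented layout (this assumes, as the function body does, that Instance.add_field returns the instance so calls can be chained):

    pairs = [
        [["I", "like", "it"], ["O", "O", "O"]],
        [["New", "York"], ["B-LOC", "I-LOC"]],
    ]
    ds = convert_seq2seq_dataset(pairs)
    print(len(ds))  # 2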
Example #25
def construct_dataset(sentences):
    """Construct a data set from a list of sentences.

    :param sentences: list of list of str
    :return dataset: a DataSet object
    """
    dataset = DataSet()
    for sentence in sentences:
        instance = Instance()
        instance['raw_sentence'] = sentence
        dataset.append(instance)
    return dataset
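
A usage sketch matching the docstring's list-of-list-of-str input (relying on the item-assignment Instance API used above):

    dataset = construct_dataset([
        ["the", "quick", "brown", "fox"],
        ["jumps", "over", "the", "lazy", "dog"],
    ])
    print(len(dataset))                # 2
    print(dataset[0]["raw_sentence"])  # ['the', 'quick', 'brown', 'fox']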
Example #26
 def convert_with_vocabs(self, data, vocabs):
     for example in data:
         word_seq, label_seq = example[0], example[1]
         # list, list
         x = TextField(word_seq, is_target=False)
         x_len = LabelField(len(word_seq), is_target=False)
         y = TextField(label_seq, is_target=False)
         instance = Instance()
         instance.add_field("word_seq", x)
         instance.add_field("truth", y)
         instance.add_field("word_seq_origin_len", x_len)
         self.append(instance)
     self.index_field("word_seq", vocabs["word_vocab"])
     self.index_field("truth", vocabs["label_vocab"])
Example #27
    def convert(self, parsed_data):
        dataset = DataSet()
        for sample in parsed_data:
            label0_list = list(map(lambda labels: labels[0], sample[1]))
            label1_list = list(map(lambda labels: labels[1], sample[1]))
            label2_list = list(map(lambda labels: labels[2], sample[1]))
            dataset.append(
                Instance(token_list=sample[0],
                         label0_list=label0_list,
                         label1_list=label1_list,
                         label2_list=label2_list))

        return dataset
Example #28
def convert(data):
    dataset = DataSet()
    for sample in data:
        word_seq = [BOS] + sample['words']
        pos_seq = [BOS] + sample['pos_tags']
        heads = [0] + sample['heads']
        head_tags = [BOS] + sample['labels']
        dataset.append(
            Instance(raw_words=word_seq,
                     pos=pos_seq,
                     gold_heads=heads,
                     arc_true=heads,
                     tags=head_tags))
    return dataset
Example #29
def convert(data):
    dataset = DataSet()
    for sample in data:
        word_seq = [BOS] + sample[0]
        pos_seq = [BOS] + sample[1]
        heads = [0] + list(map(int, sample[2]))
        head_tags = [BOS] + sample[3]
        dataset.append(
            Instance(words=word_seq,
                     pos=pos_seq,
                     gold_heads=heads,
                     arc_true=heads,
                     tags=head_tags))
    return dataset
Example #30
 def convert(self, data):
     dataset = DataSet()
     for sample in data:
         word_seq = [BOS] + sample[0] + [EOS]
         pos_seq = [BOS] + sample[1] + [EOS]
         heads = [0] + list(map(int, sample[2])) + [0]
         head_tags = [BOS] + sample[3] + [EOS]
         dataset.append(
             Instance(word_seq=TextField(word_seq, is_target=False),
                      pos_seq=TextField(pos_seq, is_target=False),
                      gold_heads=SeqLabelField(heads, is_target=False),
                      head_indices=SeqLabelField(heads, is_target=True),
                      head_labels=TextField(head_tags, is_target=True)))
     return dataset