Code example #1
File: cws_reader.py  Project: zhangxt/fastNLP
    def load(self, path, cut_long_sent=False):
        datalist = []
        with open(path, 'r', encoding='utf-8') as f:
            sample = []
            for line in f:
                if line.startswith('\n'):  # blank line marks the end of a sample
                    datalist.append(sample)
                    sample = []
                elif line.startswith('#'):  # skip comment lines
                    continue
                else:
                    sample.append(line.split('\t'))
            if len(sample) > 0:  # flush the last sample if the file does not end with a blank line
                datalist.append(sample)

        ds = DataSet()
        for sample in datalist:
            # print(sample)
            res = self.get_one(sample)
            if res is None:
                continue
            line = '  '.join(res)
            if cut_long_sent:
                sents = cut_long_sentence(line)
            else:
                sents = [line]
            for raw_sentence in sents:
                ds.append(Instance(raw_sentence=raw_sentence))

        return ds
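For context, a minimal sketch of the input this loader expects: samples are separated by blank lines, lines starting with '#' are treated as comments, and every other line is split on tabs (the sample rows below are taken from the CoNLL excerpt shown in code example #20):

# Each non-comment line becomes one tab-split row of the current sample;
# a blank line closes the sample and starts a new one:
#
#     1	编者按	编者按	NN	O	11	nmod:topic
#     2	:	:	PU	O	11	punct
#     (blank line)
#     1	这	这	DT	O	3	det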
Code example #2
    def load(self, path):
        datalist = []
        with open(path, 'r', encoding='utf-8') as f:
            sample = []
            for line in f:
                if line.startswith('\n'):
                    datalist.append(sample)
                    sample = []
                elif line.startswith('#'):
                    continue
                else:
                    sample.append(line.split('\t'))
            if len(sample) > 0:
                datalist.append(sample)

        ds = DataSet(name='conll')
        for sample in datalist:
            # print(sample)
            res = self.get_one(sample)
            ds.append(
                Instance(word_seq=TextField(res[0], is_target=False),
                         pos_seq=TextField(res[1], is_target=False),
                         head_indices=SeqLabelField(res[2], is_target=True),
                         head_labels=TextField(res[3], is_target=True)))

        return ds
Code example #3
 def test_add_field(self):
     ds = DataSet({"x": [3, 4]})
     ds.add_field('y', [['hello', 'world'], ['this', 'is', 'a', 'test']],
                  is_input=True,
                  is_target=True)
     # ds.apply(lambda x:[x['x']]*3, is_input=True, is_target=True, new_field_name='y')
     print(ds)
Code example #4
File: dataset_loader.py  Project: wzhystar/fastNLP
    def load(self, path):
        datalist = []
        with open(path, 'r', encoding='utf-8') as f:
            sample = []
            for line in f:
                if line.startswith('\n'):
                    datalist.append(sample)
                    sample = []
                elif line.startswith('#'):
                    continue
                else:
                    sample.append(line.split('\t'))
            if len(sample) > 0:
                datalist.append(sample)

        data = [self.get_one(sample) for sample in datalist]
        data_list = list(filter(lambda x: x is not None, data))

        ds = DataSet()
        for example in data_list:
            ds.append(
                Instance(words=example[0],
                         pos_tags=example[1],
                         heads=example[2],
                         labels=example[3]))
        return ds
Code example #5
    def test_save_load(self):
        ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
        ds.save("./my_ds.pkl")
        self.assertTrue(os.path.exists("./my_ds.pkl"))

        ds_1 = DataSet.load("./my_ds.pkl")
        os.remove("my_ds.pkl")
Code example #6
 def test_init_assert(self):
     with self.assertRaises(AssertionError):
         _ = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 100})
     with self.assertRaises(AssertionError):
         _ = DataSet([[1, 2, 3, 4]] * 10)
     with self.assertRaises(ValueError):
         _ = DataSet(0.00001)
Code example #7
 def test_append(self):
     dd = DataSet()
     for _ in range(3):
         dd.append(Instance(x=[1, 2, 3, 4], y=[5, 6]))
     self.assertEqual(len(dd), 3)
     self.assertEqual(dd.field_arrays["x"].content, [[1, 2, 3, 4]] * 3)
     self.assertEqual(dd.field_arrays["y"].content, [[5, 6]] * 3)
Code example #8
File: cws_reader.py  Project: zhangxt/fastNLP
 def load(self, filepath, in_word_splitter=None, cut_long_sent=False):
     if in_word_splitter is None:
         in_word_splitter = self.in_word_splitter
     dataset = DataSet()
     with open(filepath, 'r') as f:
         words = []
         for line in f:
             line = line.strip()
             if len(line) == 0:  # blank line: end of the current sentence
                 if len(words) == 0:  # empty samples are not accepted
                     continue
                 line = ' '.join(words)
                 if cut_long_sent:
                     sents = cut_long_sentence(line)
                 else:
                     sents = [line]
                 for sent in sents:
                     instance = Instance(raw_sentence=sent)
                     dataset.append(instance)
                 words = []
             else:
                 line = line.split()[0]
                 if in_word_splitter is None:
                     words.append(line)
                 else:
                     words.append(line.split(in_word_splitter)[0])
     return dataset
Code example #9
File: cws_reader.py  Project: zhangxt/fastNLP
    def load(self, filepath, in_word_splitter=None, cut_long_sent=False):
        """
        允许使用的情况有(默认以\t或空格作为seg)
            这是 fastNLP , 一个 非常 good 的 包 .
        和
            也/D  在/P  團員/Na  之中/Ng  ,/COMMACATEGORY
        如果splitter不为None则认为是第二种情况, 且我们会按splitter分割"也/D", 然后取第一部分. 例如"也/D".split('/')[0]
        :param filepath:
        :param in_word_splitter:
        :return:
        """
        if in_word_splitter is None:
            in_word_splitter = self.in_word_splitter
        dataset = DataSet()
        with open(filepath, 'r') as f:
            for line in f:
                line = line.strip()
                if len(line.replace(' ', '')) == 0:  # skip empty lines
                    continue

                if in_word_splitter is not None:
                    words = []
                    for part in line.split():
                        word = part.split(in_word_splitter)[0]
                        words.append(word)
                    line = ' '.join(words)
                if cut_long_sent:
                    sents = cut_long_sentence(line)
                else:
                    sents = [line]
                for sent in sents:
                    instance = Instance(raw_sentence=sent)
                    dataset.append(instance)

        return dataset
Code example #10
    def convert(self, data):
        """Convert a 3D list to a DataSet object.

        :param data: A 3D tensor.
            [
                [ [premise_word_11, premise_word_12, ...], [hypothesis_word_11, hypothesis_word_12, ...], [label_1] ],
                [ [premise_word_21, premise_word_22, ...], [hypothesis_word_21, hypothesis_word_22, ...], [label_2] ],
                ...
            ]
        :return: data_set: A DataSet object.
        """

        data_set = DataSet()

        for example in data:
            p, h, l = example
            # list, list, str
            x1 = TextField(p, is_target=False)
            x2 = TextField(h, is_target=False)
            x1_len = TextField([1] * len(p), is_target=False)
            x2_len = TextField([1] * len(h), is_target=False)
            y = LabelField(l, is_target=True)
            instance = Instance()
            instance.add_field("premise", x1)
            instance.add_field("hypothesis", x2)
            instance.add_field("premise_len", x1_len)
            instance.add_field("hypothesis_len", x2_len)
            instance.add_field("truth", y)
            data_set.append(instance)

        return data_set
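A minimal, hypothetical usage sketch for the 3D-list format described in the docstring above (the premise/hypothesis tokens and labels are made up for illustration; `loader` stands for an instance of the class this method belongs to):

# Each entry is [premise tokens, hypothesis tokens, [label]].
sample_data = [
    [["a", "man", "is", "running"], ["a", "person", "is", "moving"], ["entailment"]],
    [["a", "man", "is", "running"], ["nobody", "is", "moving"], ["contradiction"]],
]
# ds = loader.convert(sample_data)
# Each Instance in ds then carries premise, hypothesis, premise_len, hypothesis_len and truth fields.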
Code example #11
 def test_drop(self):
     ds = DataSet({
         "x": [[1, 2, 3, 4]] * 40,
         "y": [[5, 6], [7, 8, 9, 0]] * 20
     })
     ds.drop(lambda ins: len(ins["y"]) < 3)
     self.assertEqual(len(ds), 20)
Code example #12
    def predict(self, content):
        """

        :param content: list of list of str. Each string is a token(word).
        :return answer: list of list of str. Each string is a tag.
        """
        if not hasattr(self, 'pipeline'):
            raise ValueError("You have to load model first.")

        sentence_list = []
        # 1. check the type of content
        if isinstance(content, str):
            sentence_list.append(content)
        elif isinstance(content, list):
            sentence_list = content

        # 2. build the dataset
        dataset = DataSet()
        dataset.add_field('words', sentence_list)

        # 3. run the pipeline
        self.pipeline(dataset)

        output = dataset['word_pos_output'].content
        if isinstance(content, str):
            return output[0]
        elif isinstance(content, list):
            return output
Code example #13
File: load_data.py  Project: SmallStom/NER
 def _load(self, path: str = None):
     logging.info(path)
     ds = DataSet()
     with open(path, 'r', encoding='utf-8') as f:
         for line in f:
             if line.strip() == '': continue  # skip blank lines (lines read from the file keep their trailing '\n')
             splits = line.strip().split('\t')
             if len(splits) == 4:
                 raw_targets = [int(i) for i in splits[3].strip().lstrip('[').rstrip(']').split(' ')]
             elif len(splits) == 3:
                 raw_targets = [0, 0, 0, 0, 0]
             else:
                 logging.error('data format error')
             raw_query = splits[0]
             raw_entity = splits[1]
             left_context = raw_query[0:raw_query.find(raw_entity)]
             right_context = raw_query[raw_query.find(raw_entity) + len(raw_entity):]
             if left_context == '': left_context = '-'
             if right_context == '': right_context = '-'
             raw_entity_label = splits[2]
             if left_context and right_context and raw_entity and raw_entity_label:
                 ds.append(
                     Instance(left_context=tokenize(left_context),
                              right_context=tokenize(right_context),
                              raw_entity=tokenize(raw_entity),
                              raw_entity_label=entity_label_tokenize(raw_entity_label),
                              target=raw_targets))
     return ds
Code example #14
    def _load(self, path):
        ds = DataSet()
        for idx, data in _read_conll(path,
                                     indexes=self.indexes,
                                     dropna=self.dropna):
            #            if data[0][0] == '#':
            #                data[0] = data[0][1:]
            #                data[1] = data[1][1:]
            for i in range(len(self.headers)):
                if data[i][0].startswith('NE-'):
                    data[i] = data[i][1:]
                if 'TOKEN' in data[i][0]:
                    data[i] = data[i][1:]

            # print(data) #data[1] = iob(list(data[1]))
            doc_start = False
            for i, h in enumerate(self.headers):
                field = data[i]
                if str(' '.join(list(field))).startswith(' #'):
                    continue
                if str(field[0]).startswith('-DOCSTART-'):
                    doc_start = True
                    break
            if doc_start:
                continue
            ins = {h: data[i] for i, h in enumerate(self.headers)}
            ds.append(Instance(**ins))
        if len(ds) == 0:
            raise RuntimeError("No data found {}.".format(path))
        return ds
Code example #15
    def predict(self, content):
        """

        :param content: list of list of str. Each string is a token(word).
        :return answer: list of list of str. Each string is a tag.
        """
        if not hasattr(self, "pipeline"):
            raise ValueError("You have to load model first.")

        sentence_list = content
        # 1. check the type of each sentence
        for sentence in sentence_list:
            if not all((type(obj) == str for obj in sentence)):
                raise ValueError("Input must be list of list of string.")

        # 2. build the dataset
        dataset = DataSet()
        dataset.add_field("words", sentence_list)

        # 3. run the pipeline
        self.pipeline(dataset)

        def merge_tag(words_list, tags_list):
            rtn = []
            for words, tags in zip(words_list, tags_list):
                rtn.append([w + "/" + t for w, t in zip(words, tags)])
            return rtn

        output = dataset.field_arrays["tag"].content
        if isinstance(content, str):
            return output[0]
        elif isinstance(content, list):
            return merge_tag(content, output)
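A small illustration of what `merge_tag` above produces (the words and tags are made up):

# merge_tag([["我", "爱", "北京"]], [["PN", "VV", "NR"]])
# -> [["我/PN", "爱/VV", "北京/NR"]]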
Code example #16
    def predict(self, content):
        """
        分词接口。

        :param content: str或List[str], 例如: "中文分词很重要!", 返回的结果是"中文 分词 很 重要 !"。 如果传入的为List[str],比如
            [ "中文分词很重要!", ...], 返回的结果["中文 分词 很 重要 !", ...]。
        :return: str或List[str], 根据输入的的类型决定。
        """
        if not hasattr(self, 'pipeline'):
            raise ValueError("You have to load model first.")

        sentence_list = []
        # 1. check the type of content
        if isinstance(content, str):
            sentence_list.append(content)
        elif isinstance(content, list):
            sentence_list = content

        # 2. build the dataset
        dataset = DataSet()
        dataset.add_field('raw_sentence', sentence_list)

        # 3. run the pipeline
        self.pipeline(dataset)

        output = dataset.get_field('output').content
        if isinstance(content, str):
            return output[0]
        elif isinstance(content, list):
            return output
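A hedged usage sketch of the interface documented above (how the model/pipeline is loaded is not shown in this snippet, so the object name `cws` is hypothetical):

# cws = ...  # hypothetical: obtain an object whose pipeline has already been loaded
# cws.predict("中文分词很重要!")             # -> "中文 分词 很 重要 !"
# cws.predict(["中文分词很重要!", ...])      # -> ["中文 分词 很 重要 !", ...]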
Code example #17
 def test(self):
     data = DataSet()
     for text in texts:
         x = TextField(text, is_target=False)
         ins = Instance(text=x)
         data.append(ins)
     data_set = create_dataset_from_lists(texts, vocab, has_target=False)
     self.assertTrue(type(data) == type(data_set))
Code example #18
    def test_get_item_error(self):
        with self.assertRaises(RuntimeError):
            ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
            _ = ds[40:]

        with self.assertRaises(KeyError):
            ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
            _ = ds["kom"]
Code example #19
 def test_get_field(self):
     ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
     ans = ds.get_field("x")
     self.assertTrue(isinstance(ans, FieldArray))
     self.assertEqual(ans.content, [[1, 2, 3, 4]] * 10)
     ans = ds.get_field("y")
     self.assertTrue(isinstance(ans, FieldArray))
     self.assertEqual(ans.content, [[5, 6]] * 10)
Code example #20
File: dataset_loader.py  Project: wzhystar/fastNLP
    def load(self, path):
        """
        The returned DataSet contains the following fields:
            words: list of str,
            tag: list of str with BMES tags added; e.g. an original tag sequence ['VP', 'NN', 'NN', ...] becomes ["S-VP", "B-NN", "M-NN", ...]
        The input is assumed to be in CoNLL format: sentences are separated by a blank line and each line has 7 columns, i.e.
        ::

            1	编者按	编者按	NN	O	11	nmod:topic
            2	:	:	PU	O	11	punct
            3	7月	7月	NT	DATE	4	compound:nn
            4	12日	12日	NT	DATE	11	nmod:tmod
            5	,	,	PU	O	11	punct

            1	这	这	DT	O	3	det
            2	款	款	M	O	1	mark:clf
            3	飞行	飞行	NN	O	8	nsubj
            4	从	从	P	O	5	case
            5	外型	外型	NN	O	8	nmod:prep

        """
        datalist = []
        with open(path, 'r', encoding='utf-8') as f:
            sample = []
            for line in f:
                if line.startswith('\n'):
                    datalist.append(sample)
                    sample = []
                elif line.startswith('#'):
                    continue
                else:
                    sample.append(line.split('\t'))
            if len(sample) > 0:
                datalist.append(sample)

        ds = DataSet()
        for sample in datalist:
            # print(sample)
            res = self.get_one(sample)
            if res is None:
                continue
            char_seq = []
            pos_seq = []
            for word, tag in zip(res[0], res[1]):
                char_seq.extend(list(word))
                if len(word) == 1:
                    pos_seq.append('S-{}'.format(tag))
                elif len(word) > 1:
                    pos_seq.append('B-{}'.format(tag))
                    for _ in range(len(word) - 2):
                        pos_seq.append('M-{}'.format(tag))
                    pos_seq.append('E-{}'.format(tag))
                else:
                    raise ValueError("Zero length of word detected.")

            ds.append(Instance(words=char_seq, tag=pos_seq))

        return ds
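As a standalone illustration of the word-to-character BMES expansion performed in the loop above (a sketch; `to_char_bmes` is not a fastNLP function):

def to_char_bmes(word, tag):
    # Single-character word -> S- tag; otherwise B-, then M- for the middle characters, then E-.
    if len(word) == 1:
        return list(word), ['S-{}'.format(tag)]
    return list(word), (['B-{}'.format(tag)]
                        + ['M-{}'.format(tag)] * (len(word) - 2)
                        + ['E-{}'.format(tag)])

# to_char_bmes('编者按', 'NN') -> (['编', '者', '按'], ['B-NN', 'M-NN', 'E-NN'])
# to_char_bmes(':', 'PU')     -> ([':'], ['S-PU'])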
Code example #21
    def test_case_TokenizeDatasetLoader(self):
        loader = TokenizeDataSetLoader()
        filepath = "./test/data_for_tests/cws_pku_utf_8"
        data = loader.load(filepath, max_seq_len=32)
        assert len(data) > 0

        data1 = DataSet()
        data1.read_tokenize(filepath, max_seq_len=32)
        assert len(data1) > 0
        print("pass TokenizeDataSetLoader test!")
Code example #22
 def formatRowString(self, msg):
     msg = msg.strip()
     tokenized_char = [x for x in msg]
     self._dataset = DataSet()
     if self._addTarget2Vocab:
         ins = Instance(chars=tokenized_char,
                        raw_chars=tokenized_char,
                        target=list(dict(self._target_vocab).keys()))
     else:
         ins = Instance(chars=tokenized_char, raw_chars=tokenized_char)
     self._dataset.append(ins)
Code example #23
    def test_input_target(self):
        ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
        ds.set_input("x")
        ds.set_target("y")
        self.assertTrue(ds.field_arrays["x"].is_input)
        self.assertTrue(ds.field_arrays["y"].is_target)

        with self.assertRaises(KeyError):
            ds.set_input("xxx")
        with self.assertRaises(KeyError):
            ds.set_input("yyy")
Code example #24
    def test_add_append(self):
        dd = DataSet()
        dd.add_field("x", [[1, 2, 3]] * 10)
        dd.add_field("y", [[1, 2, 3, 4]] * 10)
        dd.add_field("z", [[5, 6]] * 10)
        self.assertEqual(len(dd), 10)
        self.assertEqual(dd.field_arrays["x"].content, [[1, 2, 3]] * 10)
        self.assertEqual(dd.field_arrays["y"].content, [[1, 2, 3, 4]] * 10)
        self.assertEqual(dd.field_arrays["z"].content, [[5, 6]] * 10)

        with self.assertRaises(RuntimeError):
            dd.add_field("??", [[1, 2]] * 40)
Code example #25
File: dataset_loader.py  Project: huziye/fastNLP_fork
    def convert(self, parsed_data):
        dataset = DataSet()
        for sample in parsed_data:
            label0_list = list(map(lambda labels: labels[0], sample[1]))
            label1_list = list(map(lambda labels: labels[1], sample[1]))
            label2_list = list(map(lambda labels: labels[2], sample[1]))
            dataset.append(
                Instance(token_list=sample[0],
                         label0_list=label0_list,
                         label1_list=label1_list,
                         label2_list=label2_list))

        return dataset
Code example #26
 def test_delete_field(self):
     dd = DataSet()
     dd.add_field("x", [[1, 2, 3]] * 10)
     dd.add_field("y", [[1, 2, 3, 4]] * 10)
     dd.delete_field("x")
     self.assertFalse("x" in dd.field_arrays)
     self.assertTrue("y" in dd.field_arrays)
Code example #27
File: test_trainer.py  Project: yhcc/fastNLP
    def test_case_1(self):
        args = {
            "epochs": 3,
            "batch_size": 2,
            "validate": False,
            "use_cuda": False,
            "pickle_path": "./save/",
            "save_best_dev": True,
            "model_name": "default_model_name.pkl",
            "loss": Loss("cross_entropy"),
            "optimizer": Optimizer("Adam", lr=0.001, weight_decay=0),
            "vocab_size": 10,
            "word_emb_dim": 100,
            "rnn_hidden_units": 100,
            "num_classes": 5,
            "evaluator": SeqLabelEvaluator()
        }
        trainer = SeqLabelTrainer(**args)

        train_data = [
            [['a', 'b', 'c', 'd', 'e'], ['a', '@', 'c', 'd', 'e']],
            [['a', '@', 'c', 'd', 'e'], ['a', '@', 'c', 'd', 'e']],
            [['a', 'b', '#', 'd', 'e'], ['a', '@', 'c', 'd', 'e']],
            [['a', 'b', 'c', '?', 'e'], ['a', '@', 'c', 'd', 'e']],
            [['a', 'b', 'c', 'd', '$'], ['a', '@', 'c', 'd', 'e']],
            [['!', 'b', 'c', 'd', 'e'], ['a', '@', 'c', 'd', 'e']],
        ]
        vocab = {
            'a': 0,
            'b': 1,
            'c': 2,
            'd': 3,
            'e': 4,
            '!': 5,
            '@': 6,
            '#': 7,
            '$': 8,
            '?': 9
        }
        label_vocab = {'a': 0, '@': 1, 'c': 2, 'd': 3, 'e': 4}

        data_set = DataSet()
        for example in train_data:
            text, label = example[0], example[1]
            x = TextField(text, False)
            x_len = LabelField(len(text), is_target=False)
            y = TextField(label, is_target=False)
            ins = Instance(word_seq=x, truth=y, word_seq_origin_len=x_len)
            data_set.append(ins)

        data_set.index_field("word_seq", vocab)
        data_set.index_field("truth", label_vocab)

        model = SeqLabeling(args)

        trainer.train(network=model, train_data=data_set, dev_data=data_set)
        # If this can run, everything is OK.

        os.system("rm -rf save")
        print("pickle path deleted")
Code example #28
    def test_reader(self):
        # these just need to run without errors
        ds = DataSet().read_naive(
            "test/data_for_tests/tutorial_sample_dataset.csv")
        self.assertTrue(isinstance(ds, DataSet))
        self.assertTrue(len(ds) > 0)

        ds = DataSet().read_rawdata("test/data_for_tests/people_daily_raw.txt")
        self.assertTrue(isinstance(ds, DataSet))
        self.assertTrue(len(ds) > 0)

        ds = DataSet().read_pos("test/data_for_tests/people.txt")
        self.assertTrue(isinstance(ds, DataSet))
        self.assertTrue(len(ds) > 0)
Code example #29
File: run.py  Project: huziye/fastNLP_fork
def convert(data):
    dataset = DataSet()
    for sample in data:
        word_seq = [BOS] + sample[0]
        pos_seq = [BOS] + sample[1]
        heads = [0] + list(map(int, sample[2]))
        head_tags = [BOS] + sample[3]
        dataset.append(
            Instance(words=word_seq,
                     pos=pos_seq,
                     gold_heads=heads,
                     arc_true=heads,
                     tags=head_tags))
    return dataset
Code example #30
 def convert(self, data):
     dataset = DataSet()
     for sample in data:
         word_seq = [BOS] + sample[0] + [EOS]
         pos_seq = [BOS] + sample[1] + [EOS]
         heads = [0] + list(map(int, sample[2])) + [0]
         head_tags = [BOS] + sample[3] + [EOS]
         dataset.append(
             Instance(word_seq=TextField(word_seq, is_target=False),
                      pos_seq=TextField(pos_seq, is_target=False),
                      gold_heads=SeqLabelField(heads, is_target=False),
                      head_indices=SeqLabelField(heads, is_target=True),
                      head_labels=TextField(head_tags, is_target=True)))
     return dataset