Example No. 1
    def process(self, data_bundle: DataBundle) -> DataBundle:

        for name, dataset in data_bundle.datasets.items():
            dataset.apply_field(self.convert_tag,
                                field_name=Const.TARGET,
                                new_field_name=Const.TARGET)

        _add_words_field(data_bundle, lower=self.lower)

        if self.word_shape:
            data_bundle.apply_field(word_shape,
                                    field_name='raw_words',
                                    new_field_name='word_shapes')
            data_bundle.set_input('word_shapes')

        data_bundle.apply_field(lambda chars: [
            ''.join(['0' if c.isdigit() else c for c in char])
            for char in chars
        ],
                                field_name=Const.INPUT,
                                new_field_name=Const.INPUT)

        _indexize(data_bundle,
                  target_field_names=['target'],
                  vocabulary=self.vocabulary)
        input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN]
        target_fields = [Const.TARGET, Const.INPUT_LEN]

        for name, dataset in data_bundle.datasets.items():
            dataset.add_seq_len(Const.INPUT)

        data_bundle.set_input(*input_fields)
        data_bundle.set_target(*target_fields)
        return data_bundle
Example No. 2
    def load(self, folder):
        fns = {
            'dev':'{}_dev.csv'.format(self.lg1_lg2),
            'test':'{}_test500.csv'.format(self.lg1_lg2),
            'train': '{}_train500_10.csv'.format(self.lg1_lg2)
        }
        target_lg = self.lg1_lg2.split('_')[0]
        data_bundle = DataBundle()
        for name, fn in fns.items():
            path = os.path.join(folder, fn)
            ds = DataSet()
            with open(path, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if line:
                        parts = line.split('\t')
                        if self.lower:
                            ins = Instance(word=parts[1].lower(), definition=parts[-1].lower())
                        else:
                            ins = Instance(word=parts[1], definition=parts[-1])
                        ds.append(ins)
            data_bundle.set_dataset(ds, name=name)
        target_words = {}
        with open(os.path.join(folder, '{}.txt'.format(target_lg)), encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    if self.lower:
                        line = line.lower()
                    target_words[line] = 1
        target_words = list(target_words.keys())

        setattr(data_bundle, 'target_words', target_words)
        return data_bundle
Example No. 3
    def process(self, data_bundle: DataBundle):
        _add_chars_field(data_bundle, lower=False)

        data_bundle.apply_field(self.encoding_func, field_name=Const.TARGET, new_field_name=Const.TARGET)

        # convert all digits to 0
        data_bundle.apply_field(lambda chars:[''.join(['0' if c.isdigit() else c for c in char]) for char in chars],
            field_name=Const.CHAR_INPUT, new_field_name=Const.CHAR_INPUT)

        #
        input_field_names = [Const.CHAR_INPUT]
        if self.bigrams:
            data_bundle.apply_field(lambda chars:[c1+c2 for c1,c2 in zip(chars, chars[1:]+['<eos>'])],
                                    field_name=Const.CHAR_INPUT, new_field_name='bigrams')
            input_field_names.append('bigrams')

        # index
        _indexize(data_bundle, input_field_names=input_field_names, target_field_names=Const.TARGET)

        input_fields = [Const.TARGET, Const.INPUT_LEN] + input_field_names
        target_fields = [Const.TARGET, Const.INPUT_LEN]

        for name, dataset in data_bundle.datasets.items():
            dataset.add_seq_len(Const.CHAR_INPUT)

        data_bundle.set_input(*input_fields)
        data_bundle.set_target(*target_fields)

        return data_bundle
Example No. 4
 def process(self, data_bundle: DataBundle) -> DataBundle:
     '''
     :param data_bundle: each dataset in the DataBundle has the columns raw_words|index|target|comment -> 'raw_words' is used to add words_bert_ids|e1b|e1e|e2b|e2e, and the is_input/is_target attributes of those columns still need to be set
     :return: the DataBundle with its datasets extended in place
     '''
     for name,dataset in data_bundle.datasets.items():
         dataset.apply_field_more(func=self.raw_words2words_func,field_name='raw_words',modify_fields=True)
     data_bundle.set_input('words_bert_ids','e1b','e1e','e2b','e2e')
     data_bundle.set_target('target')
     return data_bundle
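
To make the column expansion above concrete, here is a minimal, self-contained sketch of the apply_field_more pattern. raw_words2words_func is not shown in the snippet, so toy_raw_words2words below is a hypothetical stand-in that simply returns a dict with the expected new fields; with modify_fields=True, every key of that dict becomes a column of the dataset.

from fastNLP import DataSet, Instance
from fastNLP.io import DataBundle

def toy_raw_words2words(raw_words):
    # hypothetical stand-in for self.raw_words2words_func: fake "BERT ids" plus
    # dummy entity-span boundaries e1b/e1e/e2b/e2e
    words_bert_ids = [len(w) for w in raw_words.split()]
    return {'words_bert_ids': words_bert_ids, 'e1b': 0, 'e1e': 1, 'e2b': 2, 'e2e': 3}

ds = DataSet()
ds.append(Instance(raw_words='John met Mary in Paris', index=0, target=1, comment=''))
bundle = DataBundle(datasets={'train': ds})

for name, dataset in bundle.datasets.items():
    # each key of the returned dict is written back as a field of the dataset
    dataset.apply_field_more(func=toy_raw_words2words, field_name='raw_words', modify_fields=True)
bundle.set_input('words_bert_ids', 'e1b', 'e1e', 'e2b', 'e2e')
bundle.set_target('target')
print(bundle.get_dataset('train'))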
Example No. 5
    def load(self, folder):
        data_bundle = DataBundle()
        fns = {
            'dev': '{}_dev.csv',
            # 'test':'{}_test500.csv'.format(self.lg1_lg2),
            'train': '{}_train500_10.csv'
        }
        words = {}
        for lg in ['en', 'es', 'fr']:
            for name, fn in fns.items():
                path = os.path.join(folder, fn.format(lg))
                ds = read_dataset(path, self.lower, 0)
                data_bundle.set_dataset(ds, name=f'{lg}_{name}')
            target_words = {}
            with open(os.path.join(folder, '{}.txt'.format(lg)),
                      encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if line:
                        if self.lower:
                            line = line.lower()
                        target_words[line] = 1
            target_words = list(target_words.keys())
            words[lg] = target_words
        setattr(data_bundle, 'target_words_dict', words)

        for bi in ['en_fr', 'fr_en', 'en_es', 'es_en']:
            path = os.path.join(folder, '{}_test500.csv'.format(bi))
            ds = read_dataset(path, self.lower, 1)
            data_bundle.set_dataset(ds, '{}_test'.format(bi))

        return data_bundle
Example No. 6
    def process(self, data_bundle: DataBundle) -> DataBundle:
        """
        The supported fields of the DataSet are:

        .. csv-table::
           :header: "raw_words", "target"

           "[Nadim, Ladki]", "[B-PER, I-PER]"
           "[AL-AIN, United, Arab, ...]", "[B-LOC, B-LOC, I-LOC, ...]"
           "[...]", "[...]"

        :param ~fastNLP.DataBundle data_bundle: the DataSets in the given DataBundle must contain the two fields raw_words and ner, and the content of both fields must be List[str]; the DataBundle is modified in place.
        :return DataBundle:
        """
        # convert the tags
        for name, dataset in data_bundle.datasets.items():
            dataset.apply_field(self.convert_tag,
                                field_name=Const.TARGET,
                                new_field_name=Const.TARGET)

        _add_words_field(data_bundle, lower=self.lower)

        if self.word_shape:
            data_bundle.apply_field(word_shape,
                                    field_name='raw_words',
                                    new_field_name='word_shapes')
            data_bundle.set_input('word_shapes')

        # convert all digits to 0
        data_bundle.apply_field(lambda chars: [
            ''.join(['0' if c.isdigit() else c for c in char])
            for char in chars
        ],
                                field_name=Const.INPUT,
                                new_field_name=Const.INPUT)

        # index
        _indexize(data_bundle)

        input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN]
        target_fields = [Const.TARGET, Const.INPUT_LEN]

        for name, dataset in data_bundle.datasets.items():
            dataset.add_seq_len(Const.INPUT)

        data_bundle.set_input(*input_fields)
        data_bundle.set_target(*target_fields)

        return data_bundle
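
As a minimal sketch of the input shape the docstring's table describes, the snippet below builds a toy DataBundle with raw_words/target list fields; the class that owns this process() method is not named above, so the final call is left as a hypothetical comment.

from fastNLP import DataSet, Instance
from fastNLP.io import DataBundle

ds = DataSet()
# one instance per sentence: tokenized words plus their BIO tags, as in the table above
ds.append(Instance(raw_words=['Nadim', 'Ladki'], target=['B-PER', 'I-PER']))
ds.append(Instance(raw_words=['AL-AIN', 'United', 'Arab'], target=['B-LOC', 'B-LOC', 'I-LOC']))
data_bundle = DataBundle(datasets={'train': ds})

# pipe = ...  # hypothetical instance of the class defining process() above
# data_bundle = pipe.process(data_bundle)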
Example No. 7
 def load(self, paths, is_lower):
     if paths is None:
         raise NotImplementedError(f"I am not ready for downloading!")
     paths = check_loader_paths(paths)
     datasets = {
         name: self._load(path, is_lower)
         for name, path in paths.items()
     }
     data_bundle = DataBundle(datasets=datasets)
     return data_bundle
Example No. 8
    def load(self, paths):
        """
        The output DataSet contains the following fields:
        tokens                  pos                   dep                                    aspects
        ["The", "bread", ...]   ["DET", "NOUN",...]   [["dep", 2, 1], ["nsubj", 4, 2], ...]  [{"term": ["bread"], "polarity": "positive", "from": 1, "to": 2}]
        In dep, ["dep", 2, 1] means that the head of the current word is token 2 (0 is the root, so 2 here is "bread") and "dep" is the dependency relation.

        :param paths:
        :return:
        """
        data_bundle = DataBundle()
        folder_name = os.path.basename(paths)
        fns = [
            f'{folder_name}_Test_biaffine_depparsed.json',
            f'{folder_name}_Train_biaffine_depparsed.json'
        ]
        if not os.path.exists(os.path.join(paths, fns[0])):
            fns = [
                'Test_biaffine_depparsed.json',
                'Train_biaffine_depparsed.json'
            ]

        for split, name in zip(['test', 'train'], fns):
            fp = os.path.join(paths, name)
            with open(fp, 'r', encoding='utf-8') as f:
                data = json.load(f)
            ds = DataSet()
            for ins in data:
                tokens = ins['token']
                pos = ins['pos']
                dep = ins['dependencies']
                aspects = ins['aspects']
                ins = Instance(tokens=tokens,
                               pos=pos,
                               dep=dep,
                               aspects=aspects)
                ds.append(ins)
            data_bundle.set_dataset(ds, name=split)
        return data_bundle


# c = ConllUDataset('./data/EWT/en_ewt-ud-test.conllu')
# print('done')
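
For reference, here is a minimal sketch of the JSON layout this loader expects, based on the docstring and the keys read above ('token', 'pos', 'dependencies', 'aspects'); the folder, file contents and loader instance are all illustrative assumptions.

import json
import os
import tempfile

sample = [
    {
        "token": ["The", "bread", "is", "great"],
        "pos": ["DET", "NOUN", "AUX", "ADJ"],
        # ["relation", head, index]; head 0 is the root
        "dependencies": [["det", 2, 1], ["nsubj", 4, 2], ["cop", 4, 3], ["root", 0, 4]],
        "aspects": [{"term": ["bread"], "polarity": "positive", "from": 1, "to": 2}],
    }
]

folder = tempfile.mkdtemp()
# the un-prefixed fallback file names are used here, which the loader above also accepts
for fn in ("Test_biaffine_depparsed.json", "Train_biaffine_depparsed.json"):
    with open(os.path.join(folder, fn), "w", encoding="utf-8") as f:
        json.dump(sample, f)

# loader = ...  # hypothetical instance of the class defining load() above
# data_bundle = loader.load(folder)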
Example No. 9
    def load(self, paths: Union[str, Dict[str, str]] = None) -> DataBundle:

        if paths is None:
            paths = self.download()
        paths = check_loader_paths(paths)

        datasets = {}
        for name, path in paths.items():
            datasets[name] = self._load(path)
        data_bundle = DataBundle(datasets=datasets)
        return data_bundle
Example No. 10
 def load(self, folder):
     data_bundle = DataBundle()
     for name in ['desc.json', 'dev.json', 'seen.json', 'train.json', 'unseen.json']:
         path = os.path.join(folder, name)
         dataset = DataSet()
         with open(path, 'r', encoding='utf-8') as f:
             data = json.load(f)
             for d in data:
                 word = d['word'].lower()
                 definition = d['definitions'].lower()
                 ins = Instance(word=word, definition=definition)
                 dataset.append(ins)
             data_bundle.set_dataset(dataset, name=name.split('.')[0])
     words = []
     with open(os.path.join(folder, 'target_words.txt'), 'r', encoding='utf-8') as f:
         for line in f:
             line = line.strip()
             if line:
                 words.append(line)
     setattr(data_bundle, 'target_words', words)
     return data_bundle
Example No. 11
 def load(self, paths: Union[str, Dict[str, str]] = None, ratio_tr_d_te: tuple = ratio_tr_d_te) -> DataBundle:
     '''
     :param paths: when a str, all samples of the training set are read in and then split 8:1:1 on top of it;
                   when a Dict[str, str], the keys train/test/val pick the training, test and validation sets ('train' is required);
                   if 'val' or 'test' is missing, a fraction of 'train' is split off to serve as the validation/test set
     :return:
     '''
     paths = self.check_loader_paths(paths)  # at this point paths is a dict {'train': XXX, ...}
     datasets = {name: self._load(path=path) for name, path in paths.items()}
     # shuffle every dataset
     for name, ds in datasets.items():
         shuffled_ds = DataSet()
         indices = list(range(len(ds)))
         random.shuffle(indices)
         for idx in indices:
             shuffled_ds.append(ds[idx])
         datasets[name] = shuffled_ds
     # shuffling done
     if len(datasets) == 1:
         print('Detected that only the train dataset was loaded; splitting it into train/test/val with the default 8:1:1 ratio')
         ds = datasets['train']
         train_count = int(len(ds) * (ratio_tr_d_te[0] / sum(ratio_tr_d_te)))
         test_count = int(len(ds) * (ratio_tr_d_te[2] / sum(ratio_tr_d_te)))
         return DataBundle(datasets={'train': ds[:train_count], 'val': ds[train_count:-test_count], 'test': ds[-test_count:]})
     elif len(datasets) == 3:
         print('Detected train, test and val; no split from train is needed')
         return DataBundle(datasets=datasets)
     elif 'val' not in datasets:
         print('Detected train and test; splitting val off from train')
         ds = datasets['train']
         val_count = int(len(ds) * (ratio_tr_d_te[1] / sum(ratio_tr_d_te)))
         return DataBundle(datasets={'train': ds[:-val_count], 'val': ds[-val_count:], 'test': datasets['test']})
     elif 'test' not in datasets:
         print('Detected train and val; splitting test off from train')
         ds = datasets['train']
         test_count = int(len(ds) * (ratio_tr_d_te[2] / sum(ratio_tr_d_te)))
         return DataBundle(datasets={'train': ds[:-test_count], 'val': datasets['val'], 'test': ds[-test_count:]})
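
A minimal sketch of the single-path branch above, using a toy ten-instance DataSet to show that slicing a fastNLP DataSet yields new DataSets that can go straight into a DataBundle (the toy field names are illustrative; the 8:1:1 ratio matches the default mentioned above):

from fastNLP import DataSet, Instance
from fastNLP.io import DataBundle

ds = DataSet()
for i in range(10):
    ds.append(Instance(words=['tok{}'.format(i)], target=i % 2))

ratio_tr_d_te = (8, 1, 1)  # train : val : test
train_count = int(len(ds) * (ratio_tr_d_te[0] / sum(ratio_tr_d_te)))  # 8
test_count = int(len(ds) * (ratio_tr_d_te[2] / sum(ratio_tr_d_te)))   # 1

bundle = DataBundle(datasets={'train': ds[:train_count],
                              'val': ds[train_count:-test_count],
                              'test': ds[-test_count:]})
print(bundle)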
Example No. 12
    def load(self, paths):
        """
        The output DataSet contains the following fields:
        tokens                  pos                   dep                                    aspects
        ["The", "bread", ...]   ["DET", "NOUN",...]   [["dep", 2, 1], ["nsubj", 4, 2], ...]  [{"term": ["bread"], "polarity": "positive", "from": 1, "to": 2}]
        In dep, ["dep", 2, 1] means that the head of the current word is token 2 (0 is the root, so 2 here is "bread") and "dep" is the dependency relation.

        :param paths:
        :return:
        """
        data_bundle = DataBundle()
        folder_name = os.path.basename(paths)
        fns = [
            f"{folder_name}_Test.json",
            f"{folder_name}_Train.json",
        ]
        if not os.path.exists(os.path.join(paths, fns[0])):
            fns = ["Test.json", "Train.json"]

        for split, name in zip(["test", "train"], fns):
            fp = os.path.join(paths, name)
            with open(fp, "r", encoding="utf-8") as f:
                data = json.load(f)
            ds = DataSet()
            for ins in data:
                tokens = ins["token"]
                pos = ins["pos"]
                dep = ins["dependencies"]
                aspects = ins["aspects"]
                ins = Instance(tokens=tokens,
                               pos=pos,
                               dep=dep,
                               aspects=aspects)
                ds.append(ins)
            data_bundle.set_dataset(ds, name=split)
        return data_bundle
Example No. 13
 def test_demo(self):
     # related to issue https://github.com/fastnlp/fastNLP/issues/324#issue-705081091
     from fastNLP import DataSet, Instance
     from fastNLP.io import DataBundle
     data_bundle = DataBundle()
     ds = DataSet()
     ds.append(Instance(raw_words="截流 进入 最后 冲刺 ( 附 图片 1 张 )"))
     data_bundle.set_dataset(ds, name='train')
     data_bundle = CWSPipe().process(data_bundle)
     self.assertFalse('<' in data_bundle.get_vocab('chars'))
 def process(self, data_bundle: DataBundle) -> DataBundle:
     data_bundle.copy_field(field_name=C.RAW_WORD,
                            new_field_name=C.INPUT,
                            ignore_miss_dataset=True)
     for name, dataset in data_bundle.datasets.items():
         dataset.apply_field(self.copy_func,
                             field_name=C.RAW_WORD,
                             new_field_name=C.INPUT)
          dataset.add_seq_len(C.INPUT)  # note that raw_words is used here rather than Const.INPUT (= words)
     data_bundle.set_input(C.INPUT, C.INPUT_LEN)
     data_bundle.set_target(C.TARGET)  # Const.TARGET ,'target'
     return data_bundle
 def load(
     self,
     paths: Union[str, Dict[str, str]] = None,
     ratio_train_dev_test: tuple = (8, 1, 1)
 ) -> tuple:
      '''
      Calls _load and splits its return value into datasets: (train, val), test
      :param paths:
      :return: (DataBundle, id2country_dict)
      '''
     datasets, id2country_dict = self._load(paths)
     train_data = DataSet()
     dev_data = DataSet()
     test_data = DataSet()
     indices = [_ for _ in range(len(datasets))]
     random.shuffle(indices)
     train_count = int(
         len(datasets) *
         (ratio_train_dev_test[0] / sum(ratio_train_dev_test)))
     dev_count = int(
         len(datasets) *
         (ratio_train_dev_test[1] / sum(ratio_train_dev_test)))
     test_count = int(
         len(datasets) *
         (ratio_train_dev_test[2] / sum(ratio_train_dev_test)))
     train_indices = indices[:train_count]
     dev_indices = indices[train_count:train_count + dev_count]
     test_indices = indices[train_count + dev_count:]
     for idx in train_indices:
         train_data.append(datasets[idx])
     for idx in dev_indices:
         dev_data.append(datasets[idx])
     for idx in test_indices:
         test_data.append(datasets[idx])
      warnings.warn('Split into train/dev/test sets, counts: {}/{}/{}'.format(
          len(train_data), len(dev_data), len(test_data)))
     data_set = {'train': train_data, 'dev': dev_data, 'test': test_data}
     return DataBundle(datasets=data_set), id2country_dict
Example No. 16
    def load(self, folder):
        # first read the two monolingual files
        lg1, lg2 = self.lg1_lg2.split('_')
        fns = {
            'dev': '{}_dev.csv',
            # 'test':'{}_test500.csv'.format(self.lg1_lg2),
            'train': '{}_train500_10.csv'
        }
        data_bundle = DataBundle()
        words = {}
        for lg in [lg1, lg2]:
            for name, fn in fns.items():
                path = os.path.join(folder, fn.format(lg))
                ds = read_dataset(path, self.lower, 0)
                data_bundle.set_dataset(ds, name=f'{lg}_{name}')
            target_words = {}
            with open(os.path.join(folder, '{}.txt'.format(lg)),
                      encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if line:
                        if self.lower:
                            line = line.lower()
                        target_words[line] = 1
            target_words = list(target_words.keys())
            words[lg] = target_words
        setattr(data_bundle, 'target_words_dict', words)

        # read the bilingual test data
        bi1 = f'{lg1}_{lg2}'
        bi2 = f'{lg2}_{lg1}'
        for bi in [bi1, bi2]:
            path = os.path.join(folder, '{}_test500.csv'.format(bi))
            ds = read_dataset(path, self.lower, 1)
            # ds = DataSet()
            # with open(path, 'r', encoding='utf-8') as f:
            #     for line in f:
            #         line = line.strip()
            #         if line:
            #             parts = line.split('\t')
            #             ins = Instance(word=parts[1].lower(), definition=parts[-1])
            #             ds.append(ins)
            data_bundle.set_dataset(ds, '{}_test'.format(bi))

        return data_bundle
Example No. 17
    def process(self, data_bundle: DataBundle) -> DataBundle:
        new_bundle = DataBundle()
        aspect_dict = {}
        mask_id = self.tokenizer.convert_tokens_to_ids([self.mask])[0]
        if isinstance(self.tokenizer, BertTokenizer):
            cls = "[CLS]"
            sep = "[SEP]"
        else:
            cls = self.tokenizer.cls_token
            sep = self.tokenizer.sep_token
        for name, ds in data_bundle.iter_datasets():
            new_ds = DataSet()
            for ins in ds:
                tokens = ins["tokens"]
                if not isinstance(self.tokenizer, XLNetTokenizer):
                    tokens.insert(0, cls)
                    tokens.append(sep)
                    shift = 1
                else:
                    tokens.append(sep)
                    tokens.append(cls)
                    shift = 0

                starts = []
                ends = []
                for aspect in ins["aspects"]:
                    starts.append(aspect["from"] + shift)
                    ends.append(aspect["to"] + shift)
                for aspect in ins["aspects"]:
                    target = aspect["polarity"]
                    start = aspect["from"] + shift
                    end = aspect["to"] + shift
                    aspect_mask = [0] * len(tokens)
                    for i in range(start, end):
                        aspect_mask[i] = 1
                    pieces = []
                    piece_masks = []
                    raw_words = tokens[shift:-1]
                    raw_words.insert(start - 1, "[[")
                    raw_words.insert(end, "]]")
                    for mask, token in zip(aspect_mask, tokens):
                        bpes = self.tokenizer.convert_tokens_to_ids(
                            self.tokenizer.tokenize(token))
                        pieces.extend(bpes)
                        piece_masks.extend([mask] * (len(bpes)))
                    new_ins = Instance(
                        tokens=pieces,
                        target=target,
                        aspect_mask=piece_masks,
                        raw_words=" ".join(raw_words),
                    )
                    new_ds.append(new_ins)
            new_bundle.set_dataset(new_ds, name)

        target_vocab = Vocabulary(padding=None, unknown=None)
        target_vocab.add_word_lst(
            ["neutral", "positive", "negative", "smooth"])
        target_vocab.index_dataset(*new_bundle.datasets.values(),
                                   field_name="target")

        new_bundle.set_target("target")
        new_bundle.set_input("tokens", "aspect_mask", "raw_words")
        new_bundle.apply_field(lambda x: len(x),
                               field_name="tokens",
                               new_field_name="seq_len")

        # new_bundle.set_vocab(vocab, 'tokens')
        if hasattr(self.tokenizer, "pad_token_id"):
            new_bundle.set_pad_val("tokens", self.tokenizer.pad_token_id)
        else:
            new_bundle.set_pad_val("tokens", self.tokenizer.pad_index)
        new_bundle.set_vocab(target_vocab, "target")

        return new_bundle
Example No. 18
    def process(self, paths):
        """
        
        :param paths: 
        :return:
            The DataSets contain the following fields:
                chars:
                bigrams:
                trigrams:
                pre_chars:
                pre_bigrams:
                pre_trigrams:
                seg_targets:
                seg_masks:
                seq_lens:
                char_labels:
                char_heads:
                pun_masks:
                gold_word_pairs:
                gold_label_word_pairs:
        """
        paths = check_dataloader_paths(paths)
        data = DataBundle()

        for name, path in paths.items():
            print(name,path)
            dataset = self.load(path)
            data.datasets[name] = dataset

        char_labels_vocab = Vocabulary(padding=None, unknown=None)

        def process(dataset, char_label_vocab):
            dataset.apply(add_word_lst, new_field_name='word_lst')
            dataset.apply(lambda x: list(chain(*x['word_lst'])), new_field_name='chars')
            dataset.apply(add_bigram, field_name='chars', new_field_name='bigrams')
            dataset.apply(add_trigram, field_name='chars', new_field_name='trigrams')
            dataset.apply(add_char_heads, new_field_name='char_heads')
            dataset.apply(add_char_labels, new_field_name='char_labels')
            dataset.apply(add_segs, new_field_name='seg_targets')
            dataset.apply(add_mask, new_field_name='seg_masks')
            dataset.add_seq_len('chars', new_field_name='seq_lens')
            dataset.apply(add_pun_masks, new_field_name='pun_masks')
            if len(char_label_vocab.word_count)==0:
                char_label_vocab.from_dataset(dataset, field_name='char_labels')
            char_label_vocab.index_dataset(dataset, field_name='char_labels')
            new_dataset = add_root(dataset)
            new_dataset.apply(add_word_pairs, new_field_name='gold_word_pairs', ignore_type=True)
            global add_label_word_pairs
            add_label_word_pairs = partial(add_label_word_pairs, label_vocab=char_label_vocab)
            new_dataset.apply(add_label_word_pairs, new_field_name='gold_label_word_pairs', ignore_type=True)

            new_dataset.set_pad_val('char_labels', -1)
            new_dataset.set_pad_val('char_heads', -1)

            return new_dataset

        for name in list(paths.keys()):
            dataset = data.datasets[name]
            dataset = process(dataset, char_labels_vocab)
            data.datasets[name] = dataset

        data.vocabs['char_labels'] = char_labels_vocab

        char_vocab = Vocabulary(min_freq=2).from_dataset(data.datasets['train'], field_name='chars')
        bigram_vocab = Vocabulary(min_freq=5).from_dataset(data.datasets['train'], field_name='bigrams')
        trigram_vocab = Vocabulary(min_freq=5).from_dataset(data.datasets['train'], field_name='trigrams')

        for name in ['chars', 'bigrams', 'trigrams']:
            vocab = Vocabulary().from_dataset(field_name=name, no_create_entry_dataset=list(data.datasets.values()))
            vocab.index_dataset(*data.datasets.values(), field_name=name, new_field_name='pre_' + name)
            data.vocabs['pre_{}'.format(name)] = vocab

        for name, vocab in zip(['chars', 'bigrams', 'trigrams'],
                        [char_vocab, bigram_vocab, trigram_vocab]):
            vocab.index_dataset(*data.datasets.values(), field_name=name, new_field_name=name)
            data.vocabs[name] = vocab

        for name, dataset in data.datasets.items():
            dataset.set_input('chars', 'bigrams', 'trigrams', 'seq_lens', 'char_labels', 'char_heads', 'pre_chars',
                                  'pre_bigrams', 'pre_trigrams')
            dataset.set_target('gold_word_pairs', 'seq_lens', 'seg_targets', 'seg_masks', 'char_labels',
                                   'char_heads',
                                   'pun_masks', 'gold_label_word_pairs')

        return data
Example No. 19
train_dataset.set_target('target')
test_dataset.set_target('target')

'''build vocabulary'''
vocab = Vocabulary()
vocab.from_dataset(train_dataset, field_name='words', no_create_entry_dataset=[test_dataset])
vocab.index_dataset(train_dataset, test_dataset, field_name='words')

target_vocab = Vocabulary(padding=None, unknown=None)
target_vocab.from_dataset(train_dataset, field_name='target', no_create_entry_dataset=[test_dataset])
target_vocab.index_dataset(train_dataset, test_dataset, field_name='target')

'''build bundle'''
data_dict = {"train":train_dataset, "test":test_dataset}
vocab_dict = {"words":vocab, "target":target_vocab}
data_bundle = DataBundle(vocab_dict, data_dict)
print(data_bundle)

'''build model'''
embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='en-base-uncased', include_cls_sep=True)
model = BertForSequenceClassification(embed, len(data_bundle.get_vocab('target')))
# model = BertForSequenceClassification(embed, 2)

device = 0 if torch.cuda.is_available() else 'cpu'
trainer = Trainer(data_bundle.get_dataset('train'), model,
                  optimizer=Adam(model_params=model.parameters(), lr=2e-5),
                  loss=CrossEntropyLoss(), device=device,
                  batch_size=8, dev_data=data_bundle.get_dataset('train'),
                  metrics=AccuracyMetric(), n_epochs=10, print_every=1)
trainer.train()
Example No. 20
    def process(self, data_bundle: DataBundle) -> DataBundle:
        """
        The DataSet to be processed must contain a raw_words column

        .. csv-table::
           :header: "raw_words"

           "上海 浦东 开发 与 法制 建设 同步"
           "新华社 上海 二月 十日 电 ( 记者 谢金虎 、 张持坚 )"
           "..."

        :param data_bundle:
        :return:
        """
        data_bundle.copy_field(Const.RAW_WORD, Const.CHAR_INPUT)

        if self.replace_num_alpha:
            data_bundle.apply_field(_find_and_replace_alpha_spans,
                                    Const.CHAR_INPUT, Const.CHAR_INPUT)
            data_bundle.apply_field(_find_and_replace_digit_spans,
                                    Const.CHAR_INPUT, Const.CHAR_INPUT)

        self._tokenize(data_bundle)
        input_field_names = [Const.CHAR_INPUT]
        target_field_names = []

        for name, dataset in data_bundle.datasets.items():
            dataset.apply_field(
                lambda chars: _word_lens_to_relay(map(len, chars)),
                field_name=Const.CHAR_INPUT,
                new_field_name=Const.TARGET)
            dataset.apply_field(
                lambda chars: _word_lens_to_start_seg_mask(map(len, chars)),
                field_name=Const.CHAR_INPUT,
                new_field_name='start_seg_mask')
            dataset.apply_field(
                lambda chars: _word_lens_to_end_seg_mask(map(len, chars)),
                field_name=Const.CHAR_INPUT,
                new_field_name='end_seg_mask')
            dataset.apply_field(lambda chars: list(chain(*chars)),
                                field_name=Const.CHAR_INPUT,
                                new_field_name=Const.CHAR_INPUT)
            target_field_names.append('start_seg_mask')
            input_field_names.append('end_seg_mask')
        if self.bigrams:
            for name, dataset in data_bundle.datasets.items():
                dataset.apply_field(
                    lambda chars:
                    [c1 + c2 for c1, c2 in zip(chars, chars[1:] + ['<eos>'])],
                    field_name=Const.CHAR_INPUT,
                    new_field_name='bigrams')
            input_field_names.append('bigrams')

        _indexize(data_bundle, ['chars', 'bigrams'], [])

        func = partial(_clip_target, L=self.L)
        for name, dataset in data_bundle.datasets.items():
            res = dataset.apply_field(func, field_name='target')
            relay_target = [res_i[0] for res_i in res]
            relay_mask = [res_i[1] for res_i in res]
            dataset.add_field('relay_target',
                              relay_target,
                              is_input=True,
                              is_target=False,
                              ignore_type=False)
            dataset.add_field('relay_mask',
                              relay_mask,
                              is_input=True,
                              is_target=False,
                              ignore_type=False)
            input_field_names.append('relay_target')
            input_field_names.append('relay_mask')

        input_fields = [Const.TARGET, Const.INPUT_LEN] + input_field_names
        target_fields = [Const.TARGET, Const.INPUT_LEN] + target_field_names
        for name, dataset in data_bundle.datasets.items():
            dataset.add_seq_len(Const.CHAR_INPUT)

        data_bundle.set_input(*input_fields)
        data_bundle.set_target(*target_fields)

        return data_bundle
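
A minimal sketch of the input this pipe expects, taken directly from the docstring's table: each instance holds a raw_words string of space-separated, already-segmented words. The pipe instance is not named in the snippet, so the final call stays a hypothetical comment (the same construction pattern appears in the test_demo of Example No. 13).

from fastNLP import DataSet, Instance
from fastNLP.io import DataBundle

ds = DataSet()
ds.append(Instance(raw_words='上海 浦东 开发 与 法制 建设 同步'))
ds.append(Instance(raw_words='新华社 上海 二月 十日 电 ( 记者 谢金虎 、 张持坚 )'))
data_bundle = DataBundle(datasets={'train': ds})

# pipe = ...  # hypothetical instance of the class defining process() above
# data_bundle = pipe.process(data_bundle)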
Example No. 21
                    dev_dataset,
                    test_dataset,
                    field_name='words')

target_vocab = Vocabulary(padding=None, unknown=None)
target_vocab.from_dataset(train_dataset,
                          field_name='target',
                          no_create_entry_dataset=[dev_dataset, test_dataset])
target_vocab.index_dataset(train_dataset,
                           dev_dataset,
                           test_dataset,
                           field_name='target')
'''build bundle'''
data_dict = {"train": train_dataset, "dev": dev_dataset, "test": test_dataset}
vocab_dict = {"words": vocab, "target": target_vocab}
data_bundle = DataBundle(vocab_dict, data_dict)
print(data_bundle)
'''build model'''
embed = BertEmbedding(data_bundle.get_vocab('words'),
                      model_dir_or_name='en-base-uncased',
                      include_cls_sep=True)
model = BertForSequenceClassification(embed,
                                      len(data_bundle.get_vocab('target')))
# model = BertForSequenceClassification(embed, 2)

device = 0 if torch.cuda.is_available() else 'cpu'
trainer = Trainer(data_bundle.get_dataset('train'),
                  model,
                  optimizer=Adam(model_params=model.parameters(), lr=2e-5),
                  loss=CrossEntropyLoss(target='target'),
                  device=device,