Example #1
def load_seqtag(path, files, indexes):
    # Load CoNLL-style sequence-tagging files and build word/tag vocabularies.
    word_h, tag_h = 'words', 'tags'
    loader = ConllLoader(headers=[word_h, tag_h], indexes=indexes)
    ds_list = []
    for fn in files:
        ds_list.append(loader.load(os.path.join(path, fn)))
    # Vocabularies are built from the first dataset, assumed to be the training split.
    word_v = Vocabulary(min_freq=2)
    tag_v = Vocabulary(unknown=None)
    update_v(word_v, ds_list[0], word_h)
    update_v(tag_v, ds_list[0], tag_h)

    def process_data(ds):
        # Index tokens and tags, truncate to MAX_LEN, and record sequence lengths.
        to_index(word_v, ds, word_h, C.INPUT)
        to_index(tag_v, ds, tag_h, C.TARGET)
        ds.apply(lambda x: x[C.INPUT][:MAX_LEN], new_field_name=C.INPUT)
        ds.apply(lambda x: x[C.TARGET][:MAX_LEN], new_field_name=C.TARGET)
        ds.apply(lambda x: len(x[word_h]), new_field_name=C.INPUT_LEN)
        ds.set_input(C.INPUT, C.INPUT_LEN)
        ds.set_target(C.TARGET, C.INPUT_LEN)

    for i in range(len(ds_list)):
        process_data(ds_list[i])
    return ds_list, word_v, tag_v
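
A minimal usage sketch for load_seqtag; the directory, file names, and column indexes below are assumptions, and the helpers update_v/to_index plus the constants C and MAX_LEN must already exist in the surrounding project:

datasets, word_vocab, tag_vocab = load_seqtag(
    path='data/conll',                           # hypothetical data directory
    files=['train.txt', 'dev.txt', 'test.txt'],  # hypothetical split files
    indexes=[0, 3],                              # hypothetical word/tag column positions
)
train_ds = datasets[0]
print(len(word_vocab), len(tag_vocab), len(train_ds))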
Example #2
def load_conllized_ontonote_POS(path, embedding_path=None):
    from fastNLP.io.data_loader import ConllLoader
    header2index = {'words': 3, 'POS': 4, 'NER': 10}
    headers = ['words', 'POS']

    if 'NER' in headers:
        print('Warning: NER labels read by load_conllized_ontonote are raw CoNLL-format tags, not BIOES, and are therefore wrong!')
    indexes = list(map(lambda x: header2index[x], headers))

    loader = ConllLoader(headers,indexes)

    bundle = loader.load(path)

    # print(bundle.datasets)

    train_set = bundle.datasets['train']
    dev_set = bundle.datasets['dev']
    test_set = bundle.datasets['test']

    # train_set = loader.load(os.path.join(path,'train.txt'))
    # dev_set = loader.load(os.path.join(path, 'dev.txt'))
    # test_set = loader.load(os.path.join(path, 'test.txt'))

    # print(len(train_set))

    train_set.add_seq_len('words', 'seq_len')
    dev_set.add_seq_len('words', 'seq_len')
    test_set.add_seq_len('words', 'seq_len')

    # print(dataset['POS'])

    vocab = Vocabulary(min_freq=1)
    # Note: dev/test words become full vocabulary entries here, unlike the
    # no_create_entry_dataset pattern used elsewhere.
    vocab.from_dataset(train_set, field_name='words')
    vocab.from_dataset(dev_set, field_name='words')
    vocab.from_dataset(test_set, field_name='words')

    vocab.index_dataset(train_set, field_name='words')
    vocab.index_dataset(dev_set, field_name='words')
    vocab.index_dataset(test_set, field_name='words')

    label_vocab_dict = {}

    for i,h in enumerate(headers):
        if h == 'words':
            continue
        label_vocab_dict[h] = Vocabulary(min_freq=1,padding=None,unknown=None)
        label_vocab_dict[h].from_dataset(train_set,field_name=h)

        label_vocab_dict[h].index_dataset(train_set,field_name=h)
        label_vocab_dict[h].index_dataset(dev_set,field_name=h)
        label_vocab_dict[h].index_dataset(test_set,field_name=h)

    train_set.set_input(Const.INPUT, Const.INPUT_LEN)
    train_set.set_target(headers[1])

    dev_set.set_input(Const.INPUT, Const.INPUT_LEN)
    dev_set.set_target(headers[1])

    test_set.set_input(Const.INPUT, Const.INPUT_LEN)
    test_set.set_target(headers[1])

    if len(headers) > 2:
        print('Warning: with more than one tagging task, the target field has to be set manually each time!')

    print('train:',len(train_set),'dev:',len(dev_set),'test:',len(test_set))

    if embedding_path is not None:
        # load_word_emb is a project-local helper; 300 is presumably the embedding dimension.
        pretrained_embedding = load_word_emb(embedding_path, 300, vocab)
        return (train_set,dev_set,test_set),(vocab,label_vocab_dict),pretrained_embedding
    else:
        return (train_set, dev_set, test_set), (vocab, label_vocab_dict)
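
A usage sketch under the assumption that the path holds train/dev/test splits that ConllLoader can discover (the path itself is hypothetical):

(train, dev, test), (word_vocab, label_vocab_dict) = load_conllized_ontonote_POS('data/ontonotes')
print(len(word_vocab), label_vocab_dict['POS'].idx2word)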
Example #3
def load_resume_ner(path, char_embedding_path=None, bigram_embedding_path=None, index_token=True,
                    normalize={'char': True, 'bigram': True, 'word': False}):
    from fastNLP.io.data_loader import ConllLoader
    from utils import get_bigrams

    train_path = os.path.join(path,'train.char.bmes')
    dev_path = os.path.join(path,'dev.char.bmes')
    test_path = os.path.join(path,'test.char.bmes')

    loader = ConllLoader(['chars','target'])
    train_bundle = loader.load(train_path)
    dev_bundle = loader.load(dev_path)
    test_bundle = loader.load(test_path)


    datasets = dict()
    # Each split is loaded from its own file, so every bundle keeps its DataSet under the key 'train'.
    datasets['train'] = train_bundle.datasets['train']
    datasets['dev'] = dev_bundle.datasets['train']
    datasets['test'] = test_bundle.datasets['train']


    datasets['train'].apply_field(get_bigrams,field_name='chars',new_field_name='bigrams')
    datasets['dev'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['test'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')

    datasets['train'].add_seq_len('chars')
    datasets['dev'].add_seq_len('chars')
    datasets['test'].add_seq_len('chars')



    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary(padding=None,unknown=None)
    print(datasets.keys())
    print(len(datasets['dev']))
    print(len(datasets['test']))
    print(len(datasets['train']))
    # dev/test tokens are registered as no-create-entry so that StaticEmbedding will
    # not learn vectors for words that never occur in training.
    char_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['dev'], datasets['test']])
    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    if index_token:
        char_vocab.index_dataset(datasets['train'],datasets['dev'],datasets['test'],
                                 field_name='chars',new_field_name='chars')
        bigram_vocab.index_dataset(datasets['train'],datasets['dev'],datasets['test'],
                                 field_name='bigrams',new_field_name='bigrams')
        label_vocab.index_dataset(datasets['train'],datasets['dev'],datasets['test'],
                                 field_name='target',new_field_name='target')

    vocabs = {}
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab

    embeddings = {}
    if char_embedding_path is not None:
        char_embedding = StaticEmbedding(char_vocab,char_embedding_path,word_dropout=0.01,normalize=normalize['char'])
        embeddings['char'] = char_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab,bigram_embedding_path,word_dropout=0.01,normalize=normalize['bigram'])
        embeddings['bigram'] = bigram_embedding

    return datasets,vocabs,embeddings
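
Examples #3, #4, and #7 import get_bigrams from a project-local utils module. A plausible sketch of that helper (an assumption, not the project's verbatim code) pairs each character with its successor and pads the last position with a sentinel:

def get_bigrams(chars):
    # 'abc' -> ['ab', 'bc', 'c<end>']; the sentinel token is an assumption.
    return [chars[i] + chars[i + 1] if i + 1 < len(chars) else chars[i] + '<end>'
            for i in range(len(chars))]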
Example #4
def load_weibo_ner(path, unigram_embedding_path=None, bigram_embedding_path=None, index_token=True,
                   normalize={'char': True, 'bigram': True, 'word': False}):
    from fastNLP.io.data_loader import ConllLoader
    from utils import get_bigrams

    loader = ConllLoader(['chars','target'])
    bundle = loader.load(path)

    datasets = bundle.datasets
    for k,v in datasets.items():
        print('{}:{}'.format(k,len(v)))
    # print(*list(datasets.keys()))
    vocabs = {}
    word_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary(padding=None,unknown=None)

    for k,v in datasets.items():
        # ignore the word segmentation tag
        v.apply_field(lambda x: [w[0] for w in x],'chars','chars')
        v.apply_field(get_bigrams,'chars','bigrams')


    word_vocab.from_dataset(datasets['train'],field_name='chars',no_create_entry_dataset=[datasets['dev'],datasets['test']])
    label_vocab.from_dataset(datasets['train'],field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab),label_vocab.idx2word))


    for k,v in datasets.items():
        # v.set_pad_val('target',-100)
        v.add_seq_len('chars',new_field_name='seq_len')


    vocabs['char'] = word_vocab
    vocabs['label'] = label_vocab


    bigram_vocab.from_dataset(datasets['train'],field_name='bigrams',no_create_entry_dataset=[datasets['dev'],datasets['test']])
    if index_token:
        word_vocab.index_dataset(*list(datasets.values()), field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()), field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()), field_name='target', new_field_name='target')

    # for k,v in datasets.items():
    #     v.set_input('chars','bigrams','seq_len','target')
    #     v.set_target('target','seq_len')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}

    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(word_vocab, model_dir_or_name=unigram_embedding_path,
                                            word_dropout=0.01, normalize=normalize['char'])
        embeddings['char'] = unigram_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, model_dir_or_name=bigram_embedding_path,
                                           word_dropout=0.01, normalize=normalize['bigram'])
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
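
A usage sketch, assuming path is a directory with train/dev/test files in the two-column chars/target format (the path is hypothetical):

datasets, vocabs, embeddings = load_weibo_ner('data/weibo')
print(len(datasets['train']), vocabs['label'].idx2word)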
Example #5
def __init__(self):
    self._loader = ConllLoader(headers=['words', 'pos_tags', 'heads', 'labels'], indexes=[1, 3, 6, 7])
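
The indexes refer to CoNLL-X columns: 1 = FORM, 3 = CPOSTAG, 6 = HEAD, 7 = DEPREL. A minimal sketch of what the loader extracts from one token line:

line = '3\t授予\t_\tVV\tVV\t_\t0\troot\t_\t_'.split('\t')
token = {h: line[i] for h, i in zip(['words', 'pos_tags', 'heads', 'labels'], [1, 3, 6, 7])}
# token == {'words': '授予', 'pos_tags': 'VV', 'heads': '0', 'labels': 'root'}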
Example #6
class CTBxJointLoader(DataSetLoader):
    """
    文件夹下应该具有以下的文件结构
        -train.conllx
        -dev.conllx
        -test.conllx
    每个文件中的内容如下(空格隔开不同的句子, 共有)
        1	费孝通	_	NR	NR	_	3	nsubjpass	_	_
        2	被	_	SB	SB	_	3	pass	_	_
        3	授予	_	VV	VV	_	0	root	_	_
        4	麦格赛赛	_	NR	NR	_	5	nn	_	_
        5	奖	_	NN	NN	_	3	dobj	_	_

        1	新华社	_	NR	NR	_	7	dep	_	_
        2	马尼拉	_	NR	NR	_	7	dep	_	_
        3	8月	_	NT	NT	_	7	dep	_	_
        4	31日	_	NT	NT	_	7	dep	_	_
        ...

    """
    def __init__(self):
        self._loader = ConllLoader(headers=['words', 'pos_tags', 'heads', 'labels'], indexes=[1, 3, 6, 7])

    def load(self, path:str):
        """
        Given a file path, read the data into a DataSet containing the following fields:
        words: list[str]
        pos_tags: list[str]
        heads: list[int]
        labels: list[str]

        :param path:
        :return:
        """
        dataset = self._loader.load(path)
        dataset.heads.int()  # cast the heads field to int
        return dataset
    
    def process(self, paths):
        """

        :param paths:
        :return:
            each DataSet contains the following fields:
                chars:
                bigrams:
                trigrams:
                pre_chars:
                pre_bigrams:
                pre_trigrams:
                seg_targets:
                seg_masks:
                seq_lens:
                char_labels:
                char_heads:
                pun_masks:
                gold_word_pairs:
                gold_label_word_pairs:
        """
        paths = check_dataloader_paths(paths)
        data = DataBundle()

        for name, path in paths.items():
            dataset = self.load(path)
            data.datasets[name] = dataset

        char_labels_vocab = Vocabulary(padding=None, unknown=None)

        def process(dataset, char_label_vocab):
            # add_word_lst, add_bigram, add_trigram, etc. are helpers defined elsewhere in the project.
            dataset.apply(add_word_lst, new_field_name='word_lst')
            dataset.apply(lambda x: list(chain(*x['word_lst'])), new_field_name='chars')
            dataset.apply(add_bigram, field_name='chars', new_field_name='bigrams')
            dataset.apply(add_trigram, field_name='chars', new_field_name='trigrams')
            dataset.apply(add_char_heads, new_field_name='char_heads')
            dataset.apply(add_char_labels, new_field_name='char_labels')
            dataset.apply(add_segs, new_field_name='seg_targets')
            dataset.apply(add_mask, new_field_name='seg_masks')
            dataset.add_seq_len('chars', new_field_name='seq_lens')
            dataset.apply(add_pun_masks, new_field_name='pun_masks')
            if len(char_label_vocab.word_count)==0:
                char_label_vocab.from_dataset(dataset, field_name='char_labels')
            char_label_vocab.index_dataset(dataset, field_name='char_labels')
            new_dataset = add_root(dataset)
            new_dataset.apply(add_word_pairs, new_field_name='gold_word_pairs', ignore_type=True)
            # Bind the label vocabulary once instead of mutating the module-level helper.
            label_word_pairs = partial(add_label_word_pairs, label_vocab=char_label_vocab)
            new_dataset.apply(label_word_pairs, new_field_name='gold_label_word_pairs', ignore_type=True)

            new_dataset.set_pad_val('char_labels', -1)
            new_dataset.set_pad_val('char_heads', -1)

            return new_dataset

        for name in list(paths.keys()):
            dataset = data.datasets[name]
            dataset = process(dataset, char_labels_vocab)
            data.datasets[name] = dataset

        data.vocabs['char_labels'] = char_labels_vocab

        char_vocab = Vocabulary(min_freq=2).from_dataset(data.datasets['train'], field_name='chars')
        bigram_vocab = Vocabulary(min_freq=5).from_dataset(data.datasets['train'], field_name='bigrams')
        trigram_vocab = Vocabulary(min_freq=5).from_dataset(data.datasets['train'], field_name='trigrams')

        for name in ['chars', 'bigrams', 'trigrams']:
            # Vocabulary for the pretrained channel: all words are no-create-entry,
            # so a StaticEmbedding built on it keeps only vectors present in the pretrained file.
            vocab = Vocabulary().from_dataset(field_name=name, no_create_entry_dataset=list(data.datasets.values()))
            vocab.index_dataset(*data.datasets.values(), field_name=name, new_field_name='pre_' + name)
            data.vocabs['pre_{}'.format(name)] = vocab

        for name, vocab in zip(['chars', 'bigrams', 'trigrams'],
                        [char_vocab, bigram_vocab, trigram_vocab]):
            vocab.index_dataset(*data.datasets.values(), field_name=name, new_field_name=name)
            data.vocabs[name] = vocab

        for name, dataset in data.datasets.items():
            dataset.set_input('chars', 'bigrams', 'trigrams', 'seq_lens', 'char_labels', 'char_heads', 'pre_chars',
                                  'pre_bigrams', 'pre_trigrams')
            dataset.set_target('gold_word_pairs', 'seq_lens', 'seg_targets', 'seg_masks', 'char_labels',
                                   'char_heads',
                                   'pun_masks', 'gold_label_word_pairs')

        return data
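
A usage sketch, assuming the directory layout from the class docstring (the path itself is hypothetical):

loader = CTBxJointLoader()
data = loader.process('data/ctb')   # expects train.conllx/dev.conllx/test.conllx inside
train = data.datasets['train']
print(train[0]['chars'], train[0]['seq_lens'])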
Example #7
def load_weibo_ner_old(path,
                       unigram_embedding_path=None,
                       bigram_embedding_path=None,
                       index_token=True,
                       normalize={
                           'char': True,
                           'bigram': True,
                           'word': False
                       }):
    from fastNLP.io.data_loader import ConllLoader
    from utils import get_bigrams

    loader = ConllLoader(['chars', 'target'])
    # from fastNLP.io.file_reader import _read_conll
    # from fastNLP.core import Instance,DataSet
    # def _load(path):
    #     ds = DataSet()
    #     for idx, data in _read_conll(path, indexes=loader.indexes, dropna=loader.dropna,
    #                                 encoding='ISO-8859-1'):
    #         ins = {h: data[i] for i, h in enumerate(loader.headers)}
    #         ds.append(Instance(**ins))
    #     return ds
    # from fastNLP.io.utils import check_loader_paths
    # paths = check_loader_paths(path)
    # datasets = {name: _load(path) for name, path in paths.items()}
    datasets = {}
    train_path = os.path.join(path, 'train.all.bmes')
    dev_path = os.path.join(path, 'dev.all.bmes')
    test_path = os.path.join(path, 'test.all.bmes')
    datasets['train'] = loader.load(train_path).datasets['train']
    datasets['dev'] = loader.load(dev_path).datasets['train']
    datasets['test'] = loader.load(test_path).datasets['train']

    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))

    vocabs = {}
    word_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary(padding=None, unknown=None)

    for k, v in datasets.items():
        # ignore the word segmentation tag
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    word_vocab.from_dataset(
        datasets['train'],
        field_name='chars',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target',-100)
        v.add_seq_len('chars', new_field_name='seq_len')

    vocabs['char'] = word_vocab
    vocabs['label'] = label_vocab

    bigram_vocab.from_dataset(
        datasets['train'],
        field_name='bigrams',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])
    if index_token:
        word_vocab.index_dataset(*list(datasets.values()),
                                 field_name='chars',
                                 new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()),
                                   field_name='bigrams',
                                   new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()),
                                  field_name='target',
                                  new_field_name='target')

    # for k,v in datasets.items():
    #     v.set_input('chars','bigrams','seq_len','target')
    #     v.set_target('target','seq_len')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}

    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(
            word_vocab,
            model_dir_or_name=unigram_embedding_path,
            word_dropout=0.01,
            normalize=normalize['char'])
        embeddings['char'] = unigram_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(
            bigram_vocab,
            model_dir_or_name=bigram_embedding_path,
            word_dropout=0.01,
            normalize=normalize['bigram'])
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings