def load_resume_ner(path, char_embedding_path=None, bigram_embedding_path=None, index_token=True,
                    char_min_freq=1, bigram_min_freq=1, only_train_min_freq=0, train_dataset_rate=1.0):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    train_dataset_rate_suffix = '' if train_dataset_rate == 1.0 else f'_{train_dataset_rate}'
    train_path = os.path.join(path, f'train.char.bmoes{train_dataset_rate_suffix}')
    dev_path = os.path.join(path, 'dev.char.bmoes')
    test_path = os.path.join(path, 'test.char.bmoes')

    print(f"load train dataset: {train_path}")
    loader = ConllLoader(['chars', 'target'])
    train_bundle = loader.load(train_path)
    dev_bundle = loader.load(dev_path)
    test_bundle = loader.load(test_path)

    datasets = dict()
    datasets['train'] = train_bundle.datasets['train']
    datasets['dev'] = dev_bundle.datasets['train']
    datasets['test'] = test_bundle.datasets['train']

    datasets['train'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['dev'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['test'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')

    datasets['train'].add_seq_len('chars')
    datasets['dev'].add_seq_len('chars')
    datasets['test'].add_seq_len('chars')

    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()
    print(datasets.keys())
    print("dev:", len(datasets['dev']))
    print("test:", len(datasets['test']))
    print("train:", len(datasets['train']))
    char_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['dev'], datasets['test']])
    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    if index_token:
        char_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                 field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                   field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                  field_name='target', new_field_name='target')

    vocabs = {}
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab
    # TODO: add span_label, attr_start, attr_end
    datasets, vocabs = input_with_span_attr(datasets, vocabs)

    embeddings = {}
    if char_embedding_path is not None:
        char_embedding = StaticEmbedding(char_vocab, char_embedding_path, word_dropout=0.01,
                                         min_freq=char_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['char'] = char_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, bigram_embedding_path, word_dropout=0.01,
                                           min_freq=bigram_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
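
# A minimal usage sketch (hedged: the directory below and any embedding paths are
# hypothetical; point them at your own ResumeNER files in *.char.bmoes format).
def _demo_load_resume_ner():
    datasets, vocabs, embeddings = load_resume_ner(
        '/path/to/ResumeNER',        # directory with train/dev/test .char.bmoes files
        char_embedding_path=None,    # pass a pretrained unigram embedding file to populate embeddings['char']
        bigram_embedding_path=None,
        index_token=True)
    # each split is a fastNLP DataSet with 'chars', 'bigrams', 'target' and 'seq_len' fields
    print(len(datasets['train']), len(vocabs['char']), len(vocabs['label']))
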
def load_toy_ner(path, char_embedding_path=None, bigram_embedding_path=None, index_token=True, train_clip=False):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    train_path = os.path.join(path, 'toy_train.bmoes')
    dev_path = os.path.join(path, 'toy_dev.bmoes')
    test_path = os.path.join(path, 'toy_test.bmoes')

    loader = ConllLoader(['chars', 'target'])
    train_bundle = loader.load(train_path)
    dev_bundle = loader.load(dev_path)
    test_bundle = loader.load(test_path)

    datasets = dict()
    datasets['train'] = train_bundle.datasets['train']
    datasets['dev'] = dev_bundle.datasets['train']
    datasets['test'] = test_bundle.datasets['train']

    datasets['train'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['dev'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['test'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')

    datasets['train'].add_seq_len('chars')
    datasets['dev'].add_seq_len('chars')
    datasets['test'].add_seq_len('chars')

    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary(padding=None, unknown=None)
    print(datasets.keys())
    print("dev:", len(datasets['dev']))
    print("test:", len(datasets['test']))
    print("train:", len(datasets['train']))
    char_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['dev'], datasets['test']])
    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    if index_token:
        char_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                 field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                   field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                  field_name='target', new_field_name='target')

    vocabs = {}
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab
    # TODO: add span_label, attr_start, attr_end
    datasets, vocabs = input_with_span_attr(datasets, vocabs)

    embeddings = {}
    if char_embedding_path is not None:
        char_embedding = StaticEmbedding(char_vocab, char_embedding_path, word_dropout=0.01)
        embeddings['char'] = char_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, bigram_embedding_path, word_dropout=0.01)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
def load_weibo_ner(path, unigram_embedding_path=None, bigram_embedding_path=None, index_token=True,
                   char_min_freq=1, bigram_min_freq=1, only_train_min_freq=0, char_word_dropout=0.01, train_dataset_rate=1.0):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    loader = ConllLoader(['chars', 'target'])
    # bundle = loader.load(path)
    #
    # datasets = bundle.datasets

    # print(datasets['train'][:5])

    train_dataset_rate_suffix = '' if train_dataset_rate == 1.0 else f'_{train_dataset_rate}'
    train_path = os.path.join(path, f'train.char.bmoes{train_dataset_rate_suffix}')
    print(f"load train dataset: {train_path}")
    dev_path = os.path.join(path, 'dev.char.bmoes')
    test_path = os.path.join(path, 'test.char.bmoes')

    paths = {}
    paths['train'] = train_path
    paths['dev'] = dev_path
    paths['test'] = test_path

    datasets = {}

    for k, v in paths.items():
        bundle = loader.load(v)
        datasets[k] = bundle.datasets['train']

    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))
    # print(*list(datasets.keys()))
    vocabs = {}
    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()
    # TODO: add span_label, attr_start, attr_end
    datasets, vocabs = input_with_span_attr(datasets, vocabs)
    print(datasets.keys())
    print("dev:", len(datasets['dev']))
    print("test:", len(datasets['test']))
    print("train:", len(datasets['train']))

    for k, v in datasets.items():
        # ignore the word segmentation tag
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    char_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target',-100)
        v.add_seq_len('chars', new_field_name='seq_len')

    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab

    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'], datasets['test']])
    if index_token:
        char_vocab.index_dataset(*list(datasets.values()), field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()), field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()), field_name='target', new_field_name='target')

    # for k,v in datasets.items():
    #     v.set_input('chars','bigrams','seq_len','target')
    #     v.set_target('target','seq_len')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}

    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(char_vocab, model_dir_or_name=unigram_embedding_path,
                                            word_dropout=char_word_dropout,
                                            min_freq=char_min_freq, only_train_min_freq=only_train_min_freq, )
        embeddings['char'] = unigram_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, model_dir_or_name=bigram_embedding_path,
                                           word_dropout=0.01,
                                           min_freq=bigram_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    print(f"train: {len(datasets['train'])}; dev: {len(datasets['dev'])}; test: {len(datasets['test'])}")
    return datasets, vocabs, embeddings
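
# The 'bigrams' field built above comes from utils.get_bigrams. A hedged sketch of what
# such a helper typically looks like (the project's real implementation may differ):
def _get_bigrams_sketch(chars):
    # pair each character with its successor, padding the last position with '<end>'
    return [a + b for a, b in zip(chars, chars[1:] + ['<end>'])]

# _get_bigrams_sketch(['南', '京', '市']) -> ['南京', '京市', '市<end>']
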
def equip_chinese_ner_with_lexicon(datasets,
                                   vocabs,
                                   embeddings,
                                   w_list,
                                   word_embedding_path=None,
                                   only_lexicon_in_train=False,
                                   word_char_mix_embedding_path=None,
                                   number_normalized=False,
                                   lattice_min_freq=1,
                                   only_train_min_freq=0):
    from fastNLP.core import Vocabulary

    def normalize_char(inp):
        result = []
        for c in inp:
            if c.isdigit():
                result.append('0')
            else:
                result.append(c)

        return result

    def normalize_bigram(inp):
        result = []
        for bi in inp:
            tmp = bi
            if tmp[0].isdigit():
                tmp = '0' + tmp[1:]
            if tmp[1].isdigit():
                tmp = tmp[0] + '0'

            result.append(tmp)
        return result

    # number_normalized: 0/False = no digit normalization, 1 = chars only,
    # 2 = chars and bigrams, 3 = chars and bigrams, applied before lexicon matching
    if number_normalized == 3:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(
            datasets['train'],
            field_name='chars',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(
            datasets['train'],
            field_name='bigrams',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

    if only_lexicon_in_train:
        print('only lexicon words that appear in the training set will be loaded')

    def get_skip_path(chars, w_trie):
        sentence = ''.join(chars)
        result = w_trie.get_lexicon(sentence)
        # print(result)

        return result

    from V0.utils_ import Trie
    from functools import partial
    from fastNLP.core import Vocabulary
    # from fastNLP.embeddings import StaticEmbedding
    from fastNLP_module import StaticEmbedding
    w_trie = Trie()
    for w in w_list:
        w_trie.insert(w)

    if only_lexicon_in_train:
        lexicon_in_train = set()
        for chars in datasets['train']['chars']:
            lexicon_in_s = w_trie.get_lexicon(chars)
            for s, e, lexicon in lexicon_in_s:
                lexicon_in_train.add(''.join(lexicon))

        print('lexicon in train:{}'.format(len(lexicon_in_train)))
        print('i.e.: {}'.format(list(lexicon_in_train)[:10]))
        w_trie = Trie()
        for w in lexicon_in_train:
            w_trie.insert(w)

    import copy
    for k, v in datasets.items():
        v.apply_field(partial(get_skip_path, w_trie=w_trie), 'chars',
                      'lexicons')
        v.apply_field(copy.copy, 'chars', 'raw_chars')
        v.add_seq_len('lexicons', 'lex_num')
        v.apply_field(lambda x: list(map(lambda y: y[0], x)), 'lexicons',
                      'lex_s')
        v.apply_field(lambda x: list(map(lambda y: y[1], x)), 'lexicons',
                      'lex_e')

    if number_normalized == 1:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(
            datasets['train'],
            field_name='chars',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

    if number_normalized == 2:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(
            datasets['train'],
            field_name='chars',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(
            datasets['train'],
            field_name='bigrams',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

    def concat(ins):
        chars = ins['chars']
        lexicons = ins['lexicons']
        result = chars + list(map(lambda x: x[2], lexicons))
        # print('lexicons:{}'.format(lexicons))
        # print('lex_only:{}'.format(list(filter(lambda x:x[2],lexicons))))
        # print('result:{}'.format(result))
        return result

    def get_pos_s(ins):
        lex_s = ins['lex_s']
        seq_len = ins['seq_len']
        pos_s = list(range(seq_len)) + lex_s

        return pos_s

    def get_pos_e(ins):
        lex_e = ins['lex_e']
        seq_len = ins['seq_len']
        pos_e = list(range(seq_len)) + lex_e

        return pos_e

    for k, v in datasets.items():
        v.apply(concat, new_field_name='lattice')
        v.set_input('lattice')
        v.apply(get_pos_s, new_field_name='pos_s')
        v.apply(get_pos_e, new_field_name='pos_e')
        v.set_input('pos_s', 'pos_e')

    # print(list(datasets['train'][:10]['lexicons']))
    # print(list(datasets['train'][:10]['lattice']))
    # print(list(datasets['train'][:10]['lex_s']))
    # print(list(datasets['train'][:10]['lex_e']))
    # print(list(datasets['train'][:10]['pos_s']))
    # print(list(datasets['train'][:10]['pos_e']))
    # exit(1208)

    word_vocab = Vocabulary()
    word_vocab.add_word_lst(w_list)
    vocabs['word'] = word_vocab

    lattice_vocab = Vocabulary()
    lattice_vocab.from_dataset(datasets['train'],
                               field_name='lattice',
                               no_create_entry_dataset=[
                                   v for k, v in datasets.items()
                                   if k != 'train'
                               ])
    vocabs['lattice'] = lattice_vocab
    # for k,v in datasets.items():
    #     v.apply_field(lambda x:[ list(map(lambda x:x[0],p)) for p in x],'skips_l2r','skips_l2r_source')
    #     v.apply_field(lambda x:[ list(map(lambda x:x[1],p)) for p in x], 'skips_l2r', 'skips_l2r_word')
    #
    # for k,v in datasets.items():
    #     v.apply_field(lambda x:[ list(map(lambda x:x[0],p)) for p in x],'skips_r2l','skips_r2l_source')
    #     v.apply_field(lambda x:[ list(map(lambda x:x[1],p)) for p in x], 'skips_r2l', 'skips_r2l_word')

    # for k,v in datasets.items():
    #     v.apply_field(lambda x:list(map(len,x)), 'skips_l2r_word', 'lexicon_count')
    #     v.apply_field(lambda x:
    #                   list(map(lambda y:
    #                            list(map(lambda z:word_vocab.to_index(z),y)),x)),
    #                   'skips_l2r_word',new_field_name='skips_l2r_word')
    #
    #     v.apply_field(lambda x:list(map(len,x)), 'skips_r2l_word', 'lexicon_count_back')
    #
    #     v.apply_field(lambda x:
    #                   list(map(lambda y:
    #                            list(map(lambda z:word_vocab.to_index(z),y)),x)),
    #                   'skips_r2l_word',new_field_name='skips_r2l_word')

    if word_embedding_path is not None:
        word_embedding = StaticEmbedding(word_vocab,
                                         word_embedding_path,
                                         word_dropout=0)
        embeddings['word'] = word_embedding

    if word_char_mix_embedding_path is not None:
        lattice_embedding = StaticEmbedding(
            lattice_vocab,
            word_char_mix_embedding_path,
            word_dropout=0.01,
            min_freq=lattice_min_freq,
            only_train_min_freq=only_train_min_freq)
        embeddings['lattice'] = lattice_embedding

    vocabs['char'].index_dataset(*(datasets.values()),
                                 field_name='chars',
                                 new_field_name='chars')
    vocabs['bigram'].index_dataset(*(datasets.values()),
                                   field_name='bigrams',
                                   new_field_name='bigrams')
    vocabs['label'].index_dataset(*(datasets.values()),
                                  field_name='target',
                                  new_field_name='target')
    vocabs['lattice'].index_dataset(*(datasets.values()),
                                    field_name='lattice',
                                    new_field_name='lattice')
    vocabs['span_label'].index_dataset(*(datasets.values()),
                                       field_name='span_label',
                                       new_field_name='span_label')
    vocabs['attr_label'].index_dataset(*(datasets.values()),
                                       field_name='attr_start_label',
                                       new_field_name='attr_start_label')
    vocabs['attr_label'].index_dataset(*(datasets.values()),
                                       field_name='attr_end_label',
                                       new_field_name='attr_end_label')

    return datasets, vocabs, embeddings
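
# A toy sketch of how the 'lattice', 'pos_s' and 'pos_e' fields above are built
# (plain lists instead of fastNLP DataSet fields; the (start, end, word) matches
# below are hypothetical and would normally come from w_trie.get_lexicon):
def _demo_lattice_fields():
    chars = ['南', '京', '市', '长', '江', '大', '桥']
    lexicons = [(0, 1, '南京'), (0, 2, '南京市'), (3, 6, '长江大桥')]
    seq_len = len(chars)
    lattice = chars + [w for _, _, w in lexicons]                 # characters followed by matched words
    pos_s = list(range(seq_len)) + [s for s, _, _ in lexicons]    # start position of every lattice node
    pos_e = list(range(seq_len)) + [e for _, e, _ in lexicons]    # end position of every lattice node
    return lattice, pos_s, pos_e
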
def equip_chinese_ner_with_skip(datasets, vocabs, embeddings, w_list, word_embedding_path=None,
                                word_min_freq=1, only_train_min_freq=0):
    from utils_ import Trie, get_skip_path
    from functools import partial
    w_trie = Trie()
    for w in w_list:
        w_trie.insert(w)

    # for k,v in datasets.items():
    #     v.apply_field(partial(get_skip_path,w_trie=w_trie),'chars','skips')

    def skips2skips_l2r(chars, w_trie):
        '''
        Collect, for every character position, the lexicon words that end there.
        :param chars: list[str], the sentence characters
        :param w_trie: Trie built from the lexicon word list
        :return: skips_l2r, a list (indexed by end position) of [start, word] pairs
        '''
        # print(lexicons)
        # print('******')

        lexicons = get_skip_path(chars, w_trie=w_trie)

        # max_len = max(list(map(lambda x:max(x[:2]),lexicons)))+1 if len(lexicons) != 0 else 0

        result = [[] for _ in range(len(chars))]

        for lex in lexicons:
            s = lex[0]
            e = lex[1]
            w = lex[2]

            result[e].append([s, w])

        return result

    def skips2skips_r2l(chars, w_trie):
        '''
        Collect, for every character position, the lexicon words that start there.
        :param chars: list[str], the sentence characters
        :param w_trie: Trie built from the lexicon word list
        :return: skips_r2l, a list (indexed by start position) of [end, word] pairs
        '''
        # print(lexicons)
        # print('******')

        lexicons = get_skip_path(chars, w_trie=w_trie)

        # max_len = max(list(map(lambda x:max(x[:2]),lexicons)))+1 if len(lexicons) != 0 else 0

        result = [[] for _ in range(len(chars))]

        for lex in lexicons:
            s = lex[0]
            e = lex[1]
            w = lex[2]

            result[s].append([e, w])

        return result

    for k, v in datasets.items():
        v.apply_field(partial(skips2skips_l2r, w_trie=w_trie), 'chars', 'skips_l2r')

    for k, v in datasets.items():
        v.apply_field(partial(skips2skips_r2l, w_trie=w_trie), 'chars', 'skips_r2l')

    # print(v['skips_l2r'][0])
    word_vocab = Vocabulary()
    word_vocab.add_word_lst(w_list)
    vocabs['word'] = word_vocab
    for k, v in datasets.items():
        v.apply_field(lambda x: [list(map(lambda x: x[0], p)) for p in x], 'skips_l2r', 'skips_l2r_source')
        v.apply_field(lambda x: [list(map(lambda x: x[1], p)) for p in x], 'skips_l2r', 'skips_l2r_word')

    for k, v in datasets.items():
        v.apply_field(lambda x: [list(map(lambda x: x[0], p)) for p in x], 'skips_r2l', 'skips_r2l_source')
        v.apply_field(lambda x: [list(map(lambda x: x[1], p)) for p in x], 'skips_r2l', 'skips_r2l_word')

    for k, v in datasets.items():
        v.apply_field(lambda x: list(map(len, x)), 'skips_l2r_word', 'lexicon_count')
        v.apply_field(lambda x:
                      list(map(lambda y:
                               list(map(lambda z: word_vocab.to_index(z), y)), x)),
                      'skips_l2r_word', new_field_name='skips_l2r_word')

        v.apply_field(lambda x: list(map(len, x)), 'skips_r2l_word', 'lexicon_count_back')

        v.apply_field(lambda x:
                      list(map(lambda y:
                               list(map(lambda z: word_vocab.to_index(z), y)), x)),
                      'skips_r2l_word', new_field_name='skips_r2l_word')

    if word_embedding_path is not None:
        word_embedding = StaticEmbedding(word_vocab, word_embedding_path, word_dropout=0)
        embeddings['word'] = word_embedding

    vocabs['char'].index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                 field_name='chars', new_field_name='chars')
    vocabs['bigram'].index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                   field_name='bigrams', new_field_name='bigrams')
    vocabs['label'].index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                  field_name='target', new_field_name='target')

    return datasets, vocabs, embeddings
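
# A toy sketch of the skips_l2r structure built above: for each character index, the list
# of [start, word] pairs of lexicon words ending at that index (skips_r2l is symmetric,
# keyed by the start index instead). The matches below are hypothetical trie hits.
def _demo_skips_l2r():
    chars = ['南', '京', '市']
    lexicons = [(0, 1, '南京'), (0, 2, '南京市'), (1, 2, '京市')]
    skips_l2r = [[] for _ in range(len(chars))]
    for s, e, w in lexicons:
        skips_l2r[e].append([s, w])
    return skips_l2r  # [[], [[0, '南京']], [[0, '南京市'], [1, '京市']]]
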
Example 6
def load_msra_ner_1(path,
                    char_embedding_path=None,
                    bigram_embedding_path=None,
                    index_token=True,
                    train_clip=False,
                    char_min_freq=1,
                    bigram_min_freq=1,
                    only_train_min_freq=0):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams
    if train_clip:
        train_path = os.path.join(path, 'train_dev.char.bmes_clip1')
        test_path = os.path.join(path, 'test.char.bmes_clip1')
    else:
        train_path = os.path.join(path, 'train_dev.char.bmes')
        test_path = os.path.join(path, 'test.char.bmes')

    loader = ConllLoader(['chars', 'target'])
    train_bundle = loader.load(train_path)
    test_bundle = loader.load(test_path)

    datasets = dict()
    datasets['train'] = train_bundle.datasets['train']
    datasets['test'] = test_bundle.datasets['train']

    datasets['train'].apply_field(get_bigrams,
                                  field_name='chars',
                                  new_field_name='bigrams')
    datasets['test'].apply_field(get_bigrams,
                                 field_name='chars',
                                 new_field_name='bigrams')

    datasets['train'].add_seq_len('chars')
    datasets['test'].add_seq_len('chars')

    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()
    print(datasets.keys())
    # print('dev:', len(datasets['dev']))
    print('test:', len(datasets['test']))
    print('train:', len(datasets['train']))
    char_vocab.from_dataset(datasets['train'],
                            field_name='chars',
                            no_create_entry_dataset=[datasets['test']])
    bigram_vocab.from_dataset(datasets['train'],
                              field_name='bigrams',
                              no_create_entry_dataset=[datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    if index_token:
        char_vocab.index_dataset(datasets['train'],
                                 datasets['test'],
                                 field_name='chars',
                                 new_field_name='chars')
        bigram_vocab.index_dataset(datasets['train'],
                                   datasets['test'],
                                   field_name='bigrams',
                                   new_field_name='bigrams')
        label_vocab.index_dataset(datasets['train'],
                                  datasets['test'],
                                  field_name='target',
                                  new_field_name='target')

    vocabs = {}
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab

    embeddings = {}
    if char_embedding_path is not None:
        char_embedding = StaticEmbedding(
            char_vocab,
            char_embedding_path,
            word_dropout=0.01,
            min_freq=char_min_freq,
            only_train_min_freq=only_train_min_freq)
        embeddings['char'] = char_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(
            bigram_vocab,
            bigram_embedding_path,
            word_dropout=0.01,
            min_freq=bigram_min_freq,
            only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
Example 7
def load_weibo_ner(path,
                   unigram_embedding_path=None,
                   bigram_embedding_path=None,
                   index_token=True,
                   char_min_freq=1,
                   bigram_min_freq=1,
                   only_train_min_freq=0,
                   char_word_dropout=0.01):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    loader = ConllLoader(['chars', 'target'])

    train_path = os.path.join(path, 'weiboNER_2nd_conll.train')
    dev_path = os.path.join(path, 'weiboNER_2nd_conll.dev')
    test_path = os.path.join(path, 'weiboNER_2nd_conll.test')

    paths = {}
    paths['train'] = train_path
    paths['dev'] = dev_path
    paths['test'] = test_path

    datasets = {}

    for k, v in paths.items():
        bundle = loader.load(v)
        datasets[k] = bundle.datasets['train']

    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))
    # print(*list(datasets.keys()))
    vocabs = {}
    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()

    for k, v in datasets.items():
        # ignore the word segmentation tag
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    char_vocab.from_dataset(
        datasets['train'],
        field_name='chars',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target',-100)
        v.add_seq_len('chars', new_field_name='seq_len')

    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab

    bigram_vocab.from_dataset(
        datasets['train'],
        field_name='bigrams',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])

    if index_token:
        char_vocab.index_dataset(*list(datasets.values()),
                                 field_name='chars',
                                 new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()),
                                   field_name='bigrams',
                                   new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()),
                                  field_name='target',
                                  new_field_name='target')

    # for k,v in datasets.items():
    #     v.set_input('chars','bigrams','seq_len','target')
    #     v.set_target('target','seq_len')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}

    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(
            char_vocab,
            model_dir_or_name=unigram_embedding_path,
            word_dropout=char_word_dropout,
            min_freq=char_min_freq,
            only_train_min_freq=only_train_min_freq,
        )
        embeddings['char'] = unigram_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(
            bigram_vocab,
            model_dir_or_name=bigram_embedding_path,
            word_dropout=0.01,
            min_freq=bigram_min_freq,
            only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
Example 8
def equip_chinese_ner_with_lexicon(
        datasets,
        vocabs,
        embeddings,
        w_list,
        word_embedding_path=None,
        only_lexicon_in_train=False,
        word_char_mix_embedding_path=None,  # embedding file covering both characters and words
        number_normalized=False,
        lattice_min_freq=1,
        only_train_min_freq=0):
    from fastNLP.core import Vocabulary

    def normalize_char(inp):
        result = []
        for c in inp:
            if c.isdigit():
                result.append('0')
            else:
                result.append(c)

        return result

    def normalize_bigram(inp):
        result = []
        for bi in inp:
            tmp = bi
            if tmp[0].isdigit():
                tmp = '0' + tmp[1:]
            if tmp[1].isdigit():
                tmp = tmp[0] + '0'

            result.append(tmp)
        return result

    if number_normalized == 3:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(
            datasets['train'],
            field_name='chars',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(
            datasets['train'],
            field_name='bigrams',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

    if only_lexicon_in_train:
        print('only lexicon words that appear in the training set will be loaded')

    def get_skip_path(chars, w_trie):
        sentence = ''.join(chars)
        result = w_trie.get_lexicon(sentence)
        # print(result)

        return result

    from V0.utils_ import Trie
    from functools import partial
    from fastNLP.core import Vocabulary
    from fastNLP_module import StaticEmbedding
    w_trie = Trie()
    for w in w_list:
        w_trie.insert(w)

    if only_lexicon_in_train:
        lexicon_in_train = set()
        for chars in datasets['train']['chars']:
            lexicon_in_s = w_trie.get_lexicon(chars)
            for s, e, lexicon in lexicon_in_s:
                lexicon_in_train.add(''.join(lexicon))

        print('lexicon in train:{}'.format(len(lexicon_in_train)))
        print('i.e.: {}'.format(list(lexicon_in_train)[:10]))
        w_trie = Trie()
        for w in lexicon_in_train:
            w_trie.insert(w)

    import copy
    for k, v in datasets.items():
        v.apply_field(partial(get_skip_path, w_trie=w_trie), 'chars',
                      'lexicons')
        v.apply_field(copy.copy, 'chars', 'raw_chars')
        v.add_seq_len('lexicons', 'lex_num')
        v.apply_field(lambda x: list(map(lambda y: y[0], x)), 'lexicons',
                      'lex_s')
        v.apply_field(lambda x: list(map(lambda y: y[1], x)), 'lexicons',
                      'lex_e')

    if number_normalized == 1:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(
            datasets['train'],
            field_name='chars',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

    if number_normalized == 2:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(
            datasets['train'],
            field_name='chars',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(
            datasets['train'],
            field_name='bigrams',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

    def concat(ins):
        chars = ins['chars']
        lexicons = ins['lexicons']
        result = chars + list(map(lambda x: x[2], lexicons))
        # print('lexicons:{}'.format(lexicons))
        # print('lex_only:{}'.format(list(filter(lambda x:x[2],lexicons))))
        # print('result:{}'.format(result))
        return result

    def get_pos_s(ins):
        lex_s = ins['lex_s']
        seq_len = ins['seq_len']
        pos_s = list(range(seq_len)) + lex_s

        return pos_s

    def get_pos_e(ins):
        lex_e = ins['lex_e']
        seq_len = ins['seq_len']
        pos_e = list(range(seq_len)) + lex_e

        return pos_e

    for k, v in datasets.items():
        v.apply(concat, new_field_name='lattice')
        v.set_input('lattice')
        v.apply(get_pos_s, new_field_name='pos_s')
        v.apply(get_pos_e, new_field_name='pos_e')
        v.set_input('pos_s', 'pos_e')

    word_vocab = Vocabulary()
    word_vocab.add_word_lst(w_list)
    vocabs['word'] = word_vocab

    lattice_vocab = Vocabulary()
    lattice_vocab.from_dataset(datasets['train'],
                               field_name='lattice',
                               no_create_entry_dataset=[
                                   v for k, v in datasets.items()
                                   if k != 'train'
                               ])
    vocabs['lattice'] = lattice_vocab
    """
    1.word_embedding_path 这个参数到底是用做什么的?
    我将其设置成了 None。但是如果为None,那么embedding['word']没有了还可以吗?
    
    2.StaticEmbedding:
    给定预训练embedding的名称或路径,根据vocab从embedding中抽取相应的数据(只会将出现在vocab中的词抽取出来, 如果没有找到,则会随机初始化一个值(但如果该word是被标记为no_create_entry的话,则不会单独创建一个值,而是会被指向unk的index))
    """
    if word_embedding_path is not None:
        word_embedding = StaticEmbedding(word_vocab,
                                         word_embedding_path,
                                         word_dropout=0)
        embeddings['word'] = word_embedding

    if word_char_mix_embedding_path is not None:
        lattice_embedding = StaticEmbedding(
            lattice_vocab,
            word_char_mix_embedding_path,
            word_dropout=0.01,
            min_freq=lattice_min_freq,
            only_train_min_freq=only_train_min_freq)
        embeddings['lattice'] = lattice_embedding

    vocabs['char'].index_dataset(*(datasets.values()),
                                 field_name='chars',
                                 new_field_name='chars')
    vocabs['bigram'].index_dataset(*(datasets.values()),
                                   field_name='bigrams',
                                   new_field_name='bigrams')
    vocabs['label'].index_dataset(*(datasets.values()),
                                  field_name='target',
                                  new_field_name='target')
    vocabs['lattice'].index_dataset(*(datasets.values()),
                                    field_name='lattice',
                                    new_field_name='lattice')

    return datasets, vocabs, embeddings
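
# A hedged sketch of the StaticEmbedding behaviour described in the docstring above, using
# fastNLP's standard StaticEmbedding (the project's fastNLP_module wrapper may differ slightly):
def _demo_static_embedding():
    import torch
    from fastNLP import Vocabulary
    from fastNLP.embeddings import StaticEmbedding

    vocab = Vocabulary()
    vocab.add_word_lst(['南', '京', '市'])
    vocab.add_word('桥', no_create_entry=True)   # dev/test-only token, treated like an OOV entry
    # With model_dir_or_name=None a random embedding of the given dimension is created;
    # pass a pretrained embedding path instead to load real vectors.
    emb = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=50)
    ids = torch.LongTensor([[vocab.to_index('南'), vocab.to_index('桥')]])
    print(emb(ids).shape)                        # torch.Size([1, 2, 50])
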
Example 9
def load_weibo_ner(path,
                   unigram_embedding_path=None,
                   bigram_embedding_path=None,
                   index_token=True,
                   char_min_freq=1,
                   bigram_min_freq=1,
                   only_train_min_freq=0,
                   char_word_dropout=0.01):
    """
    加载微博语料,并缓存数据
    :param path: 微博数据集的路径,文件夹
    :param unigram_embedding_path: 如果存在,那么使用unigram embedding
    :param bigram_embedding_path: 如果存在,使用bigram embedding,可以和unigram同时使用
    :param index_token:
    :param char_min_freq:
    :param bigram_min_freq:
    :param only_train_min_freq:
    :param char_word_dropout:
    :return:
    """
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams
    # set the dataset headers to chars and target: the first column read is chars, the second target
    loader = ConllLoader(headers=['chars', 'target'])
    # bundle = loader.load(path)
    #
    # datasets = bundle.datasets

    # print(datasets['train'][:5])

    train_path = os.path.join(path, 'train.conll')
    dev_path = os.path.join(path, 'dev.conll')
    test_path = os.path.join(path, 'test.conll')

    paths = {}
    paths['train'] = train_path
    paths['dev'] = dev_path
    paths['test'] = test_path

    datasets = {}

    for k, v in paths.items():
        bundle = loader.load(v)
        datasets[k] = bundle.datasets['train']

    print("样本集的样本个数信息如下:")
    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))
    # print(*list(datasets.keys()))
    vocabs = {}
    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()

    for k, v in datasets.items():
        # ignore the word segmentation tag
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    char_vocab.from_dataset(
        datasets['train'],
        field_name='chars',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target',-100)
        v.add_seq_len('chars', new_field_name='seq_len')

    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab

    bigram_vocab.from_dataset(
        datasets['train'],
        field_name='bigrams',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])
    if index_token:
        char_vocab.index_dataset(*list(datasets.values()),
                                 field_name='chars',
                                 new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()),
                                   field_name='bigrams',
                                   new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()),
                                  field_name='target',
                                  new_field_name='target')

    # for k,v in datasets.items():
    #     v.set_input('chars','bigrams','seq_len','target')
    #     v.set_target('target','seq_len')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}

    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(
            char_vocab,
            model_dir_or_name=unigram_embedding_path,
            word_dropout=char_word_dropout,
            min_freq=char_min_freq,
            only_train_min_freq=only_train_min_freq,
        )
        embeddings['char'] = unigram_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(
            bigram_vocab,
            model_dir_or_name=bigram_embedding_path,
            word_dropout=0.01,
            min_freq=bigram_min_freq,
            only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
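
# A hedged sketch of the two-column CoNLL layout that ConllLoader(['chars', 'target']) expects
# (one character and one tag per line, blank line between sentences). The sample content and
# temp-file handling are illustrative only:
def _demo_conll_format():
    import tempfile
    from fastNLP.io.loader import ConllLoader
    sample = "科\tO\n技\tO\n\n小\tB-ORG.NAM\n米\tI-ORG.NAM\n"
    with tempfile.NamedTemporaryFile('w', suffix='.conll', delete=False, encoding='utf-8') as f:
        f.write(sample)
        tmp_path = f.name
    bundle = ConllLoader(['chars', 'target']).load(tmp_path)
    print(bundle.datasets['train'][0])   # chars: ['科', '技'], target: ['O', 'O']
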
Example 10
def load_tianchi_ner(
        path,
        unigram_embedding_path=None,  # yangjie_rich_pretrain_unigram_path
        bigram_embedding_path=None,  # yangjie_rich_pretrain_bigram_path
        index_token=True,
        char_min_freq=1,
        bigram_min_freq=1,
        only_train_min_freq=0,
        char_word_dropout=0.01):

    # step 0. ============================= prepare the data paths etc.
    loader = ConllLoader(['chars', 'target'])
    train_path = os.path.join(path, 'tianchi.train')
    dev_path = os.path.join(path, 'tianchi.dev')
    test_path = os.path.join(path, 'tianchi.test')

    paths = {}
    paths['dev'] = dev_path
    paths['train'] = train_path
    paths['test'] = test_path

    # step 1. ============================= build the datasets
    datasets = {}  # a plain dict; note that every value in datasets is a fastNLP DataSet instance
    for k, v in paths.items():
        bundle = loader.load(v)
        datasets[k] = bundle.datasets['train']

    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))

    # step 2. ============================= build the vocabularies from the datasets
    vocabs = {}
    # see the fastNLP Vocabulary tutorial for usage details:
    # URL: https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_2_vocabulary.html
    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()

    # datasets has exactly three entries: train, dev and test
    for item in datasets.items():
        print(item)

    for k, v in datasets.items():
        # ignore the word segmentation tag
        # apply_field() is a fastNLP DataSet method; field_name and new_field_name are both
        # 'chars' here, so the column is rewritten in place rather than a new one being added.
        # The second call builds a new 'bigrams' column from 'chars' via get_bigrams.
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')  # join each character with its successor, i.e. bigrams

    # datasets['train'] is a DataSet instance.
    # no_create_entry_dataset lets the vocabulary see dev/test tokens without creating trainable
    # entries for them, which tends to improve the final results.
    # Build the vocabularies from the training data.
    char_vocab.from_dataset(
        datasets['train'],
        field_name='chars',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])
    bigram_vocab.from_dataset(
        datasets['train'],
        field_name='bigrams',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    #char_vocab.from_dataset(datasets['train'],field_name='chars',no_create_entry_dataset=datasets['dev'])
    #bigram_vocab.from_dataset(datasets['train'],field_name='bigrams',no_create_entry_dataset=datasets['dev'])
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # apply len() to each element of 'chars' and store the result in the new field 'seq_len'
        v.add_seq_len('chars', new_field_name='seq_len')

    # optionally convert every column to vocabulary indices;
    # *list(datasets.values()) unpacks the three DataSets as positional arguments
    if index_token:
        char_vocab.index_dataset(*list(datasets.values()),
                                 field_name='chars',
                                 new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()),
                                   field_name='bigrams',
                                   new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()),
                                  field_name='target',
                                  new_field_name='target')

    # vocabs is built the same way as datasets:
    # a plain dict whose keys map to the different vocabularies
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab

    # step 3. ============================= build the embeddings
    '''A few open questions:
    01. Doesn't a pretrained (static) embedding lose contextual information? Why is it still used here?
    02. How does this embedding differ from the BertEmbedding used later?
    03. See the StaticEmbedding documentation for what it actually does.
    '''
    embeddings = {}
    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(
            char_vocab,
            model_dir_or_name=unigram_embedding_path,
            word_dropout=char_word_dropout,
            min_freq=char_min_freq,
            only_train_min_freq=only_train_min_freq)
        # unigram_embedding is a StaticEmbedding instance
        embeddings['char'] = unigram_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(
            bigram_vocab,
            model_dir_or_name=bigram_embedding_path,
            word_dropout=0.01,
            min_freq=bigram_min_freq,
            only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
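
# A hedged mini example of the Vocabulary workflow used above (from_dataset with
# no_create_entry_dataset, then index_dataset), on throwaway DataSets:
def _demo_vocabulary_indexing():
    from fastNLP import DataSet, Vocabulary
    train = DataSet({'chars': [['南', '京'], ['大', '桥']]})
    dev = DataSet({'chars': [['南', '桥', '市']]})   # '市' only occurs in dev
    vocab = Vocabulary()
    # '市' still gets an index but is flagged as no_create_entry (treated like an OOV token)
    vocab.from_dataset(train, field_name='chars', no_create_entry_dataset=[dev])
    vocab.index_dataset(train, dev, field_name='chars', new_field_name='chars')
    print(train[0]['chars'], dev[0]['chars'])
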
Example 11
def load_weibo_ner(path,
                   unigram_embedding_path=None,
                   bigram_embedding_path=None,
                   index_token=True,
                   char_min_freq=1,
                   bigram_min_freq=1,
                   only_train_min_freq=0,
                   char_word_dropout=0.01):

    # step 0. ============================= prepare the data paths etc.
    loader = ConllLoader(['chars', 'target'])
    train_path = os.path.join(path, 'weiboNER_2nd_conll.train_deseg')
    dev_path = os.path.join(path, 'weiboNER_2nd_conll.dev_deseg')
    test_path = os.path.join(path, 'weiboNER_2nd_conll.test_deseg')

    paths = {}
    paths['train'] = train_path
    paths['dev'] = dev_path
    paths['test'] = test_path

    # step 1. ============================= build the datasets
    datasets = {}  # a plain dict; every value in datasets is a fastNLP DataSet instance
    for k, v in paths.items():
        bundle = loader.load(v)
        # 'train' is used as the key here because loading a single file always stores the
        # resulting DataSet under bundle.datasets['train']
        datasets[k] = bundle.datasets['train']

    trainData = datasets['train']

    print(type(trainData))  # <class 'fastNLP.core.dataset.DataSet'>
    print(len(trainData))  # 1350
    print(trainData)
    """
    The data in datasets['train'] looks like this:
        +-----------------------------------------------------------+-----------------------------------------------------------+
        | chars                                                     | target                                                    |
        +-----------------------------------------------------------+-----------------------------------------------------------+
        | ['科', '技', '全', '方', '位', '资', '讯', '智', '能',...  | ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'... |
        | ['对', ',', '输', '给', '一', '个', '女', '人', ',',...  | ['O', 'O', 'O', 'O', 'O', 'O', 'B-PER.NOM', 'I-PER.NOM... |
        | ['今', '天', '下', '午', '起', '来', '看', '到', '外',...  | ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'... |
        | ['今', '年', '拜', '年', '不', '短', '信', ',', '就',...  | ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'... |
        | ['浑', '身', '酸', '疼', ',', '两', '腿', '无', '力',...  | ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'... |
        | ['明', '显', '紧', '张', '状', '态', '没', '出', '来',...  | ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'... |
        | ['三', '十', '年', '前', ',', '老', '爹', '带', '我',...  | ['O', 'O', 'O', 'O', 'O', 'B-PER.NOM', 'I-PER.NOM', 'O... |
        | ['好', '活', '动', '呀', ',', '给', '力', '的', '商',...  | ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'... |
        | ['人', '生', '如', '戏', ',', '导', '演', '是', '自',...  | ['O', 'O', 'O', 'O', 'O', 'B-PER.NOM', 'I-PER.NOM', 'O... |
        | ['听', '说', '小', '米', '开', '卖', '了', ',', '刚',...  | ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'... |
        | ...                                                       | ...                                                       |
        +-----------------------------------------------------------+-----------------------------------------------------------+

        This is the DataSet type from Fudan University's open-source fastNLP toolkit; see the detailed
        documentation at https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_1_data_preprocess.html

    """

    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))
    # print(*list(datasets.keys()))

    # step 2. ============================= build the vocabularies from the datasets
    vocabs = {}
    # see the fastNLP Vocabulary tutorial for usage details:
    # URL: https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_2_vocabulary.html
    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()

    # datasets has exactly three entries: train, dev and test
    for item in datasets.items():
        print(item)

    for k, v in datasets.items():
        # ignore the word segmentation tag
        # apply_field() is a fastNLP DataSet method; since field_name and new_field_name are both
        # 'chars', the column is rewritten in place instead of a new one being added.
        # The second call builds a new 'bigrams' column from 'chars' via get_bigrams.
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')  # join each character with its successor, i.e. bigrams

    # datasets['train'] is a DataSet instance
    char_vocab.from_dataset(
        datasets['train'],
        field_name='chars',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target',-100)
        v.add_seq_len('chars', new_field_name='seq_len')

    bigram_vocab.from_dataset(
        datasets['train'],
        field_name='bigrams',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])
    if index_token:
        char_vocab.index_dataset(*list(datasets.values()),
                                 field_name='chars',
                                 new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()),
                                   field_name='bigrams',
                                   new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()),
                                  field_name='target',
                                  new_field_name='target')

    # vocabs is built the same way as datasets:
    # a plain dict whose keys map to the different vocabularies
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab

    # step 3. ============================= build the embeddings
    embeddings = {}
    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(
            char_vocab,
            model_dir_or_name=unigram_embedding_path,
            word_dropout=char_word_dropout,
            min_freq=char_min_freq,
            only_train_min_freq=only_train_min_freq,
        )
        embeddings['char'] = unigram_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(
            bigram_vocab,
            model_dir_or_name=bigram_embedding_path,
            word_dropout=0.01,
            min_freq=bigram_min_freq,
            only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
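
# A hedged illustration of the 'ignore the word segmentation tag' step above: in the raw
# weiboNER files each token is a character followed by a segmentation digit (e.g. '科0'),
# and the lambda keeps only the character itself (the *_deseg files ship already stripped,
# so the operation is a harmless no-op there):
def _demo_strip_seg_tag():
    raw = ['科0', '技1', '全0']
    chars = [w[0] for w in raw]
    return chars   # ['科', '技', '全']
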
Example 12
def load_aicup_ner(
    path,
    unigram_embedding_path=None,
    bigram_embedding_path=None,
    char_word_dropout=0.01,
    only_train_min_freq=0,
    bigram_min_freq=1,
    data_type='default',
    index_token=True,
    char_min_freq=1,
    cv=False,
    fold=0,
    ):
    vocabs = {}
    embeddings = {}

    train_path = os.path.join(path, f'fold{fold}', f'train/{data_type}')
    dev_path = os.path.join(path, f'fold{fold}', f'dev/{data_type}')
    print('-----------------Dataset---------------------')
    print('loading data from', train_path,'\nand', dev_path)
    loader = ConllLoader(['chars', 'target'])

    train = loader.load(train_path)
    dev = loader.load(dev_path)
 
    ds = {
        'train': train.datasets['train'],
        'dev': dev.datasets['train'],
    }
    ds['aicup_dev'] = get_aicup_devds()
    
    # jieba.enable_paddle()   
    for ds_name in ds.keys():
        ds[ds_name].apply_field(get_bigrams, 'chars', 'bigrams')
        ds[ds_name].add_seq_len('chars', new_field_name='seq_len')

        ds[ds_name].apply_field(get_pos_tag, 'chars', 'pos_tag')

    for k, v in ds.items():
        print('{}:{}'.format(k, len(v)))

    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    pos_vocab = Vocabulary()

    label_vocab = get_label_vocab(data_type)
    pos_vocab.from_dataset(*list(ds.values()), field_name='pos_tag')

    if cv:
        no_create_entry_ds = [ds['dev'], ds['aicup_dev']]
    else:
        # assumes a 'test' split has also been loaded into ds when not cross-validating
        no_create_entry_ds = [ds['dev'], ds['test'], ds['aicup_dev']]
        
    char_vocab.from_dataset(
        ds['train'],
        field_name='chars',
        no_create_entry_dataset=no_create_entry_ds
    )
    bigram_vocab.from_dataset(
        ds['train'],
        field_name='bigrams',
        no_create_entry_dataset=no_create_entry_ds
    )
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab
    vocabs['pos_tag'] = pos_vocab
    
    if index_token:
        char_vocab.index_dataset(*list(ds.values()), field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(*list(ds.values()), field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(ds['train'], ds['dev'], field_name='target', new_field_name='target')

    pos_vocab.index_dataset(*list(ds.values()), field_name='pos_tag', new_field_name='pos_tag')
    
    unigram_embedding = StaticEmbedding(
        char_vocab, 
        model_dir_or_name=unigram_embedding_path,                          
        word_dropout=char_word_dropout,                                      
        min_freq=char_min_freq,
        only_train_min_freq=only_train_min_freq,
    )
    bigram_embedding = StaticEmbedding(
        bigram_vocab,
        model_dir_or_name=bigram_embedding_path,
        word_dropout=0.01,
        min_freq=bigram_min_freq,
        only_train_min_freq=only_train_min_freq,
    )
    embeddings['char'] = unigram_embedding
    embeddings['bigram'] = bigram_embedding
    print(ds['train'])
    # print the set of entity types found in the label vocabulary (labels look like 'B-TYPE')
    print(set(ele[0].split('-')[1] if ele[0] != 'O' and ele[0][0] != '<' else ele[0]
              for ele in list(label_vocab)))
    print('------------------------------------------')
    return ds, vocabs, embeddings