Example 1
def load_resume_ner(path, char_embedding_path=None, bigram_embedding_path=None, index_token=True,
                    char_min_freq=1, bigram_min_freq=1, only_train_min_freq=0, train_dataset_rate=1.0):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    train_dataset_rate_suffix = '' if train_dataset_rate == 1.0 else f'_{train_dataset_rate}'
    train_path = os.path.join(path, f'train.char.bmoes{train_dataset_rate_suffix}')
    dev_path = os.path.join(path, 'dev.char.bmoes')
    test_path = os.path.join(path, 'test.char.bmoes')

    print(f"load train dataset: {train_path}")
    loader = ConllLoader(['chars', 'target'])
    train_bundle = loader.load(train_path)
    dev_bundle = loader.load(dev_path)
    test_bundle = loader.load(test_path)

    datasets = dict()
    datasets['train'] = train_bundle.datasets['train']
    datasets['dev'] = dev_bundle.datasets['train']
    datasets['test'] = test_bundle.datasets['train']

    datasets['train'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['dev'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['test'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')

    datasets['train'].add_seq_len('chars')
    datasets['dev'].add_seq_len('chars')
    datasets['test'].add_seq_len('chars')

    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()
    print(datasets.keys())
    print("dev:", len(datasets['dev']))
    print("test:", len(datasets['test']))
    print("train:", len(datasets['train']))
    char_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['dev'], datasets['test']])
    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    if index_token:
        char_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                 field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                   field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                  field_name='target', new_field_name='target')

    vocabs = {}
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab
    # TODO: add span_label, attr_start, attr_end
    datasets, vocabs = input_with_span_attr(datasets, vocabs)

    embeddings = {}
    if char_embedding_path is not None:
        char_embedding = StaticEmbedding(char_vocab, char_embedding_path, word_dropout=0.01,
                                         min_freq=char_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['char'] = char_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, bigram_embedding_path, word_dropout=0.01,
                                           min_freq=bigram_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
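
A minimal, hypothetical call site for this loader (the directory and embedding paths below are placeholders, not files from the original project; the snippet also assumes module-level imports of os, Vocabulary, StaticEmbedding and the project-specific input_with_span_attr helper):

datasets, vocabs, embeddings = load_resume_ner(
    path='data/ResumeNER',                          # directory holding {train,dev,test}.char.bmoes
    char_embedding_path='emb/char_vectors.txt',     # optional pretrained unigram vectors (placeholder path)
    bigram_embedding_path='emb/bigram_vectors.txt', # optional pretrained bigram vectors (placeholder path)
    index_token=True,
    train_dataset_rate=1.0,
)
print(len(datasets['train']), len(vocabs['char']), 'char' in embeddings)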
Example 2
def load_toy_ner(path, char_embedding_path=None, bigram_embedding_path=None, index_token=True, train_clip=False):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    train_path = os.path.join(path, 'toy_train.bmoes')
    dev_path = os.path.join(path, 'toy_dev.bmoes')
    test_path = os.path.join(path, 'toy_test.bmoes')

    loader = ConllLoader(['chars', 'target'])
    train_bundle = loader.load(train_path)
    dev_bundle = loader.load(dev_path)
    test_bundle = loader.load(test_path)

    datasets = dict()
    datasets['train'] = train_bundle.datasets['train']
    datasets['dev'] = dev_bundle.datasets['train']
    datasets['test'] = test_bundle.datasets['train']

    datasets['train'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['dev'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['test'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')

    datasets['train'].add_seq_len('chars')
    datasets['dev'].add_seq_len('chars')
    datasets['test'].add_seq_len('chars')

    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary(padding=None, unknown=None)
    print(datasets.keys())
    print("dev:", len(datasets['dev']))
    print("test:", len(datasets['test']))
    print("train:", len(datasets['train']))
    char_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['dev'], datasets['test']])
    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    if index_token:
        char_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                 field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                   field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                  field_name='target', new_field_name='target')

    vocabs = {}
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab
    # TODO: add span_label, attr_start, attr_end
    datasets, vocabs = input_with_span_attr(datasets, vocabs)

    embeddings = {}
    if char_embedding_path is not None:
        char_embedding = StaticEmbedding(char_vocab, char_embedding_path, word_dropout=0.01, )
        embeddings['char'] = char_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, bigram_embedding_path, word_dropout=0.01)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
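
The from_dataset / index_dataset pattern repeated in every loader can be seen in isolation in the following sketch (a toy DataSet with made-up content, using fastNLP's public API):

from fastNLP import DataSet, Vocabulary

# Toy illustration of the vocabulary-building pattern used throughout these loaders.
ds = DataSet({'chars': [['北', '京'], ['你', '好', '吗']]})
vocab = Vocabulary()
vocab.from_dataset(ds, field_name='chars')                            # build the vocab from the field
vocab.index_dataset(ds, field_name='chars', new_field_name='chars')   # replace tokens with their indices
print(ds[0]['chars'])  # now a list of integer indices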
Example 3
def load_weibo_ner(path, unigram_embedding_path=None, bigram_embedding_path=None, index_token=True,
                   char_min_freq=1, bigram_min_freq=1, only_train_min_freq=0, char_word_dropout=0.01, train_dataset_rate=1.0):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    loader = ConllLoader(['chars', 'target'])
    # bundle = loader.load(path)
    #
    # datasets = bundle.datasets

    # print(datasets['train'][:5])

    train_dataset_rate_suffix = '' if train_dataset_rate == 1.0 else f'_{train_dataset_rate}'
    train_path = os.path.join(path, f'train.char.bmoes{train_dataset_rate_suffix}')
    print(f"load train dataset: {train_path}")
    dev_path = os.path.join(path, 'dev.char.bmoes')
    test_path = os.path.join(path, 'test.char.bmoes')

    paths = {}
    paths['train'] = train_path
    paths['dev'] = dev_path
    paths['test'] = test_path

    datasets = {}

    for k, v in paths.items():
        bundle = loader.load(v)
        datasets[k] = bundle.datasets['train']

    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))
    # print(*list(datasets.keys()))
    vocabs = {}
    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()
    # TODO: add span_label, attr_start, attr_end
    datasets, vocabs = input_with_span_attr(datasets, vocabs)
    print(datasets.keys())
    print("dev:", len(datasets['dev']))
    print("test:", len(datasets['test']))
    print("train:", len(datasets['train']))

    for k, v in datasets.items():
        # ignore the word segmentation tag
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    char_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target',-100)
        v.add_seq_len('chars', new_field_name='seq_len')

    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab

    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'], datasets['test']])
    if index_token:
        char_vocab.index_dataset(*list(datasets.values()), field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()), field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()), field_name='target', new_field_name='target')

    # for k,v in datasets.items():
    #     v.set_input('chars','bigrams','seq_len','target')
    #     v.set_target('target','seq_len')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}

    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(char_vocab, model_dir_or_name=unigram_embedding_path,
                                            word_dropout=char_word_dropout,
                                            min_freq=char_min_freq, only_train_min_freq=only_train_min_freq, )
        embeddings['char'] = unigram_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, model_dir_or_name=bigram_embedding_path,
                                           word_dropout=0.01,
                                           min_freq=bigram_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    print(f"train: {len(datasets['train'])}; dev: {len(datasets['dev'])}; test: {len(datasets['test'])}")
    return datasets, vocabs, embeddings
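
All of these loaders import get_bigrams from utils, which is not shown here. A plausible minimal implementation (an assumption inferred from how the 'bigrams' field is used, not the original helper) is:

def get_bigrams(chars):
    # Pair each character with its successor; the last character is paired with a sentinel token.
    return [chars[i] + (chars[i + 1] if i + 1 < len(chars) else '<end>')
            for i in range(len(chars))]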
Example 4
def load_weibo_ner(path,
                   unigram_embedding_path=None,
                   bigram_embedding_path=None,
                   index_token=True,
                   normalize={
                       'char': True,
                       'bigram': True,
                       'word': False
                   }):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    loader = ConllLoader(['chars', 'target'])
    bundle = loader.load(path)

    datasets = bundle.datasets
    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))
    # print(*list(datasets.keys()))
    vocabs = {}
    word_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary(padding=None, unknown=None)

    for k, v in datasets.items():
        # ignore the word segmentation tag
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    word_vocab.from_dataset(
        datasets['train'],
        field_name='chars',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target',-100)
        v.add_seq_len('chars', new_field_name='seq_len')

    vocabs['char'] = word_vocab
    vocabs['label'] = label_vocab

    bigram_vocab.from_dataset(
        datasets['train'],
        field_name='bigrams',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])
    if index_token:
        word_vocab.index_dataset(*list(datasets.values()),
                                 field_name='chars',
                                 new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()),
                                   field_name='bigrams',
                                   new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()),
                                  field_name='target',
                                  new_field_name='target')

    # for k,v in datasets.items():
    #     v.set_input('chars','bigrams','seq_len','target')
    #     v.set_target('target','seq_len')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}

    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(
            word_vocab,
            model_dir_or_name=unigram_embedding_path,
            word_dropout=0.01,
            normalize=normalize['char'])
        embeddings['char'] = unigram_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(
            bigram_vocab,
            model_dir_or_name=bigram_embedding_path,
            word_dropout=0.01,
            normalize=normalize['bigram'])
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
Example 5
def load_resume_ner(path,
                    char_embedding_path=None,
                    bigram_embedding_path=None,
                    index_token=True,
                    normalize={
                        'char': True,
                        'bigram': True,
                        'word': False
                    }):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    train_path = os.path.join(path, 'train.char.bmes')
    dev_path = os.path.join(path, 'dev.char.bmes')
    test_path = os.path.join(path, 'test.char.bmes')

    loader = ConllLoader(['chars', 'target'])
    train_bundle = loader.load(train_path)
    dev_bundle = loader.load(dev_path)
    test_bundle = loader.load(test_path)

    datasets = dict()
    datasets['train'] = train_bundle.datasets['train']
    datasets['dev'] = dev_bundle.datasets['train']
    datasets['test'] = test_bundle.datasets['train']

    datasets['train'].apply_field(get_bigrams,
                                  field_name='chars',
                                  new_field_name='bigrams')
    datasets['dev'].apply_field(get_bigrams,
                                field_name='chars',
                                new_field_name='bigrams')
    datasets['test'].apply_field(get_bigrams,
                                 field_name='chars',
                                 new_field_name='bigrams')

    datasets['train'].add_seq_len('chars')
    datasets['dev'].add_seq_len('chars')
    datasets['test'].add_seq_len('chars')

    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary(padding=None, unknown=None)
    print(datasets.keys())
    print(len(datasets['dev']))
    print(len(datasets['test']))
    print(len(datasets['train']))
    char_vocab.from_dataset(
        datasets['train'],
        field_name='chars',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])
    bigram_vocab.from_dataset(
        datasets['train'],
        field_name='bigrams',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    if index_token:
        char_vocab.index_dataset(datasets['train'],
                                 datasets['dev'],
                                 datasets['test'],
                                 field_name='chars',
                                 new_field_name='chars')
        bigram_vocab.index_dataset(datasets['train'],
                                   datasets['dev'],
                                   datasets['test'],
                                   field_name='bigrams',
                                   new_field_name='bigrams')
        label_vocab.index_dataset(datasets['train'],
                                  datasets['dev'],
                                  datasets['test'],
                                  field_name='target',
                                  new_field_name='target')

    vocabs = {}
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab

    embeddings = {}
    if char_embedding_path is not None:
        char_embedding = StaticEmbedding(char_vocab,
                                         char_embedding_path,
                                         word_dropout=0.01,
                                         normalize=normalize['char'])
        embeddings['char'] = char_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab,
                                           bigram_embedding_path,
                                           word_dropout=0.01,
                                           normalize=normalize['bigram'])
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
Example 6
def load_conllized_ontonote_POS(path, embedding_path=None):
    from fastNLP.io.loader import ConllLoader
    header2index = {'words': 3, 'POS': 4, 'NER': 10}
    headers = ['words', 'POS']

    if 'NER' in headers:
        print(
            'Warning: the NER labels read by load_conllized_ontonote are not in BIOES format; '
            'they are raw CoNLL format and therefore wrong!'
        )
    indexes = list(map(lambda x: header2index[x], headers))

    loader = ConllLoader(headers, indexes)

    bundle = loader.load(path)

    # print(bundle.datasets)

    train_set = bundle.datasets['train']
    dev_set = bundle.datasets['dev']
    test_set = bundle.datasets['test']

    # train_set = loader.load(os.path.join(path,'train.txt'))
    # dev_set = loader.load(os.path.join(path, 'dev.txt'))
    # test_set = loader.load(os.path.join(path, 'test.txt'))

    # print(len(train_set))

    train_set.add_seq_len('words', 'seq_len')
    dev_set.add_seq_len('words', 'seq_len')
    test_set.add_seq_len('words', 'seq_len')

    # print(dataset['POS'])

    vocab = Vocabulary(min_freq=1)
    vocab.from_dataset(train_set, field_name='words')
    vocab.from_dataset(dev_set, field_name='words')
    vocab.from_dataset(test_set, field_name='words')

    vocab.index_dataset(train_set, field_name='words')
    vocab.index_dataset(dev_set, field_name='words')
    vocab.index_dataset(test_set, field_name='words')

    label_vocab_dict = {}

    for i, h in enumerate(headers):
        if h == 'words':
            continue
        label_vocab_dict[h] = Vocabulary(min_freq=1,
                                         padding=None,
                                         unknown=None)
        label_vocab_dict[h].from_dataset(train_set, field_name=h)

        label_vocab_dict[h].index_dataset(train_set, field_name=h)
        label_vocab_dict[h].index_dataset(dev_set, field_name=h)
        label_vocab_dict[h].index_dataset(test_set, field_name=h)

    train_set.set_input(Const.INPUT, Const.INPUT_LEN)
    train_set.set_target(headers[1])

    dev_set.set_input(Const.INPUT, Const.INPUT_LEN)
    dev_set.set_target(headers[1])

    test_set.set_input(Const.INPUT, Const.INPUT_LEN)
    test_set.set_target(headers[1])

    if len(headers) > 2:
        print('Warning: since there is more than one task, the target must be set manually each time!')

    print('train:', len(train_set), 'dev:', len(dev_set), 'test:',
          len(test_set))

    if embedding_path is not None:
        pretrained_embedding = load_word_emb(embedding_path, 300, vocab)
        return (train_set, dev_set,
                test_set), (vocab, label_vocab_dict), pretrained_embedding
    else:
        return (train_set, dev_set, test_set), (vocab, label_vocab_dict)
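
A hypothetical call site for this POS loader (the directory is a placeholder; the returned label_vocab_dict holds one Vocabulary per non-word header, here just 'POS'):

(train_set, dev_set, test_set), (word_vocab, label_vocab_dict) = \
    load_conllized_ontonote_POS('data/ontonotes_conll')
print(len(train_set), len(word_vocab), list(label_vocab_dict.keys()))  # e.g. ['POS']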
Example 7
def load_weibo_ner_old(path,
                       unigram_embedding_path=None,
                       bigram_embedding_path=None,
                       index_token=True,
                       normalize={
                           'char': True,
                           'bigram': True,
                           'word': False
                       }):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    loader = ConllLoader(['chars', 'target'])
    # from fastNLP.io.file_reader import _read_conll
    # from fastNLP.core import Instance,DataSet
    # def _load(path):
    #     ds = DataSet()
    #     for idx, data in _read_conll(path, indexes=loader.indexes, dropna=loader.dropna,
    #                                 encoding='ISO-8859-1'):
    #         ins = {h: data[i] for i, h in enumerate(loader.headers)}
    #         ds.append(Instance(**ins))
    #     return ds
    # from fastNLP.io.utils import check_loader_paths
    # paths = check_loader_paths(path)
    # datasets = {name: _load(path) for name, path in paths.items()}
    datasets = {}
    train_path = os.path.join(path, 'train.all.bmes')
    dev_path = os.path.join(path, 'dev.all.bmes')
    test_path = os.path.join(path, 'test.all.bmes')
    datasets['train'] = loader.load(train_path).datasets['train']
    datasets['dev'] = loader.load(dev_path).datasets['train']
    datasets['test'] = loader.load(test_path).datasets['train']

    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))

    vocabs = {}
    word_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary(padding=None, unknown=None)

    for k, v in datasets.items():
        # ignore the word segmentation tag
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    word_vocab.from_dataset(
        datasets['train'],
        field_name='chars',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target',-100)
        v.add_seq_len('chars', new_field_name='seq_len')

    vocabs['char'] = word_vocab
    vocabs['label'] = label_vocab

    bigram_vocab.from_dataset(
        datasets['train'],
        field_name='bigrams',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])
    if index_token:
        word_vocab.index_dataset(*list(datasets.values()),
                                 field_name='chars',
                                 new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()),
                                   field_name='bigrams',
                                   new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()),
                                  field_name='target',
                                  new_field_name='target')

    # for k,v in datasets.items():
    #     v.set_input('chars','bigrams','seq_len','target')
    #     v.set_target('target','seq_len')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}

    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(
            word_vocab,
            model_dir_or_name=unigram_embedding_path,
            word_dropout=0.01,
            normalize=normalize['char'])
        embeddings['char'] = unigram_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(
            bigram_vocab,
            model_dir_or_name=bigram_embedding_path,
            word_dropout=0.01,
            normalize=normalize['bigram'])
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
Example 8
def load_msra_ner_1(path,
                    char_embedding_path=None,
                    bigram_embedding_path=None,
                    index_token=True,
                    train_clip=False,
                    char_min_freq=1,
                    bigram_min_freq=1,
                    only_train_min_freq=0):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams
    if train_clip:
        train_path = os.path.join(path, 'train_dev.char.bmes_clip1')
        test_path = os.path.join(path, 'test.char.bmes_clip1')
    else:
        train_path = os.path.join(path, 'train_dev.char.bmes')
        test_path = os.path.join(path, 'test.char.bmes')

    loader = ConllLoader(['chars', 'target'])
    train_bundle = loader.load(train_path)
    test_bundle = loader.load(test_path)

    datasets = dict()
    datasets['train'] = train_bundle.datasets['train']
    datasets['test'] = test_bundle.datasets['train']

    datasets['train'].apply_field(get_bigrams,
                                  field_name='chars',
                                  new_field_name='bigrams')
    datasets['test'].apply_field(get_bigrams,
                                 field_name='chars',
                                 new_field_name='bigrams')

    datasets['train'].add_seq_len('chars')
    datasets['test'].add_seq_len('chars')

    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()
    print(datasets.keys())
    # print(len(datasets['dev']))
    print(len(datasets['test']))
    print(len(datasets['train']))
    char_vocab.from_dataset(datasets['train'],
                            field_name='chars',
                            no_create_entry_dataset=[datasets['test']])
    bigram_vocab.from_dataset(datasets['train'],
                              field_name='bigrams',
                              no_create_entry_dataset=[datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    if index_token:
        char_vocab.index_dataset(datasets['train'],
                                 datasets['test'],
                                 field_name='chars',
                                 new_field_name='chars')
        bigram_vocab.index_dataset(datasets['train'],
                                   datasets['test'],
                                   field_name='bigrams',
                                   new_field_name='bigrams')
        label_vocab.index_dataset(datasets['train'],
                                  datasets['test'],
                                  field_name='target',
                                  new_field_name='target')

    vocabs = {}
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab

    embeddings = {}
    if char_embedding_path is not None:
        char_embedding = StaticEmbedding(
            char_vocab,
            char_embedding_path,
            word_dropout=0.01,
            min_freq=char_min_freq,
            only_train_min_freq=only_train_min_freq)
        embeddings['char'] = char_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(
            bigram_vocab,
            bigram_embedding_path,
            word_dropout=0.01,
            min_freq=bigram_min_freq,
            only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
Example 9
def load_weibo_ner(path,
                   unigram_embedding_path=None,
                   bigram_embedding_path=None,
                   index_token=True,
                   char_min_freq=1,
                   bigram_min_freq=1,
                   only_train_min_freq=0,
                   char_word_dropout=0.01):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    loader = ConllLoader(['chars', 'target'])

    train_path = os.path.join(path, 'weiboNER_2nd_conll.train')
    dev_path = os.path.join(path, 'weiboNER_2nd_conll.dev')
    test_path = os.path.join(path, 'weiboNER_2nd_conll.test')

    paths = {}
    paths['train'] = train_path
    paths['dev'] = dev_path
    paths['test'] = test_path

    datasets = {}

    for k, v in paths.items():
        bundle = loader.load(v)
        datasets[k] = bundle.datasets['train']

    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))
    # print(*list(datasets.keys()))
    vocabs = {}
    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()

    for k, v in datasets.items():
        # ignore the word segmentation tag
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    char_vocab.from_dataset(
        datasets['train'],
        field_name='chars',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target',-100)
        v.add_seq_len('chars', new_field_name='seq_len')

    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab

    bigram_vocab.from_dataset(
        datasets['train'],
        field_name='bigrams',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])

    if index_token:
        char_vocab.index_dataset(*list(datasets.values()),
                                 field_name='chars',
                                 new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()),
                                   field_name='bigrams',
                                   new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()),
                                  field_name='target',
                                  new_field_name='target')

    # for k,v in datasets.items():
    #     v.set_input('chars','bigrams','seq_len','target')
    #     v.set_target('target','seq_len')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}

    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(
            char_vocab,
            model_dir_or_name=unigram_embedding_path,
            word_dropout=char_word_dropout,
            min_freq=char_min_freq,
            only_train_min_freq=only_train_min_freq,
        )
        embeddings['char'] = unigram_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(
            bigram_vocab,
            model_dir_or_name=bigram_embedding_path,
            word_dropout=0.01,
            min_freq=bigram_min_freq,
            only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
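
The two-column file format these loaders expect (one character and one tag per line, blank lines between sentences) can be illustrated with a tiny self-written sample; the file name and tag strings below are invented for illustration:

from fastNLP.io.loader import ConllLoader

# Write a small two-column CoNLL-style sample: char TAB tag, blank line between sentences.
sample = '科\tO\n技\tO\n\n小\tB-ORG.NAM\n米\tI-ORG.NAM\n'
with open('toy.conll', 'w', encoding='utf-8') as f:
    f.write(sample)

bundle = ConllLoader(['chars', 'target']).load('toy.conll')
print(bundle.datasets['train'][0])  # chars=['科', '技'], target=['O', 'O']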
Example 10
def load_weibo_ner(path,
                   unigram_embedding_path=None,
                   bigram_embedding_path=None,
                   index_token=True,
                   char_min_freq=1,
                   bigram_min_freq=1,
                   only_train_min_freq=0,
                   char_word_dropout=0.01):
    """
    加载微博语料,并缓存数据
    :param path: 微博数据集的路径,文件夹
    :param unigram_embedding_path: 如果存在,那么使用unigram embedding
    :param bigram_embedding_path: 如果存在,使用bigram embedding,可以和unigram同时使用
    :param index_token:
    :param char_min_freq:
    :param bigram_min_freq:
    :param only_train_min_freq:
    :param char_word_dropout:
    :return:
    """
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams
    # Set the dataset headers to chars and target, i.e. the first column read is chars and the second is target.
    loader = ConllLoader(headers=['chars', 'target'])
    # bundle = loader.load(path)
    #
    # datasets = bundle.datasets

    # print(datasets['train'][:5])

    train_path = os.path.join(path, 'train.conll')
    dev_path = os.path.join(path, 'dev.conll')
    test_path = os.path.join(path, 'test.conll')

    paths = {}
    paths['train'] = train_path
    paths['dev'] = dev_path
    paths['test'] = test_path

    datasets = {}

    for k, v in paths.items():
        bundle = loader.load(v)
        datasets[k] = bundle.datasets['train']

    print("样本集的样本个数信息如下:")
    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))
    # print(*list(datasets.keys()))
    vocabs = {}
    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()

    for k, v in datasets.items():
        # ignore the word segmentation tag
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    char_vocab.from_dataset(
        datasets['train'],
        field_name='chars',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target',-100)
        v.add_seq_len('chars', new_field_name='seq_len')

    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab

    bigram_vocab.from_dataset(
        datasets['train'],
        field_name='bigrams',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])
    if index_token:
        char_vocab.index_dataset(*list(datasets.values()),
                                 field_name='chars',
                                 new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()),
                                   field_name='bigrams',
                                   new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()),
                                  field_name='target',
                                  new_field_name='target')

    # for k,v in datasets.items():
    #     v.set_input('chars','bigrams','seq_len','target')
    #     v.set_target('target','seq_len')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}

    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(
            char_vocab,
            model_dir_or_name=unigram_embedding_path,
            word_dropout=char_word_dropout,
            min_freq=char_min_freq,
            only_train_min_freq=only_train_min_freq,
        )
        embeddings['char'] = unigram_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(
            bigram_vocab,
            model_dir_or_name=bigram_embedding_path,
            word_dropout=0.01,
            min_freq=bigram_min_freq,
            only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
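
A hypothetical call followed by a quick inspection of one indexed instance (the directory is a placeholder; after index_dataset the chars/bigrams/target fields hold vocabulary indices rather than strings):

datasets, vocabs, embeddings = load_weibo_ner(
    path='data/weibo', unigram_embedding_path=None, bigram_embedding_path=None)
sample = datasets['train'][0]
print(sample['chars'][:10])    # character indices
print(sample['bigrams'][:10])  # bigram indices
print(sample['seq_len'])       # sentence length
print(vocabs['label'].idx2word)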
Example 11
def load_resume_ner(path,char_embedding_path=None,bigram_embedding_path=None,index_token=True,
                    char_min_freq=1,bigram_min_freq=1,only_train_min_freq=0):

    
    # print('*' * 40)
    # print(bigram_embedding_path)

    train_path = os.path.join(path,'train.char.bmes')
    dev_path = os.path.join(path,'dev.char.bmes')
    test_path = os.path.join(path,'test.char.bmes')

    loader = ConllLoader(['chars','target'])
    # | ['爱', '财', '不', '分', '古', '今', ',', '除... | ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',... |
    train_bundle = loader.load(train_path)
    dev_bundle = loader.load(dev_path)
    test_bundle = loader.load(test_path)


    datasets = dict()
    datasets['train'] = train_bundle.datasets['train']
    datasets['dev'] = dev_bundle.datasets['train']
    datasets['test'] = test_bundle.datasets['train']


    datasets['train'].apply_field(get_bigrams,field_name='chars',new_field_name='bigrams')
    datasets['dev'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['test'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')

    datasets['train'].add_seq_len('chars')
    datasets['dev'].add_seq_len('chars')
    datasets['test'].add_seq_len('chars')



    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()
    print(datasets.keys())
    print(len(datasets['dev']))
    print(len(datasets['test']))
    print(len(datasets['train']))
    char_vocab.from_dataset(datasets['train'],field_name='chars',
                            no_create_entry_dataset=[datasets['dev'],datasets['test']] )
    bigram_vocab.from_dataset(datasets['train'],field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'],datasets['test']])
    label_vocab.from_dataset(datasets['train'],field_name='target')
    if index_token:
        char_vocab.index_dataset(datasets['train'],datasets['dev'],datasets['test'],
                                 field_name='chars',new_field_name='chars')
        bigram_vocab.index_dataset(datasets['train'],datasets['dev'],datasets['test'],
                                 field_name='bigrams',new_field_name='bigrams')
        label_vocab.index_dataset(datasets['train'],datasets['dev'],datasets['test'],
                                 field_name='target',new_field_name='target')

    vocabs = {}
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab

    embeddings = {}
    if char_embedding_path is not None:
        char_embedding = StaticEmbedding(char_vocab,char_embedding_path,word_dropout=0.01,
                                         min_freq=char_min_freq,only_train_min_freq=only_train_min_freq)
        embeddings['char'] = char_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab,bigram_embedding_path,word_dropout=0.01,
                                           min_freq=bigram_min_freq,only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding
        
    # embeddings["bigram"] = char_embedding

    return datasets,vocabs,embeddings
Example 12
def load_tianchi_ner(
        path,
        unigram_embedding_path=None,  # yangjie_rich_pretrain_unigram_path
        bigram_embedding_path=None,  # yangjie_rich_pretrain_bigram_path
        index_token=True,
        char_min_freq=1,
        bigram_min_freq=1,
        only_train_min_freq=0,
        char_word_dropout=0.01):

    # step 0. ============================= prepare the data, e.g. dataset file paths
    loader = ConllLoader(['chars', 'target'])
    train_path = os.path.join(path, 'tianchi.train')
    dev_path = os.path.join(path, 'tianchi.dev')
    test_path = os.path.join(path, 'tianchi.test')

    paths = {}
    paths['dev'] = dev_path
    paths['train'] = train_path
    paths['test'] = test_path

    # step 1. ============================= build the datasets
    datasets = {}  # a dict; note that every value in datasets is an instance of the DataSet class
    for k, v in paths.items():
        bundle = loader.load(v)
        datasets[k] = bundle.datasets['train']

    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))

    # step 2. ============================= build the vocabularies from the datasets
    vocabs = {}
    # See the Vocabulary tutorial for usage details:
    # https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_2_vocabulary.html
    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()

    # datasets has exactly three key-value pairs: train, dev and test
    for item in datasets.items():
        print(item)

    for k, v in datasets.items():  # process each split
        # ignore the word segmentation tag
        # apply_field() is a fastNLP method for transforming one field of a DataSet.
        # In the first call field_name and new_field_name are both 'chars', so the column is rewritten in place rather than added.
        # The second call (get_bigrams, 'chars', 'bigrams') builds a new 'bigrams' column from the 'chars' column.
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars',
                      'bigrams')  # effectively joins each pair of adjacent characters, i.e. the bigrams

    # datasets['train'] is a DataSet instance.
    # no_create_entry_dataset lets the vocabulary also see dev and test while it is built, which tends to improve the final results.
    # Build the vocabularies from the training data.
    char_vocab.from_dataset(
        datasets['train'],
        field_name='chars',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])
    bigram_vocab.from_dataset(
        datasets['train'],
        field_name='bigrams',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    #char_vocab.from_dataset(datasets['train'],field_name='chars',no_create_entry_dataset=datasets['dev'])
    #bigram_vocab.from_dataset(datasets['train'],field_name='bigrams',no_create_entry_dataset=datasets['dev'])
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # apply len() to each element of the field and store the result as the sequence length in the new field 'seq_len'
        v.add_seq_len('chars', new_field_name='seq_len')

    # Whether to convert each column of the datasets into vocabulary indices.
    # *list(datasets.values()) unpacks the train/dev/test DataSets as positional arguments to index_dataset.
    if index_token:
        char_vocab.index_dataset(*list(datasets.values()),
                                 field_name='chars',
                                 new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()),
                                   field_name='bigrams',
                                   new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()),
                                  field_name='target',
                                  new_field_name='target')

    # vocabs is organized the same way as datasets:
    # both are dicts whose keys map to the different pieces of data.
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab

    # step 3. ============================= build the embeddings
    '''Open questions:
    01. Aren't pretrained static embeddings supposed to lack contextual information? Why are they used here?
    02. What is the difference between this embedding and the BertEmbedding used later?
    03. Need to study what StaticEmbedding() actually does.
    '''
    embeddings = {}
    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(
            char_vocab,
            model_dir_or_name=unigram_embedding_path,
            word_dropout=char_word_dropout,
            min_freq=char_min_freq,
            only_train_min_freq=only_train_min_freq)
        # unigram_embedding is an instance of StaticEmbedding
        embeddings['char'] = unigram_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(
            bigram_vocab,
            model_dir_or_name=bigram_embedding_path,
            word_dropout=0.01,
            min_freq=bigram_min_freq,
            only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
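
Regarding the questions in the comment above: StaticEmbedding is simply a lookup layer over the pretrained vectors aligned with the given Vocabulary, so the returned embeddings behave like nn.Embedding modules keyed by the same indices that index_dataset wrote into the fields. A small sketch (the index values are made up):

import torch

char_ids = torch.LongTensor([[5, 12, 7, 0]])   # a fake batch of char indices, shape (batch, seq_len)
char_repr = embeddings['char'](char_ids)       # FloatTensor of shape (1, 4, embed_dim)
print(char_repr.shape)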
Example 13
def load_weibo_ner(path,
                   unigram_embedding_path=None,
                   bigram_embedding_path=None,
                   index_token=True,
                   char_min_freq=1,
                   bigram_min_freq=1,
                   only_train_min_freq=0,
                   char_word_dropout=0.01):

    # step 0. ============================= prepare the data, e.g. dataset file paths
    loader = ConllLoader(['chars', 'target'])
    train_path = os.path.join(path, 'weiboNER_2nd_conll.train_deseg')
    dev_path = os.path.join(path, 'weiboNER_2nd_conll.dev_deseg')
    test_path = os.path.join(path, 'weiboNER_2nd_conll.test_deseg')

    paths = {}
    paths['train'] = train_path
    paths['dev'] = dev_path
    paths['test'] = test_path

    # step 1. ============================= build the datasets
    datasets = {}  # a dict; note that every value is an instance of fastNLP's DataSet class
    for k, v in paths.items():
        bundle = loader.load(v)
        # Why is the key always 'train'? Because when a single file is loaded,
        # the resulting bundle stores its data under the 'train' key regardless of the split.
        datasets[k] = bundle.datasets['train']

    trainData = datasets['train']

    print(type(trainData))  # <class 'fastNLP.core.dataset.DataSet'>
    print(len(trainData))  # 1350
    print(trainData)
    """
    The data in datasets['train'] looks like this:
        +-----------------------------------------------------------+-----------------------------------------------------------+
        | chars                                                     | target                                                    |
        +-----------------------------------------------------------+-----------------------------------------------------------+
        | ['科', '技', '全', '方', '位', '资', '讯', '智', '能',...  | ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'... |
        | ['对', ',', '输', '给', '一', '个', '女', '人', ',',...  | ['O', 'O', 'O', 'O', 'O', 'O', 'B-PER.NOM', 'I-PER.NOM... |
        | ['今', '天', '下', '午', '起', '来', '看', '到', '外',...  | ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'... |
        | ['今', '年', '拜', '年', '不', '短', '信', ',', '就',...  | ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'... |
        | ['浑', '身', '酸', '疼', ',', '两', '腿', '无', '力',...  | ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'... |
        | ['明', '显', '紧', '张', '状', '态', '没', '出', '来',...  | ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'... |
        | ['三', '十', '年', '前', ',', '老', '爹', '带', '我',...  | ['O', 'O', 'O', 'O', 'O', 'B-PER.NOM', 'I-PER.NOM', 'O... |
        | ['好', '活', '动', '呀', ',', '给', '力', '的', '商',...  | ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'... |
        | ['人', '生', '如', '戏', ',', '导', '演', '是', '自',...  | ['O', 'O', 'O', 'O', 'O', 'B-PER.NOM', 'I-PER.NOM', 'O... |
        | ['听', '说', '小', '米', '开', '卖', '了', ',', '刚',...  | ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'... |
        | ...                                                       | ...                                                       |
        +-----------------------------------------------------------+-----------------------------------------------------------+

        This is the DataSet type from Fudan University's open-source toolkit fastNLP; detailed documentation: https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_1_data_preprocess.html

    """

    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))
    # print(*list(datasets.keys()))

    # step 2. ============================= build the vocabularies from the datasets
    vocabs = {}
    # See the Vocabulary tutorial for usage details:
    # https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_2_vocabulary.html
    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()

    # datasets has exactly three key-value pairs: train, dev and test
    for item in datasets.items():
        print(item)

    for k, v in datasets.items():  # process each split
        # ignore the word segmentation tag
        # apply_field() is a fastNLP method for transforming one field of a DataSet.
        # In the first call field_name and new_field_name are both 'chars', so the column is rewritten in place rather than added.
        # The second call (get_bigrams, 'chars', 'bigrams') builds a new 'bigrams' column from the 'chars' column.
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars',
                      'bigrams')  # effectively joins each pair of adjacent characters, i.e. the bigrams

    # datasets['train'] is a DataSet instance
    char_vocab.from_dataset(
        datasets['train'],
        field_name='chars',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target',-100)
        v.add_seq_len('chars', new_field_name='seq_len')

    bigram_vocab.from_dataset(
        datasets['train'],
        field_name='bigrams',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])
    if index_token:
        char_vocab.index_dataset(*list(datasets.values()),
                                 field_name='chars',
                                 new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()),
                                   field_name='bigrams',
                                   new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()),
                                  field_name='target',
                                  new_field_name='target')

    # vocabs is organized the same way as datasets:
    # both are dicts whose keys map to the different pieces of data.
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab

    # step 3. ============================= build the embeddings
    embeddings = {}
    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(
            char_vocab,
            model_dir_or_name=unigram_embedding_path,
            word_dropout=char_word_dropout,
            min_freq=char_min_freq,
            only_train_min_freq=only_train_min_freq,
        )
        embeddings['char'] = unigram_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(
            bigram_vocab,
            model_dir_or_name=bigram_embedding_path,
            word_dropout=0.01,
            min_freq=bigram_min_freq,
            only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
Example 14
def load_aicup_ner(
    path,
    unigram_embedding_path=None,
    bigram_embedding_path=None,
    char_word_dropout=0.01,
    only_train_min_freq=0,
    bigram_min_freq=1,
    data_type='default',
    index_token=True,
    char_min_freq=1,
    cv=False,
    fold=0,
    ):
    vocabs = {}
    embeddings = {}

    train_path = os.path.join(path, f'fold{fold}', f'train/{data_type}')
    dev_path = os.path.join(path, f'fold{fold}', f'dev/{data_type}')
    print('-----------------Dataset---------------------')
    print('loading data from', train_path,'\nand', dev_path)
    loader = ConllLoader(['chars', 'target'])

    train = loader.load(train_path)
    dev = loader.load(dev_path)
 
    ds = {
        'train':train.datasets['train'],
        'dev':dev.datasets['train'],
    }
    ds['aicup_dev'] = get_aicup_devds()
    
    # jieba.enable_paddle()   
    for ds_name in ds.keys():
        ds[ds_name].apply_field(get_bigrams, 'chars', 'bigrams')
        ds[ds_name].add_seq_len('chars', new_field_name='seq_len')

        ds[ds_name].apply_field(get_pos_tag, 'chars', 'pos_tag')

    for k, v in ds.items():
        print('{}:{}'.format(k, len(v)))

    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    pos_vocab = Vocabulary()

    label_vocab = get_label_vocab(data_type)
    pos_vocab.from_dataset(*list(ds.values()), field_name='pos_tag')

    if cv:
        no_create_entry_ds = [ds['dev'], ds['aicup_dev']]
    else:
        # note: ds built above contains only 'train', 'dev' and 'aicup_dev'; this branch
        # assumes a 'test' DataSet has been added to ds when cv is False
        no_create_entry_ds = [ds['dev'], ds['test'], ds['aicup_dev']]
        
    char_vocab.from_dataset(
        ds['train'],
        field_name='chars',
        no_create_entry_dataset=no_create_entry_ds
    )
    bigram_vocab.from_dataset(
        ds['train'],
        field_name='bigrams',
        no_create_entry_dataset=no_create_entry_ds
    )
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab
    vocabs['pos_tag'] = pos_vocab
    
    if index_token:
        char_vocab.index_dataset(*list(ds.values()), field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(*list(ds.values()),field_name='bigrams',new_field_name='bigrams')
        label_vocab.index_dataset(*list([ds['train'], ds['dev']]), field_name='target', new_field_name='target')
    
    pos_vocab.index_dataset(*list(ds.values()),field_name='pos_tag', new_field_name='pos_tag')
    
    unigram_embedding = StaticEmbedding(
        char_vocab, 
        model_dir_or_name=unigram_embedding_path,                          
        word_dropout=char_word_dropout,                                      
        min_freq=char_min_freq,
        only_train_min_freq=only_train_min_freq,
    )
    bigram_embedding = StaticEmbedding(
        bigram_vocab,
        model_dir_or_name=bigram_embedding_path,
        word_dropout=0.01,
        min_freq=bigram_min_freq,
        only_train_min_freq=only_train_min_freq,
    )
    embeddings['char'] = unigram_embedding
    embeddings['bigram'] = bigram_embedding
    print(ds['train'])
    print(set([ele[0].split('-')[1] if ele[0]!='O' and ele[0][0]!='<' else ele[0] for ele in list(label_vocab)]))
    print('------------------------------------------')
    return ds, vocabs, embeddings
Example 15
        print(sen)
    assert len(sen) == len(pos_tag)
    return pos_tag
    

if __name__ == "__main__":
    
    label = get_label_vocab(data_type='default')
    print(label)
    exit()
    
    dev_path = os.path.join(aicup_ner_path, f'fold{0}', f'dev/number')
    print('-----------------Dataset---------------------')
    loader = ConllLoader(['chars', 'target'])

    dev = loader.load(dev_path)
    print(dev.datasets['train'])
    # label_vocab = get_label_vocab('number)
    # print(label_vocab)
    # print(list(label_vocab))
    # pass
    # ds, vocabs, embeddings = load_aicup_ner(
    #     aicup_ner_path,
    #     yangjie_rich_pretrain_unigram_path,
    #     yangjie_rich_pretrain_bigram_path,
    #     index_token=False,
    #     char_min_freq=1,
    #     bigram_min_freq=1,
    #     only_train_min_freq=True,
    #     char_word_dropout=0.01,
    #     cv=True,
Example 16
def load_ner(path,unigram_embedding_path=None,bigram_embedding_path=None,index_token=True, train_path=None, dev_path=None,
            char_min_freq=1,bigram_min_freq=1,only_train_min_freq=0,char_word_dropout=0.01, test_path=None, \
            logger=None, with_placeholder=True, placeholder_path=None, with_test_a=False, test_a_path=None, label_word2idx=None, \
            **kwargs):

    loader = ConllLoader(['chars','target'])

    # train_path = os.path.join(path,'weiboNER_2nd_conll.train_deseg')
    # dev_path = os.path.join(path, 'weiboNER_2nd_conll.dev_deseg')
    # test_path = os.path.join(path, 'weiboNER_2nd_conll.test_deseg')

    if train_path is None:
        train_path = '/ai/223/person/lichunyu/datasets/dataf/seq_label/seq_label.train'
    if dev_path is None:
        dev_path = '/ai/223/person/lichunyu/datasets/dataf/seq_label/seq_label.test'

    # train_path = '/ai/223/person/lichunyu/datasets/dataf/seq_label/seq_label_all_all.train'
    # dev_path = '/ai/223/person/lichunyu/datasets/dataf/seq_label/seq_label_test_a_labeled.train'

    # train_path = '/ai/223/person/lichunyu/datasets/dataf/test/test_A_text.seq'
    # dev_path = '/ai/223/person/lichunyu/datasets/dataf/test/test_A_text.seq'
    # test_path = '/ai/223/person/lichunyu/datasets/tmp/test_one.txt'
    if test_path is None:
        test_path = '/ai/223/person/lichunyu/datasets/dataf/test/test_B_final_text.nonletter'

    if placeholder_path is None:
        placeholder_path = '/root/all_train.test'

    if test_a_path is None:
        test_a_path = '/ai/223/person/lichunyu/datasets/df-competition/df-511/test/test_A_text.seq'

    paths = {}
    paths['train'] = train_path
    paths['dev'] = dev_path
    paths['test'] = test_path
    paths['placeholder'] = placeholder_path
    paths['test_a'] = test_a_path

    datasets = {}

    for k,v in paths.items():
        bundle = loader.load(v)
        datasets[k] = bundle.datasets['train']

    for k,v in datasets.items():
        if logger is not None:
            logger.info('{}:{}'.format(k,len(v)))
        else:
            print('{}:{}'.format(k,len(v)))
    # print(*list(datasets.keys()))
    vocabs = {}
    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()

    for k,v in datasets.items():
        # ignore the word segmentation tag
        v.apply_field(lambda x: [w[0] for w in x],'chars','chars')
        v.apply_field(get_bigrams,'chars','bigrams')


    # char_vocab.from_dataset(datasets['train'],field_name='chars',no_create_entry_dataset=[datasets['dev'],datasets['test']])
    if with_placeholder is True and with_test_a is False:
        char_vocab.from_dataset(datasets['train'],field_name='chars',no_create_entry_dataset=[datasets['dev'], datasets['placeholder']])
    elif with_placeholder is True and with_test_a is True:
        char_vocab.from_dataset(datasets['train'],field_name='chars',no_create_entry_dataset=[datasets['dev'], datasets['placeholder'], datasets['test_a']])
    else:
        char_vocab.from_dataset(datasets['train'],field_name='chars',no_create_entry_dataset=[datasets['dev']])
    label_vocab.from_dataset(datasets['train'],field_name='target')
    if label_word2idx is not None:
        label_vocab.word2idx = label_word2idx
    if logger is not None:
        logger.info('label_vocab:{}\n{}'.format(len(label_vocab),label_vocab.idx2word))

    for k,v in datasets.items():
        # v.set_pad_val('target',-100)
        v.add_seq_len('chars',new_field_name='seq_len')

    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab

    # bigram_vocab.from_dataset(datasets['train'],field_name='bigrams',no_create_entry_dataset=[datasets['dev'],datasets['test']])
    if with_placeholder is True and with_test_a is False:
        bigram_vocab.from_dataset(datasets['train'],field_name='bigrams',no_create_entry_dataset=[datasets['dev'], datasets['placeholder']])
    elif with_placeholder is True and with_test_a is True:
        bigram_vocab.from_dataset(datasets['train'],field_name='bigrams',no_create_entry_dataset=[datasets['dev'], datasets['placeholder'], datasets['test_a']])
        print('dataset create with test_a')
    else:
        bigram_vocab.from_dataset(datasets['train'],field_name='bigrams',no_create_entry_dataset=[datasets['dev']])
    if index_token:
        char_vocab.index_dataset(*list(datasets.values()), field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()),field_name='bigrams',new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()), field_name='target', new_field_name='target')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}

    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(char_vocab, model_dir_or_name=unigram_embedding_path,
                                            word_dropout=char_word_dropout,
                                            min_freq=char_min_freq,only_train_min_freq=only_train_min_freq,)
        embeddings['char'] = unigram_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, model_dir_or_name=bigram_embedding_path,
                                           word_dropout=0.01,
                                           min_freq=bigram_min_freq,only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings