Example #1
# NOTE: imports are not part of the original snippet and are assumed here.
# The NER pipes and most embedding classes ship with fastNLP;
# TransformerCharEmbed is a project-local module.
from fastNLP.io.pipe.conll import Conll2003NERPipe, OntoNotesNERPipe
from fastNLP.embeddings import (CNNCharEmbedding, LSTMCharEmbedding,
                                StaticEmbedding, ElmoEmbedding,
                                StackEmbedding)


# dataset, encoding_type, char_type and normalize_embed are module-level
# config values parsed from the command line in the original script.
def load_data():
    # replace these paths with your own data locations
    if dataset == 'conll2003':
        # note: for conll2003 the learning rate should not exceed 0.002
        paths = {
            'test': "../data/conll2003/test.txt",
            'train': "../data/conll2003/train.txt",
            'dev': "../data/conll2003/dev.txt"
        }
        data = Conll2003NERPipe(
            encoding_type=encoding_type).process_from_file(paths)
    elif dataset == 'en-ontonotes':
        paths = '../data/en-ontonotes/english'
        data = OntoNotesNERPipe(
            encoding_type=encoding_type).process_from_file(paths)
    # pick a character-level encoder according to char_type; char_embed stays
    # None for any other value and is skipped when stacking embeddings below
    char_embed = None
    if char_type == 'cnn':
        char_embed = CNNCharEmbedding(vocab=data.get_vocab('words'),
                                      embed_size=30,
                                      char_emb_size=30,
                                      filter_nums=[30],
                                      kernel_sizes=[3],
                                      word_dropout=0,
                                      dropout=0.3,
                                      pool_method='max',
                                      include_word_start_end=False,
                                      min_char_freq=2)
    elif char_type in ['adatrans', 'naive']:
        char_embed = TransformerCharEmbed(vocab=data.get_vocab('words'),
                                          embed_size=30,
                                          char_emb_size=30,
                                          word_dropout=0,
                                          dropout=0.3,
                                          pool_method='max',
                                          activation='relu',
                                          min_char_freq=2,
                                          requires_grad=True,
                                          include_word_start_end=False,
                                          char_attn_type=char_type,
                                          char_n_head=3,
                                          char_dim_ffn=60,
                                          char_scale=char_type == 'naive',
                                          char_dropout=0.15,
                                          char_after_norm=True)
    elif char_type == 'lstm':
        char_embed = LSTMCharEmbedding(vocab=data.get_vocab('words'),
                                       embed_size=30,
                                       char_emb_size=30,
                                       word_dropout=0,
                                       dropout=0.3,
                                       hidden_size=100,
                                       pool_method='max',
                                       activation='relu',
                                       min_char_freq=2,
                                       bidirectional=True,
                                       requires_grad=True,
                                       include_word_start_end=False)
    # word-level GloVe vectors; with only_norm_found_vector, only words found
    # in the pretrained file get their vectors normalized
    word_embed = StaticEmbedding(vocab=data.get_vocab('words'),
                                 model_dir_or_name='en-glove-6b-100d',
                                 requires_grad=True,
                                 lower=True,
                                 word_dropout=0,
                                 dropout=0.5,
                                 only_norm_found_vector=normalize_embed)
    # fastNLP pipes expose the token field as 'words'; the model and the
    # contextual embeddings below expect it under the name 'chars'
    data.rename_field('words', 'chars')

    embed = ElmoEmbedding(vocab=data.get_vocab('chars'),
                          model_dir_or_name='en-original',
                          layers='mix',
                          requires_grad=False,
                          word_dropout=0.0,
                          dropout=0.5,
                          cache_word_reprs=False)
    embed.set_mix_weights_requires_grad()  # make ELMo layer-mix weights trainable

    # stack contextual, word-level and (if configured) char-level features;
    # word_dropout randomly masks whole tokens across the stacked embedding
    embeds = [embed, word_embed]
    if char_embed is not None:
        embeds.append(char_embed)
    embed = StackEmbedding(embeds, dropout=0, word_dropout=0.02)

    return data, embed
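
This function relies on globals defined by the enclosing training script. A
minimal driver sketch, assuming hypothetical values for those module-level
config names (dataset, encoding_type, char_type, normalize_embed) defined in
the same module:

# Hypothetical module-level config; the original script reads these from
# argparse, and the values below are only plausible examples.
dataset = 'conll2003'        # or 'en-ontonotes'
encoding_type = 'bioes'
char_type = 'cnn'            # one of: 'cnn', 'adatrans', 'naive', 'lstm'
normalize_embed = True

data, embed = load_data()
print(data)                              # DataBundle with datasets and vocabs
print('stacked embed size:', embed.embed_size)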
Example #2
# As in Example #1, imports are assumed; ENNERPipe, TransformerCharEmbed and
# generate_knowledge_api are project-local helpers. dataset, encoding_type,
# char_type, embed_size, normalize_embed, elmo_model, knowledge,
# feature_level and args are module-level config.
import os

from fastNLP.io.pipe.conll import OntoNotesNERPipe
from fastNLP.embeddings import (StaticEmbedding, ElmoEmbedding,
                                BertEmbedding, StackEmbedding)


def load_data():
    if dataset == 'ON5e':
        paths = 'data/ON5e/english'
        data = OntoNotesNERPipe(
            encoding_type=encoding_type).process_from_file(paths)
    else:
        paths = {
            "train": "data/{}/train.txt".format(dataset),
            "dev": "data/{}/dev.txt".format(dataset),
            "test": "data/{}/test.txt".format(dataset)
        }
        data = ENNERPipe(encoding_type=encoding_type).process_from_file(paths)

    if knowledge:
        # project-local helper that loads lexicon/feature annotations for each
        # split together with the vocabularies that index them
        (train_feature_data, dev_feature_data, test_feature_data,
         feature2count, feature2id, id2feature) = generate_knowledge_api(
             os.path.join("data", dataset), "all", feature_level)
    else:
        train_feature_data = dev_feature_data = test_feature_data = None
        feature2count = feature2id = id2feature = None

    char_embed = TransformerCharEmbed(vocab=data.get_vocab('words'),
                                      embed_size=embed_size,
                                      char_emb_size=embed_size,
                                      word_dropout=0,
                                      dropout=0.3,
                                      pool_method='max',
                                      activation='relu',
                                      min_char_freq=2,
                                      requires_grad=True,
                                      include_word_start_end=False,
                                      char_attn_type=char_type,
                                      char_n_head=3,
                                      char_dim_ffn=60,
                                      char_scale=char_type == 'naive',
                                      char_dropout=0.15,
                                      char_after_norm=True)

    word_embed = StaticEmbedding(vocab=data.get_vocab('words'),
                                 model_dir_or_name='en-glove-6b-100d',
                                 requires_grad=True,
                                 lower=True,
                                 word_dropout=0,
                                 dropout=0.5,
                                 only_norm_found_vector=normalize_embed)
    # rename so the contextual embeddings and model read the tokens as 'chars'
    data.rename_field('words', 'chars')

    embed = ElmoEmbedding(vocab=data.get_vocab('chars'),
                          model_dir_or_name=elmo_model,
                          layers='mix',
                          requires_grad=False,
                          word_dropout=0.0,
                          dropout=0.5,
                          cache_word_reprs=False)
    embed.set_mix_weights_requires_grad()

    # BERT features from the top layer only; pool_method="first" represents
    # each word by its first word piece
    bert_embed = BertEmbedding(vocab=data.get_vocab('chars'),
                               model_dir_or_name=args.bert_model,
                               layers='-1',
                               pool_method="first",
                               word_dropout=0,
                               dropout=0.5,
                               include_cls_sep=False,
                               pooled_cls=True,
                               requires_grad=False,
                               auto_truncate=False)

    # final representation: ELMo + BERT + GloVe + transformer char features
    embed = StackEmbedding([embed, bert_embed, word_embed, char_embed],
                           dropout=0,
                           word_dropout=0.02)

    return (data, embed, train_feature_data, dev_feature_data,
            test_feature_data, feature2count, feature2id, id2feature)
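
As with Example #1, a sketch of how this variant might be driven, with every
config name hypothetical (args.bert_model is an argparse attribute in the
original script):

import argparse

# Hypothetical config mirroring the original script's command-line options;
# the values are only plausible examples.
dataset = 'conll2003'
encoding_type = 'bioes'
char_type = 'adatrans'
embed_size = 30
normalize_embed = True
elmo_model = 'en-original'
knowledge = True
feature_level = 'all'
args = argparse.Namespace(bert_model='en-base-cased')

(data, embed, train_feature_data, dev_feature_data, test_feature_data,
 feature2count, feature2id, id2feature) = load_data()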