Example 1
def test_elmo_embedding_layer_assertion(self):
    vocab = Vocabulary().add_word_lst("This is a test .".split())
    try:
        # layers='0,1,2' requests more layers than the small test model
        # provides, so the constructor is expected to raise
        elmo_embed = ElmoEmbedding(vocab, model_dir_or_name='tests/data_for_tests/embedding/small_elmo',
                                   layers='0,1,2')
    except AssertionError as e:
        print(e)
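These snippets are lifted from a unittest class, so they are not runnable as-is. A minimal, self-contained harness might look like the following (a sketch; it assumes fastNLP's Vocabulary and ElmoEmbedding imports, the small ELMo fixture shipped with fastNLP's tests, and that requesting layer 2 from that fixture trips the layer-count assertion, as the original test's name implies):

import unittest

from fastNLP import Vocabulary
from fastNLP.embeddings import ElmoEmbedding

class TestElmoEmbedding(unittest.TestCase):
    def test_layer_assertion(self):
        vocab = Vocabulary().add_word_lst("This is a test .".split())
        # layers='0,1,2' asks for more layers than the small fixture provides
        with self.assertRaises(AssertionError):
            ElmoEmbedding(vocab,
                          model_dir_or_name='tests/data_for_tests/embedding/small_elmo',
                          layers='0,1,2')

if __name__ == '__main__':
    unittest.main()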
Example 2
def test_elmo_embedding(self):
    vocab = Vocabulary().add_word_lst("This is a test .".split())
    elmo_embed = ElmoEmbedding(vocab, model_dir_or_name='tests/data_for_tests/embedding/small_elmo', layers='0,1')
    words = torch.LongTensor([[0, 1, 2]])
    hidden = elmo_embed(words)  # -> (batch_size, seq_len, embedding_dim)
    print(hidden.size())
    self.assertEqual(hidden.size(), (1, 3, elmo_embed.embedding_dim))
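The shape check above also hints at how the layers argument behaves: fastNLP appears to concatenate the representations of the requested layers, so embedding_dim scales with how many layers you select. A quick sketch to verify (assumes the same small test fixture and that concatenation is indeed the combination rule):

one_layer = ElmoEmbedding(vocab, model_dir_or_name='tests/data_for_tests/embedding/small_elmo',
                          layers='0')
two_layer = ElmoEmbedding(vocab, model_dir_or_name='tests/data_for_tests/embedding/small_elmo',
                          layers='0,1')
# if layers are concatenated, the two-layer embedding is twice as wide
assert two_layer.embedding_dim == 2 * one_layer.embedding_dim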
Example 3
def load_data():

    paths = {
        "train": "../data/{}/train.txt".format(dataset),
        "test": "../data/{}/test.txt".format(dataset),
        "dev": "../data/{}/dev.txt".format(dataset)
    }
    data = WNUT_17NERPipe(encoding_type=encoding_type).process_from_file(paths)

    dict_save_path = "../data/{}/data.pth".format(dataset)
    context_dict, context_word2id, context_id2word = get_neighbor_for_vocab(
        data.get_vocab('words').word2idx, glove_path, dict_save_path)

    train_feature_data, dev_feature_data, test_feature_data = build_instances(
        "../data/{}".format(dataset), context_num, context_dict)

    data.rename_field('words', 'chars')
    embed = ElmoEmbedding(vocab=data.get_vocab('chars'),
                          model_dir_or_name=elmo_model,
                          layers='mix',
                          requires_grad=False,
                          word_dropout=0.0,
                          dropout=0.5,
                          cache_word_reprs=False)
    embed.set_mix_weights_requires_grad()
    bert_embed = BertEmbedding(vocab=data.get_vocab('chars'),
                               model_dir_or_name=args.bert_model,
                               layers='-1',
                               pool_method=args.pool_method,
                               word_dropout=0,
                               dropout=0.5,
                               include_cls_sep=False,
                               pooled_cls=True,
                               requires_grad=False,
                               auto_truncate=False)
    embed = StackEmbedding([embed, bert_embed], dropout=0, word_dropout=0.02)

    return data, embed, train_feature_data, dev_feature_data, test_feature_data, context_word2id, context_id2word
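Note the layers='mix' / set_mix_weights_requires_grad() pairing in load_data(): 'mix' makes ElmoEmbedding output a learned scalar mixture of all ELMo layers, and the follow-up call presumably re-enables gradients for just those mixture weights while requires_grad=False keeps the ELMo backbone frozen. A sketch of the effect (the parameter counting is illustrative, not from the original code):

embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', layers='mix',
                      requires_grad=False)
embed.set_mix_weights_requires_grad()
# only the scalar mixture weights should now require gradients
trainable = sum(p.numel() for p in embed.parameters() if p.requires_grad)
total = sum(p.numel() for p in embed.parameters())
print(trainable, '/', total)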
Example 4
def test_download_small(self):
    # 'en-small' is downloaded and cached automatically on first use
    vocab = Vocabulary().add_word_lst("This is a test .".split())
    elmo_embed = ElmoEmbedding(vocab, model_dir_or_name='en-small')
    words = torch.LongTensor([[0, 1, 2]])
    print(elmo_embed(words).size())
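Besides a local directory, model_dir_or_name accepts shorthand names for pretrained weights ('en-small' here; 'en-medium' and 'en-original' appear in the examples below), which fastNLP downloads and caches on first use. A sketch comparing their output widths (this downloads each model, and the exact name list should be checked against the fastNLP docs):

vocab = Vocabulary().add_word_lst("This is a test .".split())
for name in ('en-small', 'en-medium', 'en-original'):
    embed = ElmoEmbedding(vocab, model_dir_or_name=name)
    print(name, embed.embedding_dim)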
Example 5
def load_data():
    # adjust the data paths below for your environment
    if dataset == 'conll2003':
        # for conll2003 the learning rate must not exceed 0.002
        paths = {
            'test': "../data/conll2003/test.txt",
            'train': "../data/conll2003/train.txt",
            'dev': "../data/conll2003/dev.txt"
        }
        data = Conll2003NERPipe(
            encoding_type=encoding_type).process_from_file(paths)
    elif dataset == 'en-ontonotes':
        paths = '../data/en-ontonotes/english'
        data = OntoNotesNERPipe(
            encoding_type=encoding_type).process_from_file(paths)
    char_embed = None
    if char_type == 'cnn':
        char_embed = CNNCharEmbedding(vocab=data.get_vocab('words'),
                                      embed_size=30,
                                      char_emb_size=30,
                                      filter_nums=[30],
                                      kernel_sizes=[3],
                                      word_dropout=0,
                                      dropout=0.3,
                                      pool_method='max',
                                      include_word_start_end=False,
                                      min_char_freq=2)
    elif char_type in ['adatrans', 'naive']:
        char_embed = TransformerCharEmbed(vocab=data.get_vocab('words'),
                                          embed_size=30,
                                          char_emb_size=30,
                                          word_dropout=0,
                                          dropout=0.3,
                                          pool_method='max',
                                          activation='relu',
                                          min_char_freq=2,
                                          requires_grad=True,
                                          include_word_start_end=False,
                                          char_attn_type=char_type,
                                          char_n_head=3,
                                          char_dim_ffn=60,
                                          char_scale=char_type == 'naive',
                                          char_dropout=0.15,
                                          char_after_norm=True)
    elif char_type == 'lstm':
        char_embed = LSTMCharEmbedding(vocab=data.get_vocab('words'),
                                       embed_size=30,
                                       char_emb_size=30,
                                       word_dropout=0,
                                       dropout=0.3,
                                       hidden_size=100,
                                       pool_method='max',
                                       activation='relu',
                                       min_char_freq=2,
                                       bidirectional=True,
                                       requires_grad=True,
                                       include_word_start_end=False)
    word_embed = StaticEmbedding(vocab=data.get_vocab('words'),
                                 model_dir_or_name='en-glove-6b-100d',
                                 requires_grad=True,
                                 lower=True,
                                 word_dropout=0,
                                 dropout=0.5,
                                 only_norm_found_vector=normalize_embed)
    data.rename_field('words', 'chars')

    embed = ElmoEmbedding(vocab=data.get_vocab('chars'),
                          model_dir_or_name='en-original',
                          layers='mix',
                          requires_grad=False,
                          word_dropout=0.0,
                          dropout=0.5,
                          cache_word_reprs=False)
    embed.set_mix_weights_requires_grad()

    embed = StackEmbedding([embed, word_embed, char_embed],
                           dropout=0,
                           word_dropout=0.02)

    return data, embed
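StackEmbedding concatenates its component embeddings along the feature dimension, which is why heterogeneous embeddings (mixed ELMo + GloVe + character here) can be handed to a downstream model as a single object. A sketch of the resulting width, assuming each component exposes embedding_dim as in the examples above:

parts = [embed, word_embed, char_embed]
stacked = StackEmbedding(parts, dropout=0, word_dropout=0.02)
# the stacked width is the sum of the component widths
assert stacked.embedding_dim == sum(p.embedding_dim for p in parts)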
Example 6
                           tokenizer=arg.tokenizer).process_from_file()
elif arg.task == 'mnli':
    data_bundle = MNLIPipe(lower=arg.to_lower,
                           tokenizer=arg.tokenizer).process_from_file()
elif arg.task == 'quora':
    data_bundle = QuoraPipe(lower=arg.to_lower,
                            tokenizer=arg.tokenizer).process_from_file()
else:
    raise RuntimeError(f'task {arg.task} is not supported yet!')

print(data_bundle)  # print details in data_bundle

# load embedding
if arg.embedding == 'elmo':
    embedding = ElmoEmbedding(data_bundle.vocabs[Const.INPUTS(0)],
                              model_dir_or_name='en-medium',
                              requires_grad=True)
elif arg.embedding == 'glove':
    embedding = StaticEmbedding(data_bundle.vocabs[Const.INPUTS(0)],
                                model_dir_or_name='en-glove-840b-300d',
                                requires_grad=True,
                                normalize=False)
else:
    raise RuntimeError(f'embedding {arg.embedding} is not supported yet!')

# define model
model = ESIM(embedding, num_labels=len(data_bundle.vocabs[Const.TARGET]))

# define optimizer and callback
optimizer = Adamax(lr=arg.lr, params=model.parameters())
scheduler = StepLR(optimizer, step_size=10, gamma=0.5)  # multiply the learning rate by 0.5 every 10 epochs
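The excerpt stops before the training loop, but the StepLR schedule is easy to read: every 10 epochs the optimizer's learning rate is multiplied by 0.5. In plain PyTorch the wiring would look roughly like this (train_one_epoch and n_epochs are hypothetical placeholders; the original script presumably hands the scheduler to fastNLP's Trainer via a callback instead):

from torch.optim import Adamax
from torch.optim.lr_scheduler import StepLR

optimizer = Adamax(model.parameters(), lr=arg.lr)
scheduler = StepLR(optimizer, step_size=10, gamma=0.5)
for epoch in range(n_epochs):          # n_epochs: hypothetical
    train_one_epoch(model, optimizer)  # hypothetical helper
    scheduler.step()                   # lr halves after epochs 10, 20, ...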
Example 7
def load_data():
    if dataset == 'ON5e':
        paths = 'data/ON5e/english'
        data = OntoNotesNERPipe(
            encoding_type=encoding_type).process_from_file(paths)
    else:
        paths = {
            "train": "data/{}/train.txt".format(dataset),
            "dev": "data/{}/dev.txt".format(dataset),
            "test": "data/{}/test.txt".format(dataset)
        }
        data = ENNERPipe(encoding_type=encoding_type).process_from_file(paths)

    if knowledge:
        train_feature_data, dev_feature_data, test_feature_data, feature2count, feature2id, id2feature = generate_knowledge_api(
            os.path.join("data", dataset), "all", feature_level)
    else:
        train_feature_data = dev_feature_data = test_feature_data = None
        feature2count = feature2id = id2feature = None

    char_embed = TransformerCharEmbed(vocab=data.get_vocab('words'),
                                      embed_size=embed_size,
                                      char_emb_size=embed_size,
                                      word_dropout=0,
                                      dropout=0.3,
                                      pool_method='max',
                                      activation='relu',
                                      min_char_freq=2,
                                      requires_grad=True,
                                      include_word_start_end=False,
                                      char_attn_type=char_type,
                                      char_n_head=3,
                                      char_dim_ffn=60,
                                      char_scale=char_type == 'naive',
                                      char_dropout=0.15,
                                      char_after_norm=True)

    word_embed = StaticEmbedding(vocab=data.get_vocab('words'),
                                 model_dir_or_name='en-glove-6b-100d',
                                 requires_grad=True,
                                 lower=True,
                                 word_dropout=0,
                                 dropout=0.5,
                                 only_norm_found_vector=normalize_embed)
    data.rename_field('words', 'chars')

    embed = ElmoEmbedding(vocab=data.get_vocab('chars'),
                          model_dir_or_name=elmo_model,
                          layers='mix',
                          requires_grad=False,
                          word_dropout=0.0,
                          dropout=0.5,
                          cache_word_reprs=False)
    embed.set_mix_weights_requires_grad()

    bert_embed = BertEmbedding(vocab=data.get_vocab('chars'),
                               model_dir_or_name=args.bert_model,
                               layers='-1',
                               pool_method="first",
                               word_dropout=0,
                               dropout=0.5,
                               include_cls_sep=False,
                               pooled_cls=True,
                               requires_grad=False,
                               auto_truncate=False)

    embed = StackEmbedding([embed, bert_embed, word_embed, char_embed],
                           dropout=0,
                           word_dropout=0.02)

    return data, embed, train_feature_data, dev_feature_data, test_feature_data, feature2count, feature2id, id2feature
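A hypothetical caller for this load_data() (a sketch; the unpacking simply mirrors the return statement, and get_dataset is fastNLP's DataBundle accessor):

(data, embed,
 train_feature_data, dev_feature_data, test_feature_data,
 feature2count, feature2id, id2feature) = load_data()

print(data.get_dataset('train'))             # inspect the processed datasets
print('stacked embedding dim:', embed.embedding_dim)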