def test_elmo_embedding_layer_assertion(self):
    """Expect ElmoEmbedding construction to raise AssertionError for layers='0,1,2'.

    A small vocabulary is built and the embedding is created from the local
    test model directory with ``layers='0,1,2'``.  The previous try/except
    form only printed a caught AssertionError and therefore passed even when
    no error was raised at all; ``assertRaises`` makes the expectation
    binding.
    """
    vocab = Vocabulary().add_word_lst("This is a test .".split())
    # NOTE(review): presumably '0,1,2' exceeds the layer count of the small
    # test model -- confirm against the model's config.
    with self.assertRaises(AssertionError) as raised:
        ElmoEmbedding(vocab, model_dir_or_name='tests/data_for_tests/embedding/small_elmo',
                      layers='0,1,2')
    # Keep the diagnostic output the original test produced.
    print(raised.exception)
def test_elmo_embedding(self):
    """Run ElmoEmbedding (layers '0,1') on one three-token batch and check the output size."""
    tokens = "This is a test .".split()
    vocabulary = Vocabulary().add_word_lst(tokens)
    embedding = ElmoEmbedding(
        vocabulary,
        model_dir_or_name='tests/data_for_tests/embedding/small_elmo',
        layers='0,1',
    )

    # A single sequence made of three word indices.
    word_ids = torch.LongTensor([[0, 1, 2]])
    output_size = embedding(word_ids).size()
    print(output_size)

    # Expected size: one sequence, three positions, embedding_dim features.
    expected_size = (1, 3, embedding.embedding_dim)
    self.assertEqual(output_size, expected_size)
def load_data():
    """Load the dataset, its neighbor-context features and a stacked ELMo + BERT embedding.

    Reads the module-level settings ``dataset``, ``encoding_type``,
    ``glove_path``, ``context_num``, ``elmo_model`` and ``args``.

    Returns:
        A 7-tuple ``(data, embed, train_feature_data, dev_feature_data,
        test_feature_data, context_word2id, context_id2word)``.
    """
    split_files = {
        "train": "../data/{}/train.txt".format(dataset),
        "test": "../data/{}/test.txt".format(dataset),
        "dev": "../data/{}/dev.txt".format(dataset),
    }
    pipe = WNUT_17NERPipe(encoding_type=encoding_type)
    data = pipe.process_from_file(split_files)

    # Neighbor lookup built from the word vocabulary; the save path is handed
    # to the helper together with the GloVe path.
    dict_save_path = os.path.join("../data/{}/data.pth".format(dataset))
    word2idx = data.get_vocab('words').word2idx
    neighbor_result = get_neighbor_for_vocab(word2idx, glove_path, dict_save_path)
    context_dict, context_word2id, context_id2word = neighbor_result

    feature_splits = build_instances("../data/{}".format(dataset), context_num, context_dict)
    train_feature_data, dev_feature_data, test_feature_data = feature_splits

    # Both embeddings below look the vocabulary up under the name 'chars',
    # so the field is renamed first.
    data.rename_field('words', 'chars')

    elmo_embed = ElmoEmbedding(
        vocab=data.get_vocab('chars'),
        model_dir_or_name=elmo_model,
        layers='mix',
        requires_grad=False,
        word_dropout=0.0,
        dropout=0.5,
        cache_word_reprs=False,
    )
    elmo_embed.set_mix_weights_requires_grad()

    bert_embed = BertEmbedding(
        vocab=data.get_vocab('chars'),
        model_dir_or_name=args.bert_model,
        layers='-1',
        pool_method=args.pool_method,
        word_dropout=0,
        dropout=0.5,
        include_cls_sep=False,
        pooled_cls=True,
        requires_grad=False,
        auto_truncate=False,
    )

    stacked_embed = StackEmbedding([elmo_embed, bert_embed], dropout=0, word_dropout=0.02)
    return (
        data,
        stacked_embed,
        train_feature_data,
        dev_feature_data,
        test_feature_data,
        context_word2id,
        context_id2word,
    )
def test_download_small(self):
    """Build ElmoEmbedding from the model name 'en-small' and print the output size for one batch.

    NOTE(review): judging by the test name this presumably fetches the
    pretrained model when it is not available locally -- confirm.
    """
    vocabulary = Vocabulary().add_word_lst("This is a test .".split())
    embedding = ElmoEmbedding(vocabulary, model_dir_or_name='en-small')

    # A single sequence made of three word indices.
    word_ids = torch.LongTensor([[0, 1, 2]])
    output = embedding(word_ids)
    print(output.size())
def load_data():
    """Load the configured NER dataset and build its stacked token embedding.

    Reads the module-level settings ``dataset``, ``encoding_type``,
    ``char_type`` and ``normalize_embed``.

    The stack always contains the ELMo and GloVe embeddings.  A character
    embedding is added when ``char_type`` is 'cnn', 'adatrans', 'naive' or
    'lstm'; for any other value no character embedding is built and it is
    left out of the stack.

    Returns:
        tuple: ``(data, embed)`` -- the processed data bundle and the
        ``StackEmbedding`` built on its vocabulary.

    Raises:
        ValueError: if ``dataset`` is neither 'conll2003' nor 'en-ontonotes'.
    """
    # Replace these paths with the local dataset locations.
    if dataset == 'conll2003':
        # The learning rate for conll2003 must not exceed 0.002.
        paths = {
            'test': "../data/conll2003/test.txt",
            'train': "../data/conll2003/train.txt",
            'dev': "../data/conll2003/dev.txt",
        }
        data = Conll2003NERPipe(
            encoding_type=encoding_type).process_from_file(paths)
    elif dataset == 'en-ontonotes':
        paths = '../data/en-ontonotes/english'
        data = OntoNotesNERPipe(
            encoding_type=encoding_type).process_from_file(paths)
    else:
        # Previously `data` stayed unbound here and the function failed
        # further down with an opaque UnboundLocalError.
        raise ValueError(
            "Unsupported dataset {!r}; expected 'conll2003' or 'en-ontonotes'.".format(dataset))

    char_embed = None
    if char_type == 'cnn':
        char_embed = CNNCharEmbedding(vocab=data.get_vocab('words'), embed_size=30, char_emb_size=30,
                                      filter_nums=[30], kernel_sizes=[3], word_dropout=0, dropout=0.3,
                                      pool_method='max', include_word_start_end=False, min_char_freq=2)
    elif char_type in ['adatrans', 'naive']:
        char_embed = TransformerCharEmbed(vocab=data.get_vocab('words'), embed_size=30, char_emb_size=30,
                                          word_dropout=0, dropout=0.3, pool_method='max', activation='relu',
                                          min_char_freq=2, requires_grad=True, include_word_start_end=False,
                                          char_attn_type=char_type, char_n_head=3, char_dim_ffn=60,
                                          char_scale=char_type == 'naive', char_dropout=0.15,
                                          char_after_norm=True)
    elif char_type == 'lstm':
        char_embed = LSTMCharEmbedding(vocab=data.get_vocab('words'), embed_size=30, char_emb_size=30,
                                       word_dropout=0, dropout=0.3, hidden_size=100, pool_method='max',
                                       activation='relu', min_char_freq=2, bidirectional=True,
                                       requires_grad=True, include_word_start_end=False)

    word_embed = StaticEmbedding(vocab=data.get_vocab('words'), model_dir_or_name='en-glove-6b-100d',
                                 requires_grad=True, lower=True, word_dropout=0, dropout=0.5,
                                 only_norm_found_vector=normalize_embed)

    # The ELMo embedding looks the vocabulary up under the name 'chars'.
    data.rename_field('words', 'chars')
    elmo_embed = ElmoEmbedding(vocab=data.get_vocab('chars'), model_dir_or_name='en-original', layers='mix',
                               requires_grad=False, word_dropout=0.0, dropout=0.5, cache_word_reprs=False)
    elmo_embed.set_mix_weights_requires_grad()

    # char_embed is still None for an unrecognised char_type; leave it out
    # rather than handing None to StackEmbedding.
    components = [elmo_embed, word_embed]
    if char_embed is not None:
        components.append(char_embed)
    embed = StackEmbedding(components, dropout=0, word_dropout=0.02)
    return data, embed
tokenizer=arg.tokenizer).process_from_file() elif arg.task == 'mnli': data_bundle = MNLIPipe(lower=arg.to_lower, tokenizer=arg.tokenizer).process_from_file() elif arg.task == 'quora': data_bundle = QuoraPipe(lower=arg.to_lower, tokenizer=arg.tokenizer).process_from_file() else: raise RuntimeError(f'NOT support {arg.task} task yet!') print(data_bundle) # print details in data_bundle # load embedding if arg.embedding == 'elmo': embedding = ElmoEmbedding(data_bundle.vocabs[Const.INPUTS(0)], model_dir_or_name='en-medium', requires_grad=True) elif arg.embedding == 'glove': embedding = StaticEmbedding(data_bundle.vocabs[Const.INPUTS(0)], model_dir_or_name='en-glove-840b-300d', requires_grad=True, normalize=False) else: raise RuntimeError(f'NOT support {arg.embedding} embedding yet!') # define model model = ESIM(embedding, num_labels=len(data_bundle.vocabs[Const.TARGET])) # define optimizer and callback optimizer = Adamax(lr=arg.lr, params=model.parameters()) scheduler = StepLR(optimizer, step_size=10, gamma=0.5) # 每10个epoch学习率变为原来的0.5倍
def load_data():
    """Load the dataset, optional knowledge features and a four-way stacked embedding.

    Reads the module-level settings ``dataset``, ``encoding_type``,
    ``knowledge``, ``feature_level``, ``embed_size``, ``char_type``,
    ``normalize_embed``, ``elmo_model`` and ``args``.

    Returns:
        An 8-tuple ``(data, embed, train_feature_data, dev_feature_data,
        test_feature_data, feature2count, feature2id, id2feature)``.  The six
        feature-related entries are all ``None`` when ``knowledge`` is falsy.
    """
    if dataset == 'ON5e':
        paths = 'data/ON5e/english'
        pipe = OntoNotesNERPipe(encoding_type=encoding_type)
    else:
        paths = {
            "train": "data/{}/train.txt".format(dataset),
            "dev": "data/{}/dev.txt".format(dataset),
            "test": "data/{}/test.txt".format(dataset),
        }
        pipe = ENNERPipe(encoding_type=encoding_type)
    data = pipe.process_from_file(paths)

    # Knowledge features are only generated on request; otherwise every slot is None.
    if knowledge:
        knowledge_outputs = generate_knowledge_api(
            os.path.join("data", dataset), "all", feature_level)
    else:
        knowledge_outputs = (None,) * 6
    (train_feature_data, dev_feature_data, test_feature_data,
     feature2count, feature2id, id2feature) = knowledge_outputs

    char_embed = TransformerCharEmbed(
        vocab=data.get_vocab('words'),
        embed_size=embed_size,
        char_emb_size=embed_size,
        word_dropout=0,
        dropout=0.3,
        pool_method='max',
        activation='relu',
        min_char_freq=2,
        requires_grad=True,
        include_word_start_end=False,
        char_attn_type=char_type,
        char_n_head=3,
        char_dim_ffn=60,
        char_scale=(char_type == 'naive'),
        char_dropout=0.15,
        char_after_norm=True,
    )
    word_embed = StaticEmbedding(
        vocab=data.get_vocab('words'),
        model_dir_or_name='en-glove-6b-100d',
        requires_grad=True,
        lower=True,
        word_dropout=0,
        dropout=0.5,
        only_norm_found_vector=normalize_embed,
    )

    # The ELMo and BERT embeddings look the vocabulary up under the name 'chars'.
    data.rename_field('words', 'chars')

    elmo_embed = ElmoEmbedding(
        vocab=data.get_vocab('chars'),
        model_dir_or_name=elmo_model,
        layers='mix',
        requires_grad=False,
        word_dropout=0.0,
        dropout=0.5,
        cache_word_reprs=False,
    )
    elmo_embed.set_mix_weights_requires_grad()

    bert_embed = BertEmbedding(
        vocab=data.get_vocab('chars'),
        model_dir_or_name=args.bert_model,
        layers='-1',
        pool_method="first",
        word_dropout=0,
        dropout=0.5,
        include_cls_sep=False,
        pooled_cls=True,
        requires_grad=False,
        auto_truncate=False,
    )

    stacked_embed = StackEmbedding(
        [elmo_embed, bert_embed, word_embed, char_embed],
        dropout=0,
        word_dropout=0.02,
    )
    return (
        data,
        stacked_embed,
        train_feature_data,
        dev_feature_data,
        test_feature_data,
        feature2count,
        feature2id,
        id2feature,
    )