def test_same_vector3(self):
    # Verify that lower=True maps cased variants to the same vector.
    word_lst = ["The", "the"]
    no_create_word_lst = ['of', 'Of', 'With', 'with']
    vocab = Vocabulary().add_word_lst(word_lst)
    vocab.add_word_lst(no_create_word_lst, no_create_entry=True)
    embed = StaticEmbedding(vocab,
                            model_dir_or_name='en-glove-6B-100d',
                            lower=True)
    words = torch.LongTensor(
        [[vocab.to_index(word) for word in word_lst + no_create_word_lst]])
    words = embed(words)

    lowered_word_lst = [word.lower() for word in word_lst]
    lowered_no_create_word_lst = [
        word.lower() for word in no_create_word_lst
    ]
    lowered_vocab = Vocabulary().add_word_lst(lowered_word_lst)
    lowered_vocab.add_word_lst(lowered_no_create_word_lst,
                               no_create_entry=True)
    lowered_embed = StaticEmbedding(lowered_vocab,
                                    model_dir_or_name='en-glove-6B-100d',
                                    lower=False)
    lowered_words = torch.LongTensor([[
        lowered_vocab.to_index(word)
        for word in lowered_word_lst + lowered_no_create_word_lst
    ]])
    lowered_words = lowered_embed(lowered_words)

    all_words = word_lst + no_create_word_lst
    for idx, (word_i, word_j) in enumerate(zip(words[0], lowered_words[0])):
        with self.subTest(idx=idx, word=all_words[idx]):
            assert torch.sum(word_i == word_j).eq(lowered_embed.embed_size)
def get_data():
    data_bundle = ChineseNERLoader().process('MSRA-NER/', bigrams=True)
    char_embed = StaticEmbedding(data_bundle.vocabs['chars'],
                                 model_dir_or_name='cn-char')
    bigram_embed = StaticEmbedding(data_bundle.vocabs['bigrams'],
                                   model_dir_or_name='cn-bigram')
    return data_bundle, char_embed, bigram_embed
def test_same_vector4(self):
    # Verify lower=True in combination with min_freq.
    word_lst = ["The", "the", "the", "The", "a", "A"]
    no_create_word_lst = ['of', 'Of', "Of", "of", 'With', 'with']
    all_words = word_lst[:-2] + no_create_word_lst[:-2]
    vocab = Vocabulary(min_freq=2).add_word_lst(word_lst)
    vocab.add_word_lst(no_create_word_lst, no_create_entry=True)
    embed = StaticEmbedding(vocab,
                            model_dir_or_name='en-glove-6B-100d',
                            lower=True)
    words = torch.LongTensor([[vocab.to_index(word) for word in all_words]])
    words = embed(words)

    lowered_word_lst = [word.lower() for word in word_lst]
    lowered_no_create_word_lst = [
        word.lower() for word in no_create_word_lst
    ]
    lowered_vocab = Vocabulary().add_word_lst(lowered_word_lst)
    lowered_vocab.add_word_lst(lowered_no_create_word_lst,
                               no_create_entry=True)
    lowered_embed = StaticEmbedding(lowered_vocab,
                                    model_dir_or_name='en-glove-6B-100d',
                                    lower=False)
    lowered_words = torch.LongTensor(
        [[lowered_vocab.to_index(word.lower()) for word in all_words]])
    lowered_words = lowered_embed(lowered_words)

    for idx in range(len(all_words)):
        word_i, word_j = words[0, idx], lowered_words[0, idx]
        with self.subTest(idx=idx, word=all_words[idx]):
            assert torch.sum(word_i == word_j).eq(lowered_embed.embed_size)
def test_same_vector5(self):
    # Check that words kept after applying min_freq still get identical vectors.
    word_lst = ["they", "the", "they", "the", 'he', 'he', "a", "A"]
    no_create_word_lst = ['of', "of", "she", "she", 'With', 'with']
    all_words = word_lst[:-2] + no_create_word_lst[:-2]
    vocab = Vocabulary().add_word_lst(word_lst)
    vocab.add_word_lst(no_create_word_lst, no_create_entry=True)
    embed = StaticEmbedding(vocab,
                            model_dir_or_name='en-glove-6B-100d',
                            lower=False,
                            min_freq=2)
    words = torch.LongTensor([[vocab.to_index(word) for word in all_words]])
    words = embed(words)

    min_freq_vocab = Vocabulary(min_freq=2).add_word_lst(word_lst)
    min_freq_vocab.add_word_lst(no_create_word_lst, no_create_entry=True)
    min_freq_embed = StaticEmbedding(min_freq_vocab,
                                     model_dir_or_name='en-glove-6B-100d',
                                     lower=False)
    min_freq_words = torch.LongTensor(
        [[min_freq_vocab.to_index(word.lower()) for word in all_words]])
    min_freq_words = min_freq_embed(min_freq_words)

    for idx in range(len(all_words)):
        word_i, word_j = words[0, idx], min_freq_words[0, idx]
        with self.subTest(idx=idx, word=all_words[idx]):
            assert torch.sum(word_i == word_j).eq(min_freq_embed.embed_size)
def load_data():
    paths = {
        'train': 'data/{}/train.txt'.format(dataset),
        'dev': 'data/{}/dev.txt'.format(dataset),
        'test': 'data/{}/test.txt'.format(dataset)
    }
    min_freq = 2
    data_bundle = CNNERPipe(
        bigrams=True, encoding_type=encoding_type).process_from_file(paths)

    dict_save_path = os.path.join("data/{}/data.pth".format(dataset))
    context_dict, context_word2id, context_id2word = get_neighbor_for_vocab(
        data_bundle.get_vocab('chars').word2idx, glove_path, dict_save_path)
    train_feature_data, dev_feature_data, test_feature_data = build_instances(
        "data/{}".format(dataset), context_num, context_dict)

    embed = StaticEmbedding(
        data_bundle.get_vocab('chars'),
        model_dir_or_name='data/gigaword_chn.all.a2b.uni.ite50.vec',
        min_freq=1,
        only_norm_found_vector=normalize_embed,
        word_dropout=0.01,
        dropout=0.3)
    bi_embed = StaticEmbedding(
        data_bundle.get_vocab('bigrams'),
        model_dir_or_name='data/gigaword_chn.all.a2b.bi.ite50.vec',
        word_dropout=0.02,
        dropout=0.3,
        min_freq=min_freq,
        only_norm_found_vector=normalize_embed,
        only_train_min_freq=True)
    tencent_embed = StaticEmbedding(
        data_bundle.get_vocab('chars'),
        model_dir_or_name='data/tencent_unigram.txt',
        min_freq=min_freq,
        only_norm_found_vector=normalize_embed,
        word_dropout=0.01,
        dropout=0.3)
    bert_embed = BertEmbedding(vocab=data_bundle.get_vocab('chars'),
                               model_dir_or_name=args.bert_model,
                               layers='-1',
                               pool_method=args.pool_method,
                               word_dropout=0,
                               dropout=0.5,
                               include_cls_sep=False,
                               pooled_cls=True,
                               requires_grad=False,
                               auto_truncate=False)
    embed = StackEmbedding([embed, tencent_embed, bert_embed],
                           dropout=0,
                           word_dropout=0.02)

    return (data_bundle, embed, bi_embed, train_feature_data,
            dev_feature_data, test_feature_data, context_word2id,
            context_id2word)
def prepare_data():
    data_bundle = CWSShiftRelayPipe(dataset_name=data_name,
                                    L=L).process_from_file()
    # Pre-trained character embedding and bigram embedding.
    char_embed = StaticEmbedding(
        data_bundle.get_vocab('chars'),
        dropout=0.5,
        word_dropout=0.01,
        model_dir_or_name='~/exps/CWS/pretrain/vectors/1grams_t3_m50_corpus.txt')
    bigram_embed = StaticEmbedding(
        data_bundle.get_vocab('bigrams'),
        dropout=0.5,
        min_freq=3,
        word_dropout=0.01,
        model_dir_or_name='~/exps/CWS/pretrain/vectors/2grams_t3_m50_corpus.txt')
    return data_bundle, char_embed, bigram_embed
def get_data():
    data_bundle = WeiboNERLoader().load()
    data_bundle = ChineseNERPipe(encoding_type='bioes',
                                 bigram=True).process(data_bundle)
    char_embed = StaticEmbedding(data_bundle.get_vocab(C.CHAR_INPUT),
                                 model_dir_or_name='cn-fasttext')
    bigram_embed = StaticEmbedding(data_bundle.get_vocab('bigrams'),
                                   embedding_dim=100,
                                   min_freq=3)
    return data_bundle, char_embed, bigram_embed
def load_data():
    paths = {
        'train': 'data/{}/train.txt'.format(dataset),
        'dev': 'data/{}/dev.txt'.format(dataset),
        'test': 'data/{}/test.txt'.format(dataset)
    }
    min_freq = 1
    data_bundle = CNNERPipe(
        bigrams=True, encoding_type=encoding_type).process_from_file(paths)

    (train_feature_data, dev_feature_data, test_feature_data,
     feature2count, feature2id, id2feature) = generate_knowledge_api(
        os.path.join("data", dataset), "all", args.feature_level)

    embed = StaticEmbedding(
        data_bundle.get_vocab('chars'),
        model_dir_or_name='data/gigaword_chn.all.a2b.uni.ite50.vec',
        min_freq=1,
        only_norm_found_vector=normalize_embed,
        word_dropout=0.01,
        dropout=0.3)
    tencent_embed = StaticEmbedding(
        data_bundle.get_vocab('chars'),
        model_dir_or_name='data/tencent_unigram.txt',
        min_freq=1,
        only_norm_found_vector=normalize_embed,
        word_dropout=0.01,
        dropout=0.3)
    bi_embed = StaticEmbedding(
        data_bundle.get_vocab('bigrams'),
        model_dir_or_name='data/gigaword_chn.all.a2b.bi.ite50.vec',
        word_dropout=0.02,
        dropout=0.3,
        min_freq=min_freq,
        only_norm_found_vector=normalize_embed,
        only_train_min_freq=True)
    bert_embed = BertEmbedding(vocab=data_bundle.get_vocab('chars'),
                               model_dir_or_name=args.bert_model,
                               layers='-1',
                               pool_method=args.pool_method,
                               word_dropout=0,
                               dropout=0.5,
                               include_cls_sep=False,
                               pooled_cls=True,
                               requires_grad=False,
                               auto_truncate=False)
    embed = StackEmbedding([embed, tencent_embed, bert_embed],
                           dropout=0,
                           word_dropout=0.02)

    return (data_bundle, embed, bi_embed, train_feature_data,
            dev_feature_data, test_feature_data, feature2count, feature2id,
            id2feature)
def load_ner_data():
    paths = {
        'train': 'data/{}/train.txt'.format(dataset),
        'dev': 'data/{}/dev.txt'.format(dataset),
        'test': 'data/{}/test.txt'.format(dataset)
    }
    min_freq = 2
    data_bundle = CNNERPipe(
        bigrams=True, encoding_type=encoding_type).process_from_file(paths)
    # train_list = data_bundle.get_dataset('train')['raw_chars']

    embed = StaticEmbedding(
        data_bundle.get_vocab('chars'),
        model_dir_or_name='data/gigaword_chn.all.a2b.uni.ite50.vec',
        min_freq=1,
        only_norm_found_vector=normalize_embed,
        word_dropout=0.01,
        dropout=0.3)
    bi_embed = StaticEmbedding(
        data_bundle.get_vocab('bigrams'),
        model_dir_or_name='data/gigaword_chn.all.a2b.bi.ite50.vec',
        word_dropout=0.02,
        dropout=0.3,
        min_freq=2,
        only_norm_found_vector=normalize_embed,
        only_train_min_freq=True)
    tencent_embed = StaticEmbedding(
        data_bundle.get_vocab('chars'),
        model_dir_or_name='data/tencent_unigram.txt',
        min_freq=1,
        only_norm_found_vector=normalize_embed,
        word_dropout=0.01,
        dropout=0.3)
    bert_embed = BertEmbedding(vocab=data_bundle.get_vocab('chars'),
                               model_dir_or_name=args.bert_model,
                               layers='-1',
                               pool_method=args.pool_method,
                               word_dropout=0,
                               dropout=0.5,
                               include_cls_sep=False,
                               pooled_cls=True,
                               requires_grad=False,
                               auto_truncate=True)
    # embed = StackEmbedding([tencent_embed, bert_embed], dropout=0, word_dropout=0.02)
    embed = StackEmbedding([embed, tencent_embed, bert_embed],
                           dropout=0,
                           word_dropout=0.02)
    return data_bundle, embed, bi_embed
def load_data():
    # Replace the paths below as needed.
    if dataset == 'ontonotes':
        paths = {
            'train': '../data/OntoNote4NER/train.char.bmes',
            'dev': '../data/OntoNote4NER/dev.char.bmes',
            'test': '../data/OntoNote4NER/test.char.bmes'
        }
        min_freq = 2
    elif dataset == 'weibo':
        paths = {
            'train': '../data/WeiboNER/train.all.bmes',
            'dev': '../data/WeiboNER/dev.all.bmes',
            'test': '../data/WeiboNER/test.all.bmes'
        }
        min_freq = 1
    elif dataset == 'resume':
        paths = {
            'train': '../data/ResumeNER/train.char.bmes',
            'dev': '../data/ResumeNER/dev.char.bmes',
            'test': '../data/ResumeNER/test.char.bmes'
        }
        min_freq = 1
    elif dataset == 'msra':
        paths = {
            'train': '../data/MSRANER/train_dev.char.bmes',
            'dev': '../data/MSRANER/test.char.bmes',
            'test': '../data/MSRANER/test.char.bmes'
        }
        min_freq = 2
    data_bundle = CNNERPipe(
        bigrams=True, encoding_type=encoding_type).process_from_file(paths)
    embed = StaticEmbedding(
        data_bundle.get_vocab('chars'),
        model_dir_or_name='../data/gigaword_chn.all.a2b.uni.ite50.vec',
        min_freq=1,
        only_norm_found_vector=normalize_embed,
        word_dropout=0.01,
        dropout=0.3)
    bi_embed = StaticEmbedding(
        data_bundle.get_vocab('bigrams'),
        model_dir_or_name='../data/gigaword_chn.all.a2b.bi.ite50.vec',
        word_dropout=0.02,
        dropout=0.3,
        min_freq=min_freq,
        only_norm_found_vector=normalize_embed,
        only_train_min_freq=True)
    return data_bundle, embed, bi_embed
def load_data():
    # paths = {'test': "../data/conll2003/test.txt",
    #          'train': "../data/conll2003/train.txt",
    #          'dev': "../data/conll2003/dev.txt"}
    paths = {'test': args.test, 'train': args.train, 'dev': args.dev}
    data = Conll2003NERPipe(
        encoding_type=encoding_type).process_from_file(paths)
    char_embed = None
    if char_type == 'cnn':
        char_embed = CNNCharEmbedding(vocab=data.get_vocab('words'),
                                      embed_size=30,
                                      char_emb_size=30,
                                      filter_nums=[30],
                                      kernel_sizes=[3],
                                      word_dropout=0,
                                      dropout=0.3,
                                      pool_method='max',
                                      include_word_start_end=False,
                                      min_char_freq=2)
    elif char_type == 'lstm':
        char_embed = LSTMCharEmbedding(vocab=data.get_vocab('words'),
                                       embed_size=30,
                                       char_emb_size=30,
                                       word_dropout=0,
                                       dropout=0.3,
                                       hidden_size=100,
                                       pool_method='max',
                                       activation='relu',
                                       min_char_freq=2,
                                       bidirectional=True,
                                       requires_grad=True,
                                       include_word_start_end=False)
    word_embed = StaticEmbedding(vocab=data.get_vocab('words'),
                                 model_dir_or_name='en-glove-6b-100d',
                                 requires_grad=True,
                                 lower=True,
                                 word_dropout=0,
                                 dropout=0.5,
                                 only_norm_found_vector=normalize_embed)
    if char_embed is not None:
        embed = StackEmbedding([word_embed, char_embed],
                               dropout=0,
                               word_dropout=0.02)
    else:
        word_embed.word_drop = 0.02
        embed = word_embed
    data.rename_field('words', 'chars')
    return data, embed
def test_case(self):
    vocab = Vocabulary().add_word_lst("This is a test .".split())
    vocab.add_word_lst("Another test !".split())
    embed = StaticEmbedding(vocab, embedding_dim=10)
    encoder_output = torch.randn(2, 3, 10)
    src_seq_len = torch.LongTensor([3, 2])
    encoder_mask = seq_len_to_mask(src_seq_len)

    for flag in [True, False]:
        with self.subTest(bind_decoder_input_output_embed=flag):
            decoder = TransformerSeq2SeqDecoder(
                embed=embed,
                pos_embed=None,
                d_model=10,
                num_layers=2,
                n_head=5,
                dim_ff=20,
                dropout=0.1,
                # use the subTest flag instead of a hard-coded True
                bind_decoder_input_output_embed=flag)
            state = decoder.init_state(encoder_output, encoder_mask)
            output = decoder(tokens=torch.randint(0, len(vocab), size=(2, 4)),
                             state=state)
            self.assertEqual(output.size(), (2, 4, len(vocab)))
def load_conllized_ontonote_pkl(path, embedding_path=None):
    data_bundle = pickle.load(open(path, 'rb'))
    train_set = data_bundle.datasets['train']
    dev_set = data_bundle.datasets['dev']
    test_set = data_bundle.datasets['test']

    train_set.rename_field('pos', 'posid')
    train_set.rename_field('ner', 'nerid')
    train_set.rename_field('chunk', 'chunkid')
    dev_set.rename_field('pos', 'posid')
    dev_set.rename_field('ner', 'nerid')
    dev_set.rename_field('chunk', 'chunkid')
    test_set.rename_field('pos', 'posid')
    test_set.rename_field('ner', 'nerid')
    test_set.rename_field('chunk', 'chunkid')

    word_vocab = data_bundle.vocabs['words']
    pos_vocab = data_bundle.vocabs['pos']
    ner_vocab = data_bundle.vocabs['ner']
    chunk_vocab = data_bundle.vocabs['chunk']

    if embedding_path is not None:
        embed = StaticEmbedding(vocab=word_vocab,
                                model_dir_or_name=embedding_path,
                                word_dropout=0.01,
                                dropout=0.5,
                                lower=True)
        return (train_set, dev_set, test_set), \
               (word_vocab, pos_vocab, ner_vocab, chunk_vocab), embed
    else:
        return (train_set, dev_set, test_set), (word_vocab, ner_vocab)
def test_case(self):
    vocab = Vocabulary().add_word_lst("This is a test .".split())
    vocab.add_word_lst("Another test !".split())
    embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=10)
    encoder_output = torch.randn(2, 3, 10)
    tgt_words_idx = torch.LongTensor([[1, 2, 3, 4], [2, 3, 0, 0]])
    src_seq_len = torch.LongTensor([3, 2])
    encoder_mask = seq_len_to_mask(src_seq_len)

    for flag in [True, False]:
        for attention in [True, False]:
            with self.subTest(bind_decoder_input_output_embed=flag,
                              attention=attention):
                decoder = LSTMSeq2SeqDecoder(
                    embed=embed,
                    num_layers=2,
                    hidden_size=10,
                    dropout=0.3,
                    bind_decoder_input_output_embed=flag,
                    attention=attention)
                state = decoder.init_state(encoder_output, encoder_mask)
                output = decoder(tgt_words_idx, state)
                self.assertEqual(tuple(output.size()), (2, 4, len(vocab)))
def test_search(self):
    """Semantic search (guards against: TypeError: expected dimension <= 2 array or matrix)."""
    print('{} test_search {}'.format('-' * 15, '-' * 15))
    texts = ['温都尔站', '东乌广厦', '国电四郎', '阿尔善站', '朱日和基']

    # Vectorize the texts.
    vocab = Vocabulary()
    for text in texts:
        vocab.add_word_lst(list(text))
    print(len(vocab))
    embed = StaticEmbedding(
        vocab, model_dir_or_name='./data/cn_char_fastnlp_100d.txt')
    texts_to_id = [[vocab.to_index(word) for word in list(text)]
                   for text in texts]
    words = torch.LongTensor(texts_to_id)  # convert the texts to indices
    features_vec = embed(words)
    print(features_vec.shape)

    # Build the search index.
    cp = ci.MultiClusterIndex(features_vec.detach().numpy(), texts)

    search_texts = ['朱日和站', '温都尔站', '国电站']
    for text in search_texts:
        texts_to_id = [[vocab.to_index(word) for word in list(text)]]
        words = torch.LongTensor(texts_to_id)  # convert the query to indices
        features_vec = embed(words)
        search_features_vec = features_vec.detach().numpy()
        search_result = cp.search(search_features_vec,
                                  k=2,
                                  k_clusters=2,
                                  return_distance=True)
        print('text:{}'.format(text))
        print('search_result:{}'.format(search_result))
def __init__(self,
             vocab,
             hidden_size,
             num_layers,
             n_class_per_task,
             dropout=0.5):
    super().__init__()
    # word_embed = nn.Embedding(len(vocab), 50)
    word_embed = StaticEmbedding(vocab=vocab,
                                 embedding_dim=50,
                                 word_dropout=0,
                                 dropout=dropout,
                                 lower=True)
    self.word_embed = word_embed
    emb_dim = self.word_embed.embedding_dim
    self.lstm = fastNLP.modules.LSTM(
        input_size=emb_dim,
        hidden_size=hidden_size,
        num_layers=num_layers,
        bidirectional=True,
        batch_first=True,
    )
    self.out = nn.ModuleList(
        [nn.Linear(hidden_size * 2, i) for i in n_class_per_task])
    self.dropout = nn.Dropout(dropout)
    self.loss = nn.CrossEntropyLoss()

    for name, param in self.named_parameters():
        if "out" in name:
            if param.data.dim() > 1:
                nn.init.xavier_uniform_(param)
            else:
                nn.init.constant_(param, 0)
def load_conllized_ontonote_NER(path, embedding_path=None):
    from fastNLP.io.pipe.conll import OntoNotesNERPipe
    ontoNotesNERPipe = OntoNotesNERPipe(lower=True, target_pad_val=-100)
    bundle_NER = ontoNotesNERPipe.process_from_file(path)

    train_set_NER = bundle_NER.datasets['train']
    dev_set_NER = bundle_NER.datasets['dev']
    test_set_NER = bundle_NER.datasets['test']

    train_set_NER.add_seq_len('words', 'seq_len')
    dev_set_NER.add_seq_len('words', 'seq_len')
    test_set_NER.add_seq_len('words', 'seq_len')

    NER_vocab = bundle_NER.get_vocab('target')
    word_vocab = bundle_NER.get_vocab('words')

    if embedding_path is not None:
        embed = StaticEmbedding(vocab=word_vocab,
                                model_dir_or_name=embedding_path,
                                word_dropout=0.01,
                                dropout=0.5,
                                lower=True)
        # pretrained_embedding = load_word_emb(embedding_path, 300, word_vocab)
        return (train_set_NER, dev_set_NER, test_set_NER), \
               (word_vocab, NER_vocab), embed
    else:
        return (train_set_NER, dev_set_NER, test_set_NER), (NER_vocab, word_vocab)
def __init__(self, vocab, d_model):
    super().__init__()
    self.emb = StaticEmbedding(vocab, model_dir_or_name="en-glove-840b-300d")
    self.emb_ln = nn.Linear(300, d_model)
    self.reset_params()
def test_same_vector(self):
    vocab = Vocabulary().add_word_lst(["The", "the", "THE"])
    embed = StaticEmbedding(vocab,
                            model_dir_or_name=None,
                            embedding_dim=5,
                            lower=True)
    words = torch.LongTensor(
        [[vocab.to_index(word) for word in ["The", "the", "THE"]]])
    words = embed(words)
    embed_0 = words[0, 0]
    for i in range(1, words.size(1)):
        assert torch.sum(embed_0 == words[0, i]).eq(len(embed_0))
def test_case(self):
    vocab = Vocabulary().add_word_lst("This is a test .".split())
    embed = StaticEmbedding(vocab, embedding_dim=5)
    encoder = TransformerSeq2SeqEncoder(embed,
                                        num_layers=2,
                                        d_model=10,
                                        n_head=2)
    words_idx = torch.LongTensor([0, 1, 2]).unsqueeze(0)
    seq_len = torch.LongTensor([3])
    encoder_output, encoder_mask = encoder(words_idx, seq_len)
    self.assertEqual(encoder_output.size(), (1, 3, 10))
def get_data():
    data_bundle = CWSPipe(dataset_name=dataname,
                          bigrams=True,
                          trigrams=False).process_from_file()
    char_embed = StaticEmbedding(
        data_bundle.get_vocab('chars'),
        dropout=0.33,
        word_dropout=0.01,
        model_dir_or_name='~/exps/CWS/pretrain/vectors/1grams_t3_m50_corpus.txt')
    bigram_embed = StaticEmbedding(
        data_bundle.get_vocab('bigrams'),
        dropout=0.33,
        min_freq=3,
        word_dropout=0.01,
        model_dir_or_name='~/exps/CWS/pretrain/vectors/2grams_t3_m50_corpus.txt')
    return data_bundle, char_embed, bigram_embed
def test_norm1(self):
    # Only vectors found in the pretrained file should be normalized.
    vocab = Vocabulary().add_word_lst(['the', 'a', 'notinfile'])
    embed = StaticEmbedding(
        vocab,
        model_dir_or_name='test/data_for_tests/embedding/small_static_embedding/'
                          'glove.6B.50d_test.txt',
        only_norm_found_vector=True)
    self.assertEqual(
        round(torch.norm(embed(torch.LongTensor([[2]]))).item(), 4), 1)
    self.assertNotEqual(torch.norm(embed(torch.LongTensor([[4]]))).item(), 1)
def load_conll_2003_mine(path, embedding_path=None, pad_val=-100):
    f = open(path, 'rb')
    data_pkl = pickle.load(f)
    # print(data_pkl)
    train_set = data_pkl[0]['train']
    dev_set = data_pkl[0]['dev']
    test_set = data_pkl[0]['test']

    train_set.set_pad_val('posid', pad_val)
    train_set.set_pad_val('nerid', pad_val)
    train_set.set_pad_val('chunkid', pad_val)
    dev_set.set_pad_val('posid', pad_val)
    dev_set.set_pad_val('nerid', pad_val)
    dev_set.set_pad_val('chunkid', pad_val)
    test_set.set_pad_val('posid', pad_val)
    test_set.set_pad_val('nerid', pad_val)
    test_set.set_pad_val('chunkid', pad_val)

    if train_set.has_field('task_id'):
        train_set.delete_field('task_id')
    if dev_set.has_field('task_id'):
        dev_set.delete_field('task_id')
    if test_set.has_field('task_id'):
        test_set.delete_field('task_id')

    if train_set.has_field('words_idx'):
        train_set.rename_field('words_idx', 'words')
    if dev_set.has_field('words_idx'):
        dev_set.rename_field('words_idx', 'words')
    if test_set.has_field('words_idx'):
        test_set.rename_field('words_idx', 'words')

    word_vocab = data_pkl[1]['words']
    pos_vocab = data_pkl[1]['pos']
    ner_vocab = data_pkl[1]['ner']
    chunk_vocab = data_pkl[1]['chunk']

    if embedding_path is not None:
        embed = StaticEmbedding(vocab=word_vocab,
                                model_dir_or_name=embedding_path,
                                word_dropout=0.01,
                                dropout=0.5,
                                lower=True)
        return (train_set, dev_set, test_set), \
               (word_vocab, pos_vocab, ner_vocab, chunk_vocab), embed
    else:
        return (train_set, dev_set, test_set), \
               (word_vocab, pos_vocab, ner_vocab, chunk_vocab)
def test_dropword(self):
    # Check that the embedding still runs with word_dropout applied.
    vocab = Vocabulary().add_word_lst([chr(i) for i in range(1, 200)])
    embed = StaticEmbedding(vocab,
                            model_dir_or_name=None,
                            embedding_dim=10,
                            dropout=0.1,
                            word_dropout=0.4)
    for i in range(10):
        length = torch.randint(1, 50, (1,)).item()
        batch = torch.randint(1, 4, (1,)).item()
        words = torch.randint(1, 200, (batch, length)).long()
        embed(words)
def load_data():
    print('loading data')
    data = OntoNotesNERPipe(encoding_type=encoding_type).process_from_file(
        paths=get_path('workdir/datasets/ontonotes-v4'))
    print('loading embedding')
    word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT],
                                 model_dir_or_name='en-glove-840b-300',
                                 requires_grad=True)
    return data, [word_embed]
def test_case(self):
    vocab = Vocabulary().add_word_lst("This is a test .".split())
    embed = StaticEmbedding(vocab, embedding_dim=5)
    encoder = LSTMSeq2SeqEncoder(embed, hidden_size=5, num_layers=1)
    words_idx = torch.LongTensor([0, 1, 2]).unsqueeze(0)
    seq_len = torch.LongTensor([3])
    encoder_output, encoder_mask = encoder(words_idx, seq_len)
    self.assertEqual(encoder_mask.size(), (1, 3))
def load_data():
    if dataset == 'vlsp2016':
        paths = {
            'test': "./data_2/test.txt",
            'train': "./data_2/train.txt",
            'dev': "./data_2/dev.txt"
        }
    data = VLSP2016NERPipe(
        encoding_type=encoding_type).process_from_file(paths)
    # data.get_vocab('words').clear()
    vocab = []
    with open("vocab.txt", 'r') as files:
        for word in files:
            vocab.append(word.replace("\n", ""))
    data.get_vocab('words').add_word_lst(vocab)

    char_embed = None
    if char_type == 'cnn':
        char_embed = CNNCharEmbedding(vocab=data.get_vocab('words'),
                                      embed_size=30,
                                      char_emb_size=30,
                                      filter_nums=[30],
                                      kernel_sizes=[3],
                                      word_dropout=0,
                                      dropout=0.3,
                                      pool_method='max',
                                      include_word_start_end=False,
                                      min_char_freq=2)
    elif char_type in ['adatrans', 'naive']:
        char_embed = TransformerCharEmbed(vocab=data.get_vocab('words'),
                                          embed_size=30,
                                          char_emb_size=30,
                                          word_dropout=0,
                                          dropout=0.3,
                                          pool_method='max',
                                          activation='relu',
                                          min_char_freq=2,
                                          requires_grad=True,
                                          include_word_start_end=False,
                                          char_attn_type=char_type,
                                          char_n_head=3,
                                          char_dim_ffn=60,
                                          char_scale=char_type == 'naive',
                                          char_dropout=0.15,
                                          char_after_norm=True)
    elif char_type == 'bilstm':
        char_embed = LSTMCharEmbedding(vocab=data.get_vocab('words'),
                                       embed_size=30,
                                       char_emb_size=30,
                                       word_dropout=0,
                                       dropout=0.3,
                                       hidden_size=100,
                                       pool_method='max',
                                       activation='relu',
                                       min_char_freq=2,
                                       bidirectional=True,
                                       requires_grad=True,
                                       include_word_start_end=False)
    elif char_type == 'lstm':
        char_embed = LSTMCharEmbedding(vocab=data.get_vocab('words'),
                                       embed_size=30,
                                       char_emb_size=30,
                                       word_dropout=0,
                                       dropout=0.3,
                                       hidden_size=100,
                                       pool_method='max',
                                       activation='relu',
                                       min_char_freq=2,
                                       bidirectional=False,
                                       requires_grad=True,
                                       include_word_start_end=False)
    word_embed = StaticEmbedding(vocab=data.get_vocab('words'),
                                 model_dir_or_name='word2vec',
                                 requires_grad=True,
                                 lower=True,
                                 word_dropout=0,
                                 dropout=0.5,
                                 only_norm_found_vector=normalize_embed)
    if char_embed is not None:
        embed = StackEmbedding([word_embed, char_embed],
                               dropout=0,
                               word_dropout=0.02)
    else:
        word_embed.word_drop = 0.02
        embed = word_embed
    # print(data.get_dataset('train'))
    data__ = data.get_vocab('words')
    data.rename_field('words', 'chars')
    return data, embed, data__
def load_conllized_ontonote_NER_POS(path, embedding_path=None):
    from fastNLP.io.pipe.conll import OntoNotesNERPipe
    ontoNotesNERPipe = OntoNotesNERPipe(lower=True)
    bundle_NER = ontoNotesNERPipe.process_from_file(path)

    train_set_NER = bundle_NER.datasets['train']
    dev_set_NER = bundle_NER.datasets['dev']
    test_set_NER = bundle_NER.datasets['test']

    NER_vocab = bundle_NER.get_vocab('target')
    word_vocab = bundle_NER.get_vocab('words')

    (train_set_POS, dev_set_POS, test_set_POS), (_, POS_vocab) = \
        load_conllized_ontonote_POS(path)
    POS_vocab = POS_vocab['POS']

    train_set_NER.add_field('pos', train_set_POS['POS'], is_target=True)
    dev_set_NER.add_field('pos', dev_set_POS['POS'], is_target=True)
    test_set_NER.add_field('pos', test_set_POS['POS'], is_target=True)

    if train_set_NER.has_field('target'):
        train_set_NER.rename_field('target', 'ner')
    if dev_set_NER.has_field('target'):
        dev_set_NER.rename_field('target', 'ner')
    if test_set_NER.has_field('target'):
        test_set_NER.rename_field('target', 'ner')

    if train_set_NER.has_field('pos'):
        train_set_NER.rename_field('pos', 'posid')
    if dev_set_NER.has_field('pos'):
        dev_set_NER.rename_field('pos', 'posid')
    if test_set_NER.has_field('pos'):
        test_set_NER.rename_field('pos', 'posid')

    if train_set_NER.has_field('ner'):
        train_set_NER.rename_field('ner', 'nerid')
    if dev_set_NER.has_field('ner'):
        dev_set_NER.rename_field('ner', 'nerid')
    if test_set_NER.has_field('ner'):
        test_set_NER.rename_field('ner', 'nerid')

    if embedding_path is not None:
        embed = StaticEmbedding(vocab=word_vocab,
                                model_dir_or_name=embedding_path,
                                word_dropout=0.01,
                                dropout=0.5,
                                lower=True)
        return (train_set_NER, dev_set_NER, test_set_NER), \
               (word_vocab, POS_vocab, NER_vocab), embed
    else:
        return (train_set_NER, dev_set_NER, test_set_NER), (NER_vocab, word_vocab)
def prepare_env():
    vocab = Vocabulary().add_word_lst("This is a test .".split())
    vocab.add_word_lst("Another test !".split())
    embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5)
    encoder_output = torch.randn(2, 3, 10)
    src_seq_len = torch.LongTensor([3, 2])
    encoder_mask = seq_len_to_mask(src_seq_len)
    return embed, encoder_output, encoder_mask
def __init__(self,
             vocab,
             hidden_size,
             num_layers,
             n_class_per_task,
             dropout=0.5,
             crf=False):
    super().__init__()
    # logger.info(n_class_per_task)
    word_embed = StaticEmbedding(
        vocab=vocab,
        model_dir_or_name="en-glove-6b-100d",
        word_dropout=0.01,
        dropout=dropout,
        lower=True,
    )
    char_embed = CNNCharEmbedding(vocab=vocab,
                                  embed_size=30,
                                  char_emb_size=30,
                                  filter_nums=[30],
                                  kernel_sizes=[3],
                                  word_dropout=0,
                                  dropout=dropout,
                                  include_word_start_end=False)
    self.embedding = word_embed
    self.char = char_embed
    self.lstm = fastNLP.modules.LSTM(
        input_size=self.embedding.embedding_dim + self.char.embedding_dim,
        hidden_size=hidden_size,
        num_layers=num_layers,
        bidirectional=True,
        batch_first=True,
    )
    self.out = nn.ModuleList()
    for i, n_class in enumerate(n_class_per_task):
        self.out.append(nn.Linear(hidden_size * 2, n_class))
    self.dropout = nn.Dropout(dropout, inplace=True)

    if crf:
        self.crf = nn.ModuleList([
            fastNLP.modules.ConditionalRandomField(n_class)
            for n_class in n_class_per_task
        ])
    else:
        self.crf = None

    self.criterion = nn.CrossEntropyLoss()

    for name, param in self.named_parameters():
        if "out" in name:
            if param.data.dim() > 1:
                nn.init.xavier_uniform_(param)
            else:
                nn.init.constant_(param, 0)