def test_attn_avg_model():
    data_path = Path('/media/bnu/data/nlp-practice/sentiment-analysis/standford-sentiment-treebank')
    corpus = SSTCorpus(data_path)
    train_iter, valid_iter, test_iter = corpus.get_iterators((256, 256, 256))

    model = AttnAvgModel(
        n_words=len(corpus.text_field.vocab),
        n_embed=100,
        p_drop=0.2,
        padding_idx=corpus.get_padding_idx(),
    ).to(DEVICE)

    # Run a single batch through the model to sanity-check the output shape.
    for batch in train_iter:
        inputs, lengths = batch.text
        mask = (inputs != corpus.get_padding_idx())
        outputs = model(inputs, mask)
        print(outputs.shape)
        break
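
############################################################
# Reference sketch: AttnAvgModel
############################################################
# AttnAvgModel is defined elsewhere in the project. The sketch below
# is only a plausible reconstruction of the interface this file relies
# on: an `embed` layer, a learned scoring vector `coef` (compared
# against embedding rows via cosine similarity in run_part2_q2), a
# `calc_attention_weight` helper, and forward(inputs, mask) returning
# class logits. The dot-product scoring and the `n_classes=2` default
# are assumptions, not the project's actual implementation.
import torch
import torch.nn as nn
import torch.nn.functional as F


class AttnAvgModelSketch(nn.Module):
    def __init__(self, n_words, n_embed, p_drop, padding_idx, n_classes=2):
        super().__init__()
        self.embed = nn.Embedding(n_words, n_embed, padding_idx=padding_idx)
        # One scalar score per token: coef . embed(w), softmaxed over the sentence.
        self.coef = nn.Parameter(torch.randn(n_embed))
        self.dropout = nn.Dropout(p_drop)
        self.fc = nn.Linear(n_embed, n_classes)

    def calc_attention_weight(self, inputs, mask=None):
        # inputs: (seq_len, batch) token ids -> (seq_len, batch) weights.
        scores = self.embed(inputs) @ self.coef
        if mask is not None:
            scores = scores.masked_fill(~mask, float('-inf'))
        return F.softmax(scores, dim=0)

    def forward(self, inputs, mask):
        attn = self.calc_attention_weight(inputs, mask)    # (seq_len, batch)
        embedded = self.dropout(self.embed(inputs))        # (seq_len, batch, n_embed)
        # Attention-weighted average over the sequence dimension.
        pooled = (attn.unsqueeze(-1) * embedded).sum(dim=0)
        return self.fc(pooled)                             # (batch, n_classes)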
def run_part1_q1():
    set_random_seed(2020)
    show_device_info()

    data_path = Path('/media/bnu/data/nlp-practice/sentiment-analysis/standford-sentiment-treebank')
    corpus = SSTCorpus(data_path)
    train_iter, valid_iter, test_iter = corpus.get_iterators(batch_sizes=(256, 256, 256))

    ############################################################
    # Model training
    ############################################################
    model = EmbedAvgModel(
        n_words=len(corpus.text_field.vocab),
        n_embed=200,
        p_drop=0.5,
        padding_idx=corpus.get_padding_idx(),
    )
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
    learner = TextClassificationLearner(
        model, optimizer, corpus.get_padding_idx(),
        save_path='./models/embed-avg-best.pth')
    learner.fit(train_iter, valid_iter, n_epochs=20)
    learner.predict(test_iter)

    ############################################################
    # Word L2-norm analysis
    ############################################################
    # (n_words,)
    embed_norm = model.embed.weight.norm(dim=1)
    word_idx = list(range(len(corpus.text_field.vocab)))
    word_idx.sort(key=lambda x: embed_norm[x])

    print('\n')
    print('15 words with the smallest L2 norm:')
    for i in word_idx[:15]:
        print(corpus.text_field.vocab.itos[i])
    print('-' * 60)
    print('15 words with the largest L2 norm:')
    for i in word_idx[-15:]:
        print(corpus.text_field.vocab.itos[i])
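
############################################################
# Reference sketch: EmbedAvgModel
############################################################
# EmbedAvgModel is also defined elsewhere; this minimal sketch shows
# the behavior the L2-norm analysis above relies on: the sentence
# representation is a masked average of word embeddings, so a word
# with a small embedding norm contributes little to the prediction.
# The forward signature and the `n_classes=2` default are guesses
# consistent with how the model is used in this file.
class EmbedAvgModelSketch(nn.Module):
    def __init__(self, n_words, n_embed, p_drop, padding_idx, n_classes=2):
        super().__init__()
        self.embed = nn.Embedding(n_words, n_embed, padding_idx=padding_idx)
        self.dropout = nn.Dropout(p_drop)
        self.fc = nn.Linear(n_embed, n_classes)

    def forward(self, inputs, mask):
        # inputs, mask: (seq_len, batch); mask is True at non-pad tokens.
        embedded = self.dropout(self.embed(inputs))            # (seq_len, batch, n_embed)
        summed = (mask.unsqueeze(-1).float() * embedded).sum(dim=0)
        n_tokens = mask.sum(dim=0).clamp(min=1).unsqueeze(-1)  # avoid division by zero
        return self.fc(summed / n_tokens)                      # (batch, n_classes)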
def test_transformer_model():
    data_path = Path('/media/bnu/data/nlp-practice/sentiment-analysis/standford-sentiment-treebank')
    corpus = SSTCorpus(data_path)
    train_iter, valid_iter, test_iter = corpus.get_iterators((256, 256, 256))

    model = PyTorchTransformerModel(
        n_words=len(corpus.text_field.vocab),
        d_model=128,
        n_heads=4,
        n_layers=1,
    )
    model.to(DEVICE)

    # Run a single batch through the model to sanity-check the output shape.
    for batch in train_iter:
        inputs, lengths = batch.text
        mask = (inputs != corpus.get_padding_idx())
        outputs = model(inputs, mask)
        print(outputs.shape)
        break
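
############################################################
# Reference sketch: PyTorchTransformerModel
############################################################
# PyTorchTransformerModel is defined elsewhere; the sketch below is a
# plausible reconstruction built on torch.nn.TransformerEncoder. The
# learned positional embedding, the masked mean pooling, and the
# `n_classes=2` / `max_len=512` defaults are assumptions; only the
# constructor arguments and forward(inputs, mask) are taken from the
# calls in this file.
class PyTorchTransformerModelSketch(nn.Module):
    def __init__(self, n_words, d_model, n_heads, n_layers,
                 p_drop=0.1, n_classes=2, max_len=512):
        super().__init__()
        self.embed = nn.Embedding(n_words, d_model)
        self.pos_embed = nn.Embedding(max_len, d_model)
        layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=n_heads, dropout=p_drop)
        self.encoder = nn.TransformerEncoder(layer, num_layers=n_layers)
        self.fc = nn.Linear(d_model, n_classes)

    def forward(self, inputs, mask):
        # inputs, mask: (seq_len, batch), torchtext's default layout;
        # mask is True at non-pad tokens.
        seq_len = inputs.shape[0]
        pos = torch.arange(seq_len, device=inputs.device).unsqueeze(1)
        x = self.embed(inputs) + self.pos_embed(pos)       # (seq_len, batch, d_model)
        # nn.TransformerEncoder expects True at *padding* positions,
        # shaped (batch, seq_len), hence the inversion and transpose.
        hidden = self.encoder(x, src_key_padding_mask=~mask.t())
        # Masked mean pooling over the sequence dimension.
        summed = (mask.unsqueeze(-1).float() * hidden).sum(dim=0)
        n_tokens = mask.sum(dim=0).clamp(min=1).unsqueeze(-1)
        return self.fc(summed / n_tokens)                  # (batch, n_classes)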
def run_part2_q3():
    set_random_seed(2020)
    show_device_info()

    data_path = Path('/media/bnu/data/nlp-practice/sentiment-analysis/standford-sentiment-treebank')
    corpus = SSTCorpus(data_path)
    train_iter, valid_iter, test_iter = corpus.get_iterators(batch_sizes=(256, 256, 256))

    model = PyTorchTransformerModel(
        n_words=len(corpus.text_field.vocab),
        d_model=200,
        n_heads=2,
        n_layers=3,
        p_drop=0.2,
    )
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
    learner = TextClassificationLearner(
        model, optimizer, corpus.get_padding_idx(),
        save_path='./models/trans-best.pth')
    learner.fit(train_iter, valid_iter, n_epochs=50)
    learner.predict(test_iter)
def run_part2_q1():
    set_random_seed(2020)
    show_device_info()

    data_path = Path('/media/bnu/data/nlp-practice/sentiment-analysis/standford-sentiment-treebank')
    corpus = SSTCorpus(data_path)
    train_iter, valid_iter, test_iter = corpus.get_iterators(batch_sizes=(256, 256, 256))

    model = SimpleSelfAttentionModel(
        n_words=len(corpus.text_field.vocab),
        n_embed=200,
        p_drop=0.2,
        pad_idx=corpus.text_field.vocab.stoi['<pad>'],
        res_conn=False,
        score_fn='cos',
    )
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
    learner = TextClassificationLearner(
        model, optimizer, corpus.get_padding_idx(),
        save_path='./models/self-attn-best.pth')
    learner.fit(train_iter, valid_iter, n_epochs=50)
    learner.predict(test_iter)
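
############################################################
# Reference sketch: SimpleSelfAttentionModel
############################################################
# SimpleSelfAttentionModel is defined elsewhere; this sketch is a
# plausible reading of its constructor flags: score_fn='cos' scores
# token pairs by cosine similarity (vs. a plain dot product), and
# res_conn adds a residual connection around the attention layer.
# Everything else (mean pooling, `n_classes=2`) is assumed.
class SimpleSelfAttentionModelSketch(nn.Module):
    def __init__(self, n_words, n_embed, p_drop, pad_idx,
                 res_conn=False, score_fn='dot', n_classes=2):
        super().__init__()
        self.embed = nn.Embedding(n_words, n_embed, padding_idx=pad_idx)
        self.dropout = nn.Dropout(p_drop)
        self.res_conn = res_conn
        self.score_fn = score_fn
        self.fc = nn.Linear(n_embed, n_classes)

    def forward(self, inputs, mask):
        # inputs, mask: (seq_len, batch) -> work batch-first internally.
        x = self.dropout(self.embed(inputs)).transpose(0, 1)  # (batch, seq, n_embed)
        if self.score_fn == 'cos':
            # Cosine similarity = dot product of unit-normalized vectors.
            q = F.normalize(x, dim=-1)
            scores = q @ q.transpose(1, 2)                    # (batch, seq, seq)
        else:
            scores = x @ x.transpose(1, 2)
        pad = ~mask.t()                                       # (batch, seq), True at pads
        scores = scores.masked_fill(pad.unsqueeze(1), float('-inf'))
        attn = F.softmax(scores, dim=-1)     # each token attends over the sentence
        out = attn @ x                                        # (batch, seq, n_embed)
        if self.res_conn:
            out = out + x                                     # residual connection
        # Masked mean pooling, then classify.
        keep = mask.t().unsqueeze(-1).float()                 # (batch, seq, 1)
        pooled = (keep * out).sum(dim=1) / keep.sum(dim=1).clamp(min=1)
        return self.fc(pooled)                                # (batch, n_classes)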
def run_part2_q2():
    set_random_seed(2020)
    show_device_info()

    data_path = Path('/media/bnu/data/nlp-practice/sentiment-analysis/standford-sentiment-treebank')
    corpus = SSTCorpus(data_path)
    train_iter, valid_iter, test_iter = corpus.get_iterators(batch_sizes=(256, 256, 256))

    model = AttnAvgModel(
        n_words=len(corpus.text_field.vocab),
        n_embed=150,
        p_drop=0.5,
        padding_idx=corpus.get_padding_idx(),
    )
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
    learner = TextClassificationLearner(
        model, optimizer, corpus.get_padding_idx(),
        save_path='./models/attn-avg-best.pth')
    # Training skipped here; predict() is expected to reuse the checkpoint at save_path.
    # learner.fit(train_iter, valid_iter, n_epochs=50)
    learner.predict(test_iter)

    ############################################################
    # Cosine similarity analysis
    ############################################################
    # (1, n_embed)
    u = model.coef.view(1, -1)
    # (n_words, n_embed)
    embedding = model.embed.weight
    # (n_words,)
    cos_sim = F.cosine_similarity(u, embedding, dim=-1)
    word_idx = list(range(len(corpus.text_field.vocab)))
    word_idx.sort(key=lambda x: cos_sim[x])

    print('15 words with the smallest cosine similarity:')
    for i in word_idx[:15]:
        print(corpus.text_field.vocab.itos[i])
    print('-' * 60)
    print('15 words with the largest cosine similarity:')
    for i in word_idx[-15:]:
        print(corpus.text_field.vocab.itos[i])

    ############################################################
    # Word attention-weight analysis
    ############################################################
    # Re-create the iterators with batch size 1 so every sentence is
    # processed individually.
    train_iter, valid_iter, test_iter = corpus.get_iterators(batch_sizes=(1, 1, 1))

    weight_dict = defaultdict(list)
    with torch.no_grad():
        for k, batch in enumerate(train_iter):
            inputs, lengths = batch.text
            attn = model.calc_attention_weight(inputs)
            inputs = inputs.view(-1)
            attn = attn.view(-1)
            if inputs.shape[0] == 1:
                weight_dict[inputs.item()].append(attn.item())
            else:
                for i in range(len(inputs)):
                    weight_dict[inputs[i].item()].append(attn[i].item())
            if (k + 1) % 10000 == 0:
                print(f'{k + 1} sentences finished!')

    mean_dict, std_dict = {}, {}
    for k, v in weight_dict.items():
        # Keep only words that appear at least 100 times.
        if len(v) >= 100:
            mean_dict[k] = np.mean(v)
            std_dict[k] = np.std(v)
    word_idx = list(std_dict.keys())
    word_idx.sort(key=lambda x: std_dict[x], reverse=True)

    print('30 words with the largest attention-weight std:')
    print('-' * 60)
    for i in word_idx[:30]:
        print(f'{corpus.text_field.vocab.itos[i]}, {len(weight_dict[i])}, '
              f'{std_dict[i]:.3f}, {mean_dict[i]:.3f}')
    print()
    print('30 words with the smallest attention-weight std:')
    print('-' * 60)
    for i in reversed(word_idx[-30:]):
        print(f'{corpus.text_field.vocab.itos[i]}, {len(weight_dict[i])}, '
              f'{std_dict[i]:.3f}, {mean_dict[i]:.3f}')
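
############################################################
# Reference sketch: TextClassificationLearner
############################################################
# TextClassificationLearner is defined elsewhere in the project; the
# sketch below is only a guess at its training loop, inferred from
# the calls above. In particular, predict() restoring the checkpoint
# from save_path is an assumption, made because run_part2_q2 calls
# predict() without a preceding fit(). The loss, metrics, and print
# format are illustrative, not the project's actual code.
class TextClassificationLearnerSketch:
    def __init__(self, model, optimizer, padding_idx, save_path):
        self.model = model.to(DEVICE)
        self.optimizer = optimizer
        self.padding_idx = padding_idx
        self.save_path = save_path
        self.criterion = nn.CrossEntropyLoss()

    def _run_epoch(self, data_iter, train):
        self.model.train(train)
        total_loss, n_correct, n_samples = 0.0, 0, 0
        with torch.set_grad_enabled(train):
            for batch in data_iter:
                inputs, lengths = batch.text
                mask = (inputs != self.padding_idx)
                logits = self.model(inputs, mask)
                loss = self.criterion(logits, batch.label)
                if train:
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()
                total_loss += loss.item() * len(batch)
                n_correct += (logits.argmax(dim=-1) == batch.label).sum().item()
                n_samples += len(batch)
        return total_loss / n_samples, n_correct / n_samples

    def fit(self, train_iter, valid_iter, n_epochs):
        best_acc = 0.0
        for epoch in range(n_epochs):
            train_loss, train_acc = self._run_epoch(train_iter, train=True)
            valid_loss, valid_acc = self._run_epoch(valid_iter, train=False)
            if valid_acc > best_acc:  # keep only the best checkpoint
                best_acc = valid_acc
                torch.save(self.model.state_dict(), self.save_path)
            print(f'epoch {epoch + 1}: train_acc={train_acc:.4f}, valid_acc={valid_acc:.4f}')

    def predict(self, test_iter):
        self.model.load_state_dict(torch.load(self.save_path))
        test_loss, test_acc = self._run_epoch(test_iter, train=False)
        print(f'test_acc={test_acc:.4f}')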