def load_data(args):
    global word_dict, word_embed
    docs = []
    docs += du.load_sent('../datasets/bbcnews.txt')  # BBC_news
    # docs += du.load_sent('../datasets/BBC_news.txt')
    word_dict = util.build_dict(docs)
    # inv_dict = util.build_inv_dict(word_dict)
    word_embed = util.words2embedding(word_dict, 100, args.embedding_file)
    print('word_dict:', word_dict)
    with open('../datasets/word_dict', 'wb') as fid:
        dump(word_dict, fid)
    doc = ' '.join(docs)
    return doc
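# Illustrative helper (not part of the original file): reads back the word_dict
# that load_data() pickles above, mirroring the `load(fid)` pattern used by the
# train() functions further below. Assumes `load`/`dump` are pickle.load and
# pickle.dump and that the path matches the one used in load_data().
def load_word_dict(path='../datasets/word_dict'):
    with open(path, 'rb') as fid:
        return load(fid)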
def test1(args):
    docs = []
    docs += du.load_sent('../datasets/bbcnews.txt')
    logging.info('docs: {}'.format(len(docs)))
    logging.info("building dictionary...")
    word_dict, char_dict = util.build_dict(docs)
    word_embed = util.words2embedding(word_dict, 100, args.embedding_file)
    (args.word_vocab_size, args.word_embed_size) = word_embed.shape
    logging.info('docs: {}'.format(word_embed.shape))  # (119, 100)  # Words: 117 -> 117
    print(word_dict)
    doc = ' '.join(docs[0])
    # with open('bbcnews.txt') as f:
    #     docs = f.read()
    # sp.build_graph(doc)
    vertice_map = sp.hash_vertex(doc)
    for vertice in vertice_map:
        print(words2word(vertice[0], word_embed, word_dict))
def main(args):
    logging.info("loading data...")
    fake_train, fake_dev, fake_test = du.load_fake(doc_ling=False, sent_ling=False)
    true_train, true_dev, true_test = du.load_true(doc_ling=False, sent_ling=False)
    if args.debug:
        true_train = true_train[0][:100]
        fake_train = fake_train[:10]
        true_dev = true_dev[:100]
        fake_dev = fake_dev[:10]
        true_test = true_test[:100]
        fake_test = fake_test[:10]
    if args.rnn_type == 'gru':
        args.rnn = lasagne.layers.GRULayer
    elif args.rnn_type == 'lstm':
        args.rnn = lasagne.layers.LSTMLayer
    else:
        args.rnn = lasagne.layers.RecurrentLayer

    logging.info("building dictionary...")
    word_dict, char_dict = util.build_dict(
        None, max_words=0, dict_file=["word_dict", "char_dict"])
    logging.info("creating embedding matrix...")
    word_embed = util.words2embedding(word_dict, 100, args.embedding_file)
    char_embed = util.char2embedding(char_dict, 30)
    (args.word_vocab_size, args.word_embed_size) = word_embed.shape
    (args.char_vocab_size, args.char_embed_size) = char_embed.shape

    logging.info("compiling Theano function...")
    att_fn, eval_fn, train_fn, params = create_theano_function(
        word_embed, char_embed, values=None)

    logging.info("batching examples...")
    dev_examples = mb.vec_minibatch(fake_dev + true_dev, word_dict, char_dict,
                                    args, False, char=False,
                                    sent_ling=False, doc_ling=False)
    test_examples = mb.vec_minibatch(fake_test + true_test, word_dict, char_dict,
                                     args, False, char=False,
                                     sent_ling=False, doc_ling=False)
    # flatten the list of true-train batches into a single list of examples
    temp = []
    for true_batch in true_train:
        temp += true_batch
    true_train = temp
    del temp
    train_examples = mb.doc_minibatch(fake_train + true_train, args.batch_size)
    # train_examples = mb.train_doc_minibatch(fake_train, true_train, args)

    logging.info("checking network...")
    dev_acc = evals.eval_vec_batch(eval_fn, dev_examples, char=False,
                                   sent_ling=False, doc_ling=False)
    print('Dev A: %.2f P:%.2f R:%.2f F:%.2f' % dev_acc)
    test_acc = evals.eval_vec_batch(eval_fn, test_examples, char=False,
                                    sent_ling=False, doc_ling=False)
    print('Performance on Test set: A: %.2f P:%.2f R:%.2f F:%.2f' % test_acc)

    prev_fsc = 0
    stop_count = 0
    best_fsc = 0
    best_acc = 0
    logging.info("training %d examples" % len(train_examples))
    start_time = time.time()
    n_updates = 0

    for epoch in range(args.epoches):
        np.random.shuffle(train_examples)
        # if epoch > 3:
        #     logging.info("compiling Theano function again...")
        #     args.learning_rate *= 0.9
        #     att_fn, eval_fn, train_fn, params = create_theano_function(
        #         word_embed, char_embed, values=[x.get_value() for x in params])
        for batch_x, _ in train_examples:
            batch_x, batch_y = zip(*batch_x)
            batch_x = util.vectorization(list(batch_x), word_dict, char_dict,
                                         max_char_length=args.max_char)
            batch_rnn, batch_sent_mask, batch_word_mask, _ = \
                util.mask_padding(batch_x, args.max_sent, args.max_word,
                                  args.max_char)
            batch_y = np.array(list(batch_y))
            train_loss = train_fn(batch_rnn, batch_word_mask, batch_sent_mask,
                                  batch_y)
            n_updates += 1

            if n_updates % 100 == 0 and epoch > 7:
                logging.info('Epoch = %d, loss = %.2f, elapsed time = %.2f (s)' %
                             (epoch, train_loss, time.time() - start_time))
                # dev_acc = evals.eval_batch(eval_fn, dev_examples, word_dict, char_dict, args)
                dev_acc = evals.eval_vec_batch(eval_fn, dev_examples, char=False,
                                               sent_ling=False, doc_ling=False)
                logging.info('Dev A: %.2f P:%.2f R:%.2f F:%.2f' % dev_acc)
                if dev_acc[3] > best_fsc and dev_acc[0] > best_acc:
                    best_fsc = dev_acc[3]
                    best_acc = dev_acc[0]
                    logging.info('Best dev f1: epoch = %d, n_updates = %d, f1 = %.2f %%' %
                                 (epoch, n_updates, dev_acc[3]))
                    record = 'Best dev accuracy: epoch = %d, n_updates = %d ' % \
                        (epoch, n_updates) + ' Dev A: %.2f P:%.2f R:%.2f F:%.2f' % dev_acc
                    test_acc = evals.eval_vec_batch(eval_fn, test_examples,
                                                    char=False, sent_ling=False,
                                                    doc_ling=False)
                    print('Performance on Test set: A: %.2f P:%.2f R:%.2f F:%.2f' % test_acc)
                    # util.save_params('char_not_params_%.2f' % test_acc[3], params,
                    #                  epoch=epoch, n_updates=n_updates)
                if prev_fsc > dev_acc[3]:
                    stop_count += 1
                else:
                    stop_count = 0
                if stop_count == 6:
                    # early-stopping threshold reached (note: the loop is not broken here)
                    print("stopped")
                prev_fsc = dev_acc[3]

    print(record)
    print('Performance on Test set: A: %.2f P:%.2f R:%.2f F:%.2f' % test_acc)
    return
def train(trainloader, args):
    device = 'cuda' if args.cuda else 'cpu'
    losses = []

    # TODO: concretize the embedding step
    # BASE = '/homes/du113/scratch/satire-models/'
    print('load word dict...')
    BASE = '../datasets/dict/'
    with open(BASE + 'cnn_dict.pkl', 'rb') as fid:
        word_dict = load(fid)
    print('loading completed!')
    print(len(word_dict))
    print('trump:', word_dict['trump'])
    print('obama:', word_dict['obama'])

    # load word embedding
    word_embed = util.words2embedding(word_dict, 100, args.embedding_file)

    # record the trump/obama embedding distances before training for later comparison
    trump_emb_before = word_embed[word_dict['trump']]
    obama_emb_before = word_embed[word_dict['obama']]
    dist_abs_before = distance_abs(trump_emb_before, obama_emb_before)
    dist_mul_before = distance_mul(trump_emb_before, obama_emb_before)
    # print(word_embed.shape)

    model = FEELModel(pretrained=True, embeddings=word_embed)
    model = model.double()
    if device == 'cuda':
        model = nn.DataParallel(model).cuda()
    optimizer = optim.SGD(model.parameters(), lr=args.lr)

    for e in range(args.epochs):
        logging.warning('{}th epoch'.format(e))
        total_loss = 0
        for i, batch in enumerate(trainloader):
            # each tensor has size (batchsize x 4 x arg.max_word_length)
            # most probably a 3 dimensional tensor
            # query, pos, neg = batch
            # query_zip, pos_zip, neg_zip = batch
            # # TODO: query should contain both vertex id (query[0]) and word_idx_list (query[1])
            # qtree = du.build_tree(query_zip[0])
            # ptree = du.build_tree(pos_zip[0])
            # ntree = du.build_tree(neg_zip[0])
            # query, pos, neg = query_zip[1], pos_zip[1], neg_zip[1]
            query, pos, neg = batch
            print('neg.shape:', neg.shape)  # torch.Size([16, 4, 203])

            qtree = du.build_tree(query)
            ptree = du.build_tree(pos)
            ntree = du.build_tree(neg)
            # print(query.shape)  # 2d
            # print(pos.shape)
            '''
            q_a0, q_v, q_a1, q_a2, q_m = query[:, 0], query[:, 1], \
                query[:, 2], query[:, 3], query[:, 4]
            p_a0, p_v, p_a1, p_a2, p_m = pos[:, 0], pos[:, 1], \
                pos[:, 2], pos[:, 3], pos[:, 4]
            n_a0, n_v, n_a1, n_a2, n_m = neg[:, 0], neg[:, 1], \
                neg[:, 2], neg[:, 3], neg[:, 4]

            q_a0, q_v, q_a1, q_a2, q_m = Variable(q_a0).to(device), \
                Variable(q_v).to(device), \
                Variable(q_a1).to(device), \
                Variable(q_a2).to(device), \
                Variable(q_m).to(device)
            p_a0, p_v, p_a1, p_a2, p_m = Variable(p_a0).to(device), \
                Variable(p_v).to(device), \
                Variable(p_a1).to(device), \
                Variable(p_a2).to(device), \
                Variable(p_m).to(device)
            n_a0, n_v, n_a1, n_a2, n_m = Variable(n_a0).to(device), \
                Variable(n_v).to(device), \
                Variable(n_a1).to(device), \
                Variable(n_a2).to(device), \
                Variable(n_m).to(device)

            model.zero_grad()
            output = model((q_v, p_v, n_v)) + model((q_v, q_a0, n_a0)) \
                + model((q_v, q_a1, n_a1)) + model((q_v, q_a2, n_a2)) \
                + model((q_v, q_m, n_m))
            '''
            # split each (batchsize x 4 x max_word_length) tensor into its four channels
            q_v, q_a0, q_a1, q_a2 = query[:, 0, :], query[:, 1, :], \
                query[:, 2, :], query[:, 3, :]
            p_v, p_a0, p_a1, p_a2 = pos[:, 0, :], pos[:, 1, :], \
                pos[:, 2, :], pos[:, 3, :]
            n_v, n_a0, n_a1, n_a2 = neg[:, 0, :], neg[:, 1, :], \
                neg[:, 2, :], neg[:, 3, :]

            q_v, q_a0, q_a1, q_a2 = \
                Variable(q_v).to(device), \
                Variable(q_a0).to(device), \
                Variable(q_a1).to(device), \
                Variable(q_a2).to(device)
            p_v, p_a0, p_a1, p_a2 = \
                Variable(p_v).to(device), \
                Variable(p_a0).to(device), \
                Variable(p_a1).to(device), \
                Variable(p_a2).to(device)
            n_v, n_a0, n_a1, n_a2 = \
                Variable(n_v).to(device), \
                Variable(n_a0).to(device), \
                Variable(n_a1).to(device), \
                Variable(n_a2).to(device)
            query, pos, neg = \
                Variable(query).to(device), \
                Variable(pos).to(device), \
                Variable(neg).to(device)

            model.zero_grad()
            inputs = (q_v, q_a0, n_a0), (q_v, q_a1, n_a1), (q_v, q_a2, n_a2), \
                (query, pos, neg), (qtree, ptree, ntree)
            output = model(inputs)
            loss = my_loss_fn(output)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            if i % 20 == 0:
                logging.warning('{}th iteration'.format(i))
                logging.warning('loss: {}'.format(total_loss))
        losses.append(total_loss)

        # compare trump/obama embedding distances after this epoch
        trump_emb_after = model.embeddings.weight[word_dict['trump']]
        obama_emb_after = model.embeddings.weight[word_dict['obama']]
        dist_abs_after = distance_abs(trump_emb_after, obama_emb_after)
        dist_mul_after = distance_mul(trump_emb_after, obama_emb_after)
        print('{}th epoch, distance abs before: {}, distance abs after: {}, '
              'distance mul before: {}, distance mul after: {}'
              .format(e, dist_abs_before, dist_abs_after,
                      dist_mul_before, dist_mul_after))

    trump_emb_after = model.embeddings.weight[word_dict['trump']]
    obama_emb_after = model.embeddings.weight[word_dict['obama']]
    dist_abs_after = distance_abs(trump_emb_after, obama_emb_after)
    dist_mul_after = distance_mul(trump_emb_after, obama_emb_after)
    print('distance abs before: {}, distance abs after: {}, '
          'distance mul before: {}, distance mul after: {}'
          .format(dist_abs_before, dist_abs_after, dist_mul_before, dist_mul_after))

    # plot the per-epoch training loss
    x = list(range(len(losses)))
    plt.plot(x, losses, 'ro')
    # plt.axis([-2, 2, -2, 2])
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.title('loss')
    # plt.show()
    plt.savefig("../datasets/plot/loss_treeLSTM.png")
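# Note: distance_abs / distance_mul are defined elsewhere in the project. The
# two helpers below are purely illustrative stand-ins showing one plausible
# reading of those names for 1-D embedding vectors (an L1 distance and a
# dot-product-style score); they are assumptions, not the actual definitions.
def _example_distance_abs(a, b):
    # sum of absolute element-wise differences (L1 distance)
    return float(abs(a - b).sum())


def _example_distance_mul(a, b):
    # summed element-wise product, i.e. a dot-product similarity score
    return float((a * b).sum())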
def train(trainloader, args):
    device = 'cuda' if args.cuda else 'cpu'
    losses = []

    # TODO: concretize the embedding step
    # BASE = '/homes/du113/scratch/satire-models/'
    BASE = '../datasets/'
    with open(BASE + 'word_dict', 'rb') as fid:
        word_dict = load(fid)

    # load word embedding
    word_embed = util.words2embedding(word_dict, 100, args.embedding_file)
    # print(word_embed.shape)

    model_intra = FEELModel_intra(pretrained=True, embeddings=word_embed)
    model_inter = FEELModel_inter()
    if device == 'cuda':
        model_inter = nn.DataParallel(model_inter).cuda()
        model_intra = nn.DataParallel(model_intra).cuda()
    optimizer = optim.SGD(model_intra.parameters(), lr=args.lr)

    for e in range(args.epochs):
        logging.warning('{}th epoch'.format(e))
        total_loss = 0
        for i, batch in enumerate(trainloader):
            # each tensor has size (batchsize x 4 x arg.max_word_length)
            # most probably a 3 dimensional tensor
            query, pos, neg = batch
            # print(pos.shape)
            # print(neg.shape)
            '''
            q_a0, q_v, q_a1, q_a2, q_m = query[:, 0], query[:, 1], \
                query[:, 2], query[:, 3], query[:, 4]
            p_a0, p_v, p_a1, p_a2, p_m = pos[:, 0], pos[:, 1], \
                pos[:, 2], pos[:, 3], pos[:, 4]
            n_a0, n_v, n_a1, n_a2, n_m = neg[:, 0], neg[:, 1], \
                neg[:, 2], neg[:, 3], neg[:, 4]

            q_a0, q_v, q_a1, q_a2, q_m = Variable(q_a0).to(device), \
                Variable(q_v).to(device), \
                Variable(q_a1).to(device), \
                Variable(q_a2).to(device), \
                Variable(q_m).to(device)
            p_a0, p_v, p_a1, p_a2, p_m = Variable(p_a0).to(device), \
                Variable(p_v).to(device), \
                Variable(p_a1).to(device), \
                Variable(p_a2).to(device), \
                Variable(p_m).to(device)
            n_a0, n_v, n_a1, n_a2, n_m = Variable(n_a0).to(device), \
                Variable(n_v).to(device), \
                Variable(n_a1).to(device), \
                Variable(n_a2).to(device), \
                Variable(n_m).to(device)

            model.zero_grad()
            output = model((q_v, p_v, n_v)) + model((q_v, q_a0, n_a0)) \
                + model((q_v, q_a1, n_a1)) + model((q_v, q_a2, n_a2)) \
                + model((q_v, q_m, n_m))
            '''
            q_v, q_a0, q_a1, q_a2 = query[:, 0, :], query[:, 1, :], \
                query[:, 2, :], query[:, 3, :]
            p_v, p_a0, p_a1, p_a2 = pos[:, 0, :], pos[:, 1, :], \
                pos[:, 2, :], pos[:, 3, :]
            n_v, n_a0, n_a1, n_a2 = neg[:, 0, :], neg[:, 1, :], \
                neg[:, 2, :], neg[:, 3, :]
            # torch.Size([2, 4, 3]) -> torch.Size([2, 3])

            q_v, q_a0, q_a1, q_a2 = \
                Variable(q_v).to(device), \
                Variable(q_a0).to(device), \
                Variable(q_a1).to(device), \
                Variable(q_a2).to(device)
            p_v, p_a0, p_a1, p_a2 = \
                Variable(p_v).to(device), \
                Variable(p_a0).to(device), \
                Variable(p_a1).to(device), \
                Variable(p_a2).to(device)
            n_v, n_a0, n_a1, n_a2 = \
                Variable(n_v).to(device), \
                Variable(n_a0).to(device), \
                Variable(n_a1).to(device), \
                Variable(n_a2).to(device)
            query, pos, neg = \
                Variable(query).to(device), \
                Variable(pos).to(device), \
                Variable(neg).to(device)

            # originally `model.zero_grad()`, but no `model` exists in this
            # function; clear gradients on both sub-models instead
            model_intra.zero_grad()
            model_inter.zero_grad()

            output1 = model_intra((q_v, q_a0, n_a0))
            output2 = model_intra((q_v, q_a1, n_a1))
            output3 = model_intra((q_v, q_a2, n_a2))
            output_inter = model_inter((query, pos, neg))
            # query, pos, neg: 2d word-idx matrices -> averaged to 1d embeddings
            # the output should be of dimensions (batch_size)
            output = output1 + output2 + output3 + output_inter

            loss = my_loss_fn(output)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            if i % 20 == 0:
                logging.warning('{}th iteration'.format(i))
                logging.warning('loss: {}'.format(total_loss))
        losses.append(total_loss)
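# my_loss_fn is defined elsewhere in the project. Both training loops above pass
# it a single tensor of summed triplet scores per example, so one plausible form
# is a hinge-style loss over those scores. The helper below is an illustrative
# assumption, not the project's actual definition; `margin` is a hypothetical
# parameter.
def _example_loss_fn(output, margin=1.0):
    # penalize examples whose summed triplet score exceeds -margin,
    # then average over the batch
    return (margin + output).clamp(min=0).mean()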