def evaluation(args, padding_id, ids_corpus, vocab_map, embeddings, model):
    """Calculate the AUC score of the model on Android data."""
    meter = AUCMeter()
    print "starting evaluation"
    val_data = corpus.read_annotations(args.test)
    print "number of lines in test data: " + str(len(val_data))
    val_batches = corpus.create_eval_batches(ids_corpus, val_data, padding_id)
    for batch in val_batches:
        titles, bodies, qlabels = batch
        title_length, title_num_questions = titles.shape
        body_length, body_num_questions = bodies.shape
        title_embeddings, body_embeddings = corpus.get_embeddings(
            titles, bodies, vocab_map, embeddings)

        # title
        if args.model == 'lstm':
            # nn.LSTM expects (seq_len, batch, input_size)
            if args.cuda:
                title_inputs = autograd.Variable(
                    torch.FloatTensor(title_embeddings).cuda())
                title_inputs = title_inputs.view(
                    title_length, title_num_questions, -1)
                title_hidden = (
                    autograd.Variable(torch.zeros(
                        1, title_num_questions, args.hidden_size).cuda()),
                    autograd.Variable(torch.zeros(
                        1, title_num_questions, args.hidden_size).cuda()))
            else:
                title_inputs = autograd.Variable(
                    torch.FloatTensor(title_embeddings))
                title_inputs = title_inputs.view(
                    title_length, title_num_questions, -1)
                title_hidden = (
                    autograd.Variable(torch.zeros(
                        1, title_num_questions, args.hidden_size)),
                    autograd.Variable(torch.zeros(
                        1, title_num_questions, args.hidden_size)))
        else:
            if args.cuda:
                title_inputs = autograd.Variable(
                    torch.FloatTensor(title_embeddings).cuda())
            else:
                title_inputs = autograd.Variable(
                    torch.FloatTensor(title_embeddings))
            # nn.Conv1d expects (batch, in_channels, seq_len)
            title_inputs = title_inputs.transpose(0, 1).transpose(1, 2)

        if args.model == 'lstm':
            title_out, title_hidden = model(title_inputs, title_hidden)
        else:
            title_out = model(title_inputs)
            title_out = F.tanh(title_out)
            # back to (seq_len, batch, hidden_size) for averaging
            title_out = title_out.transpose(1, 2).transpose(0, 1)

        # average all words of each question from title_out
        average_title_out = average_questions(title_out, titles, padding_id)

        # body (same encoding path as the titles)
        if args.model == 'lstm':
            if args.cuda:
                body_inputs = autograd.Variable(
                    torch.FloatTensor(body_embeddings).cuda())
                body_inputs = body_inputs.view(
                    body_length, body_num_questions, -1)
                body_hidden = (
                    autograd.Variable(torch.zeros(
                        1, body_num_questions, args.hidden_size).cuda()),
                    autograd.Variable(torch.zeros(
                        1, body_num_questions, args.hidden_size).cuda()))
            else:
                body_inputs = autograd.Variable(
                    torch.FloatTensor(body_embeddings))
                body_inputs = body_inputs.view(
                    body_length, body_num_questions, -1)
                body_hidden = (
                    autograd.Variable(torch.zeros(
                        1, body_num_questions, args.hidden_size)),
                    autograd.Variable(torch.zeros(
                        1, body_num_questions, args.hidden_size)))
        else:
            if args.cuda:
                body_inputs = autograd.Variable(
                    torch.FloatTensor(body_embeddings).cuda())
            else:
                body_inputs = autograd.Variable(
                    torch.FloatTensor(body_embeddings))
            body_inputs = body_inputs.transpose(0, 1).transpose(1, 2)

        if args.model == 'lstm':
            body_out, body_hidden = model(body_inputs, body_hidden)
        else:
            body_out = model(body_inputs)
            body_out = F.tanh(body_out)
            body_out = body_out.transpose(1, 2).transpose(0, 1)

        # average all words of each question from body_out
        average_body_out = average_questions(body_out, bodies, padding_id)

        # average body and title representations of the questions as found
        # by the encoder, e.g. 560 x 100
        hidden = (average_title_out + average_body_out) * 0.5

        # first row is the query; the remaining rows are the candidates
        query = torch.DoubleTensor(hidden[0].unsqueeze(0).cpu().data.numpy())
        examples = torch.DoubleTensor(hidden[1:].cpu().data.numpy())
        cos_similarity = F.cosine_similarity(query, examples, dim=1)
        qlabels = [float(qlabel) for qlabel in list(qlabels)]
        target = torch.DoubleTensor(qlabels)
        meter.add(cos_similarity, target)
    # AUC of the ROC curve up to a false-positive rate of 0.05
    print meter.value(0.05)
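# ---------------------------------------------------------------------------
# The title and body branches above (and the matching blocks in the training
# loops below) repeat the same pattern: embed, reshape for the encoder, run
# the model, then average over non-padding positions. A minimal sketch of a
# shared helper, assuming the same args, model, corpus batches, and
# average_questions as above; `encode_questions` is a hypothetical name, not
# part of the original code.
# ---------------------------------------------------------------------------


def encode_questions(args, word_ids, word_embeddings, model, padding_id):
    """Encode one (length x num_questions) batch into question vectors."""
    length, num_questions = word_ids.shape
    inputs = autograd.Variable(torch.FloatTensor(word_embeddings))
    if args.cuda:
        inputs = inputs.cuda()
    if args.model == 'lstm':
        # nn.LSTM expects (seq_len, batch, input_size)
        inputs = inputs.view(length, num_questions, -1)
        h0 = autograd.Variable(torch.zeros(1, num_questions, args.hidden_size))
        c0 = autograd.Variable(torch.zeros(1, num_questions, args.hidden_size))
        if args.cuda:
            h0, c0 = h0.cuda(), c0.cuda()
        out, _ = model(inputs, (h0, c0))
    else:
        # nn.Conv1d expects (batch, in_channels, seq_len)
        inputs = inputs.transpose(0, 1).transpose(1, 2)
        out = F.tanh(model(inputs))
        out = out.transpose(1, 2).transpose(0, 1)
    # out: (seq_len, num_questions, hidden_size)
    return average_questions(out, word_ids, padding_id)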
def main(args):
    time1 = datetime.now()
    raw_corpus = corpus.read_corpus(args.corpus)
    list_words, vocab_map, embeddings, padding_id = corpus.load_embeddings(
        corpus.load_embedding_iterator(args.embeddings))
    print "loaded embeddings"
    ids_corpus = corpus.map_corpus(vocab_map, raw_corpus)
    annotations = corpus.read_annotations(args.train)
    print "got annotations"
    training_batches = corpus.create_batches(ids_corpus, annotations,
                                             args.batch_size, padding_id)
    print "got batches"
    time2 = datetime.now()
    print "time to preprocess: " + str(time2 - time1)

    if args.model == 'cnn':
        args.margin = 0.2

    if args.load_model:
        if args.model == 'lstm':
            print "loading " + args.load_model
            lstm = nn.LSTM(input_size=args.embedding_size,
                           hidden_size=args.hidden_size)
            lstm.load_state_dict(torch.load(args.load_model))
            optimizer = Adam(lstm.parameters())
            if args.cuda:
                lstm.cuda()
        else:
            print "loading " + args.load_model
            cnn = nn.Conv1d(in_channels=args.embedding_size,
                            out_channels=args.hidden_size,
                            kernel_size=3,
                            padding=1)
            cnn.load_state_dict(torch.load(args.load_model))
            optimizer = Adam(cnn.parameters())
            if args.cuda:
                cnn.cuda()
    else:
        if args.model == 'lstm':
            print "training lstm"
            lstm = nn.LSTM(input_size=args.embedding_size,
                           hidden_size=args.hidden_size)
            optimizer = Adam(lstm.parameters())
            if args.cuda:
                lstm.cuda()
        else:
            print "training cnn"
            cnn = nn.Conv1d(in_channels=args.embedding_size,
                            out_channels=args.hidden_size,
                            kernel_size=3,
                            padding=1)
            optimizer = Adam(cnn.parameters())
            if args.cuda:
                cnn.cuda()

    if args.save_model:
        # find the next unused model number under the model directory
        if args.model == 'lstm':
            lstm_model_nums = []
            for d in os.listdir("lstm_models"):
                if "lstm_model" in d:
                    num = int(d[len("lstm_model"):])
                    lstm_model_nums.append(num)
            if len(lstm_model_nums) > 0:
                new_model_num = max(lstm_model_nums) + 1
            else:
                new_model_num = 0
            print "creating new model lstm_models/lstm_model" + \
                str(new_model_num)
            os.makedirs("lstm_models/lstm_model" + str(new_model_num))
        else:
            cnn_model_nums = []
            for d in os.listdir("cnn_models"):
                if "cnn_model" in d:
                    num = int(d[len("cnn_model"):])
                    cnn_model_nums.append(num)
            if len(cnn_model_nums) > 0:
                new_model_num = max(cnn_model_nums) + 1
            else:
                new_model_num = 0
            print "creating new model cnn_models/cnn_model" + \
                str(new_model_num)
            os.makedirs("cnn_models/cnn_model" + str(new_model_num))

    # lstm tutorial: http://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html
    # lstm documentation: http://pytorch.org/docs/master/nn.html?highlight=nn%20lstm#torch.nn.LSTM
    count = 1
    total_loss = 0.0
    time_begin = datetime.now()
    for epoch in range(10):
        print "epoch = " + str(epoch)
        for batch in training_batches:
            optimizer.zero_grad()
            if count % 10 == 0:
                print count
                print "average loss: " + str(total_loss / float(count))
                print "time for 10 batches: " + str(
                    datetime.now() - time_begin)
                time_begin = datetime.now()
            titles, bodies, triples = batch
            title_length, title_num_questions = titles.shape
            body_length, body_num_questions = bodies.shape
            title_embeddings, body_embeddings = corpus.get_embeddings(
                titles, bodies, vocab_map, embeddings)

            # title
            if args.model == 'lstm':
                if args.cuda:
                    title_inputs = autograd.Variable(
                        torch.FloatTensor(title_embeddings).cuda())
                    title_inputs = title_inputs.view(
                        title_length, title_num_questions, -1)
                    title_hidden = (
                        autograd.Variable(torch.zeros(
                            1, title_num_questions,
                            args.hidden_size).cuda()),
                        autograd.Variable(torch.zeros(
                            1, title_num_questions,
                            args.hidden_size).cuda()))
                else:
                    title_inputs = autograd.Variable(
                        torch.FloatTensor(title_embeddings))
                    title_inputs = title_inputs.view(
                        title_length, title_num_questions, -1)
                    title_hidden = (
                        autograd.Variable(torch.zeros(
                            1, title_num_questions, args.hidden_size)),
                        autograd.Variable(torch.zeros(
                            1, title_num_questions, args.hidden_size)))
            else:
                if args.cuda:
                    title_inputs = autograd.Variable(
                        torch.FloatTensor(title_embeddings).cuda())
                else:
                    title_inputs = autograd.Variable(
                        torch.FloatTensor(title_embeddings))
                # nn.Conv1d expects (batch, in_channels, seq_len)
                title_inputs = title_inputs.transpose(0, 1).transpose(1, 2)

            if args.model == 'lstm':
                title_out, title_hidden = lstm(title_inputs, title_hidden)
            else:
                title_out = cnn(title_inputs)
                title_out = F.tanh(title_out)
                title_out = title_out.transpose(1, 2).transpose(0, 1)

            # average all words of each question from title_out;
            # title_out is (max sequence length) x (batch size) x (hidden size)
            average_title_out = average_questions(title_out, titles,
                                                  padding_id)

            # body (same encoding path as the titles)
            if args.model == 'lstm':
                if args.cuda:
                    body_inputs = autograd.Variable(
                        torch.FloatTensor(body_embeddings).cuda())
                    body_inputs = body_inputs.view(
                        body_length, body_num_questions, -1)
                    body_hidden = (
                        autograd.Variable(torch.zeros(
                            1, body_num_questions,
                            args.hidden_size).cuda()),
                        autograd.Variable(torch.zeros(
                            1, body_num_questions,
                            args.hidden_size).cuda()))
                else:
                    body_inputs = autograd.Variable(
                        torch.FloatTensor(body_embeddings))
                    body_inputs = body_inputs.view(
                        body_length, body_num_questions, -1)
                    body_hidden = (
                        autograd.Variable(torch.zeros(
                            1, body_num_questions, args.hidden_size)),
                        autograd.Variable(torch.zeros(
                            1, body_num_questions, args.hidden_size)))
            else:
                if args.cuda:
                    body_inputs = autograd.Variable(
                        torch.FloatTensor(body_embeddings).cuda())
                else:
                    body_inputs = autograd.Variable(
                        torch.FloatTensor(body_embeddings))
                body_inputs = body_inputs.transpose(0, 1).transpose(1, 2)

            if args.model == 'lstm':
                body_out, body_hidden = lstm(body_inputs, body_hidden)
            else:
                body_out = cnn(body_inputs)
                body_out = F.tanh(body_out)
                body_out = body_out.transpose(1, 2).transpose(0, 1)

            average_body_out = average_questions(body_out, bodies, padding_id)
            count += 1

            # average body and title representations of the questions as
            # found by the encoder
            hidden = (average_title_out + average_body_out) * 0.5

            # each row of triples is [query, positive, negatives...];
            # look up the corresponding question vectors
            if args.cuda:
                triples_vectors = hidden[torch.LongTensor(
                    triples.ravel()).cuda()]
            else:
                triples_vectors = hidden[torch.LongTensor(triples.ravel())]
            triples_vectors = triples_vectors.view(
                triples.shape[0], triples.shape[1], args.hidden_size)

            query = triples_vectors[:, 0, :].unsqueeze(1)
            examples = triples_vectors[:, 1:, :]
            cos_similarity = F.cosine_similarity(query, examples, dim=2)

            # the positive example sits at index 0 of each candidate list
            if args.cuda:
                targets = autograd.Variable(
                    torch.zeros(triples.shape[0]).type(
                        torch.LongTensor).cuda())
            else:
                targets = autograd.Variable(
                    torch.zeros(triples.shape[0]).type(torch.LongTensor))

            # outputs a Variable; by default, the losses are averaged over
            # observations for each minibatch
            if args.cuda:
                loss = F.multi_margin_loss(cos_similarity, targets,
                                           margin=args.margin).cuda()
            else:
                loss = F.multi_margin_loss(cos_similarity, targets,
                                           margin=args.margin)
            total_loss += loss.cpu().data.numpy()[0]
            loss.backward()
            optimizer.step()

        result_headers = ['Epoch', 'MAP', 'MRR', 'P@1', 'P@5']
        with open(os.path.join(sys.path[0], args.results_file),
                  'a') as evaluate_file:
            writer = csv.writer(evaluate_file, dialect='excel')
            writer.writerow(result_headers)
        if args.model == 'lstm':
            evaluation(args, padding_id, ids_corpus, vocab_map, embeddings,
                       lstm, epoch)
        else:
            evaluation(args, padding_id, ids_corpus, vocab_map, embeddings,
                       cnn, epoch)

        if args.save_model:
            if args.model == 'lstm':
                print "Saving lstm model epoch " + str(epoch) + \
                    " to lstm_model" + str(new_model_num)
                torch.save(lstm.state_dict(),
                           "lstm_models/lstm_model" + str(new_model_num) +
                           "/epoch" + str(epoch))
            else:
                print "Saving cnn model epoch " + str(epoch) + \
                    " to cnn_model" + str(new_model_num)
                torch.save(cnn.state_dict(),
                           "cnn_models/cnn_model" + str(new_model_num) +
                           "/epoch" + str(epoch))
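# ---------------------------------------------------------------------------
# Tiny worked example of the ranking loss used above (illustrative numbers
# only, not part of the training pipeline). With the positive candidate at
# index 0 and margin 0.2, only negatives scoring within the margin of the
# positive contribute:
#
#     loss = sum_{j != 0} max(0, margin - sims[0] + sims[j]) / num_candidates
#          = (max(0, 0.2 - 0.9 + 0.5) + max(0, 0.2 - 0.9 + 0.85)) / 3
#          = (0.0 + 0.15) / 3 = 0.05
# ---------------------------------------------------------------------------


def _multi_margin_loss_example():
    """Sanity check of F.multi_margin_loss on hand-picked similarities."""
    sims = autograd.Variable(torch.FloatTensor([[0.9, 0.5, 0.85]]))
    target = autograd.Variable(torch.LongTensor([0]))
    return F.multi_margin_loss(sims, target, margin=0.2)  # ~0.05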
def main(args):
    """Perform domain transfer using an adversarial discriminative network.

    Example usage:
        python adversarial_domain.py --ubuntu_path ../askubuntu \
            --android_path ../Android --embeddings ../glove.pruned.txt.gz
    """
    ubuntu_corpus = os.path.join(args.ubuntu_path, 'text_tokenized.txt.gz')
    android_corpus = os.path.join(args.android_path, 'corpus.tsv.gz')
    ubuntu_raw_corpus = corpus.read_corpus(ubuntu_corpus)
    android_raw_corpus = corpus.read_corpus(android_corpus)
    list_words, vocab_map, embeddings, padding_id = corpus.load_embeddings(
        corpus.load_embedding_iterator(args.embeddings))
    print "loaded embeddings"
    ubuntu_ids_corpus = corpus.map_corpus(vocab_map, ubuntu_raw_corpus)
    android_ids_corpus = corpus.map_corpus(vocab_map, android_raw_corpus)
    ubuntu_train = os.path.join(args.ubuntu_path, 'train_random.txt')
    ubuntu_train_annotations = corpus.read_annotations(ubuntu_train)
    print "number of ubuntu training annotations: " + str(
        len(ubuntu_train_annotations))
    ubuntu_training_batches = corpus.create_batches(ubuntu_ids_corpus,
                                                    ubuntu_train_annotations,
                                                    args.batch_size,
                                                    padding_id)
    print "got ubuntu batches"

    if args.load_model:
        if args.model == 'lstm':
            print "loading " + args.load_model
            # input_size 300 matches the GloVe embedding dimension
            lstm = nn.LSTM(input_size=300, hidden_size=args.hidden_size)
            lstm.load_state_dict(torch.load(args.load_model))
            optimizer = Adam(lstm.parameters())
            if args.cuda:
                lstm.cuda()
        else:
            print "loading " + args.load_model
            cnn = nn.Conv1d(in_channels=300,
                            out_channels=args.hidden_size,
                            kernel_size=3,
                            padding=1)
            cnn.load_state_dict(torch.load(args.load_model))
            optimizer = Adam(cnn.parameters())
            if args.cuda:
                cnn.cuda()
    else:
        if args.model == 'lstm':
            print "training lstm"
            lstm = nn.LSTM(input_size=300, hidden_size=args.hidden_size)
            optimizer = Adam(lstm.parameters())
            if args.cuda:
                lstm.cuda()
        else:
            print "training cnn"
            cnn = nn.Conv1d(in_channels=300,
                            out_channels=args.hidden_size,
                            kernel_size=3,
                            padding=1)
            optimizer = Adam(cnn.parameters())
            if args.cuda:
                cnn.cuda()

    feed_forward = FeedForward(args)
    if args.cuda:
        feed_forward.cuda()
    # The discriminator's gradients come from the -args.lam *
    # domain_classifier_loss term inside combined_loss, so its optimizer
    # uses a negative learning rate: stepping against the negated gradient
    # minimizes the discriminator's own cross-entropy while the encoder is
    # updated to maximize it.
    feed_forward_optimizer = Adam(feed_forward.parameters(), lr=-0.001)

    android_dev_pos_path = os.path.join(args.android_path, 'dev.pos.txt')
    android_dev_neg_path = os.path.join(args.android_path, 'dev.neg.txt')
    android_dev_annotations = android_pairs_to_annotations(
        android_dev_pos_path, android_dev_neg_path)

    count = 1
    total_encoder_loss = 0.0
    total_domain_loss = 0.0
    total_loss = 0.0
    time_begin = datetime.now()
    time_begin_epoch = datetime.now()
    for epoch in range(20):
        print "epoch = " + str(epoch)
        for batch in ubuntu_training_batches:
            titles, bodies, triples = batch
            optimizer.zero_grad()
            feed_forward_optimizer.zero_grad()
            if count % 10 == 0:
                print count
                print "average encoder loss: " + str(
                    total_encoder_loss / float(count))
                print "average domain loss: " + str(
                    total_domain_loss / float(count))
                print "average loss: " + str(total_loss / float(count))
                print "time for 10 batches: " + str(
                    datetime.now() - time_begin)
                time_begin = datetime.now()
            count += 1

            # sample one batch from each domain for the discriminator
            ubuntu_batch = corpus.domain_classifier_batch(
                ubuntu_ids_corpus, ubuntu_train_annotations, padding_id)
            ubuntu_titles, ubuntu_bodies, _ = ubuntu_batch
            android_batch = corpus.domain_classifier_batch(
                android_ids_corpus, android_dev_annotations, padding_id)
            android_titles, android_bodies, _ = android_batch

            if args.model == 'lstm':
                model = lstm
            else:
                model = cnn
            hidden_ubuntu = vectorize_question(args, batch, model, vocab_map,
                                               embeddings, padding_id)
            hidden_ubuntu_domain = vectorize_question(
                args, ubuntu_batch, model, vocab_map, embeddings, padding_id)
            hidden_android_domain = vectorize_question(
                args, android_batch, model, vocab_map, embeddings, padding_id)
            hidden_combined = torch.cat(
                (hidden_ubuntu_domain, hidden_android_domain))

            output = feed_forward(hidden_combined)
            # label ubuntu questions 1 and android questions 0
            domain_labels = [1] * int(hidden_ubuntu_domain.size()[0]) + \
                            [0] * int(hidden_android_domain.size()[0])
            if args.cuda:
                domain_labels = autograd.Variable(
                    torch.LongTensor(domain_labels).cuda())
            else:
                domain_labels = autograd.Variable(
                    torch.LongTensor(domain_labels))

            # each row of triples is [query, positive, negatives...]
            if args.cuda:
                triples_vectors = hidden_ubuntu[torch.LongTensor(
                    triples.ravel()).cuda()]
            else:
                triples_vectors = hidden_ubuntu[torch.LongTensor(
                    triples.ravel())]
            triples_vectors = triples_vectors.view(
                triples.shape[0], triples.shape[1], args.hidden_size)

            query = triples_vectors[:, 0, :].unsqueeze(1)
            examples = triples_vectors[:, 1:, :]
            cos_similarity = F.cosine_similarity(query, examples, dim=2)

            if args.cuda:
                targets = autograd.Variable(
                    torch.zeros(triples.shape[0]).type(
                        torch.LongTensor).cuda())
            else:
                targets = autograd.Variable(
                    torch.zeros(triples.shape[0]).type(torch.LongTensor))

            if args.cuda:
                encoder_loss = F.multi_margin_loss(
                    cos_similarity, targets, margin=args.margin).cuda()
            else:
                encoder_loss = F.multi_margin_loss(
                    cos_similarity, targets, margin=args.margin)
            total_encoder_loss += encoder_loss.cpu().data.numpy()[0]

            if args.cuda:
                domain_classifier_loss = F.cross_entropy(
                    output, domain_labels).cuda()
            else:
                domain_classifier_loss = F.cross_entropy(
                    output, domain_labels)
            total_domain_loss += domain_classifier_loss.cpu().data.numpy()[0]

            # the encoder minimizes its ranking loss while maximizing the
            # discriminator's loss; the discriminator trains through its
            # negative learning rate (see above)
            combined_loss = encoder_loss - args.lam * domain_classifier_loss
            total_loss += combined_loss.cpu().data.numpy()[0]
            combined_loss.backward()
            optimizer.step()
            feed_forward_optimizer.step()

        print "time for one epoch: " + str(datetime.now() - time_begin_epoch)
        time_begin_epoch = datetime.now()
        evaluation(args, padding_id, android_ids_corpus, vocab_map,
                   embeddings, model)
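# ---------------------------------------------------------------------------
# The negative discriminator learning rate above is one way to make both
# players step in the right direction from a single backward pass over
# combined_loss. An equivalent alternative (a sketch, not part of the
# original code) is a gradient reversal layer (Ganin & Lempitsky, 2015):
# the forward pass is the identity and the backward pass multiplies
# gradients by -lam, so the discriminator can minimize an ordinary
# cross-entropy with an ordinary positive-lr optimizer while the encoder
# still receives reversed gradients. Usage would look like
#
#     output = feed_forward(GradReverse(args.lam)(hidden_combined))
#     loss = encoder_loss + F.cross_entropy(output, domain_labels)
#
# with both optimizers stepping at positive learning rates. This uses the
# legacy (pre-0.4) autograd.Function style to match the rest of this code.
# ---------------------------------------------------------------------------


class GradReverse(autograd.Function):
    """Identity in the forward pass; scales gradients by -lam going back."""

    def __init__(self, lam):
        self.lam = lam

    def forward(self, x):
        return x.view_as(x)

    def backward(self, grad_output):
        return grad_output * -self.lam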
def evaluation(args, padding_id, ids_corpus, vocab_map, embeddings, model,
               epoch):
    """Evaluate the model on args.test and log MAP, MRR, P@1, and P@5."""
    print "starting evaluation"
    val_data = corpus.read_annotations(args.test)
    print "number of lines in test data: " + str(len(val_data))
    val_batches = corpus.create_eval_batches(ids_corpus, val_data, padding_id)
    similarities = []
    for batch in val_batches:
        titles, bodies, qlabels = batch
        title_length, title_num_questions = titles.shape
        body_length, body_num_questions = bodies.shape
        title_embeddings, body_embeddings = corpus.get_embeddings(
            titles, bodies, vocab_map, embeddings)

        # title
        if args.model == 'lstm':
            if args.cuda:
                title_inputs = autograd.Variable(
                    torch.FloatTensor(title_embeddings).cuda())
                title_inputs = title_inputs.view(
                    title_length, title_num_questions, -1)
                title_hidden = (
                    autograd.Variable(torch.zeros(
                        1, title_num_questions, args.hidden_size).cuda()),
                    autograd.Variable(torch.zeros(
                        1, title_num_questions, args.hidden_size).cuda()))
            else:
                title_inputs = autograd.Variable(
                    torch.FloatTensor(title_embeddings))
                title_inputs = title_inputs.view(
                    title_length, title_num_questions, -1)
                title_hidden = (
                    autograd.Variable(torch.zeros(
                        1, title_num_questions, args.hidden_size)),
                    autograd.Variable(torch.zeros(
                        1, title_num_questions, args.hidden_size)))
        else:
            if args.cuda:
                title_inputs = autograd.Variable(
                    torch.FloatTensor(title_embeddings).cuda())
            else:
                title_inputs = autograd.Variable(
                    torch.FloatTensor(title_embeddings))
            title_inputs = title_inputs.transpose(0, 1).transpose(1, 2)

        if args.model == 'lstm':
            title_out, title_hidden = model(title_inputs, title_hidden)
        else:
            title_out = model(title_inputs)
            title_out = F.tanh(title_out)
            title_out = title_out.transpose(1, 2).transpose(0, 1)

        average_title_out = average_questions(title_out, titles, padding_id)

        # body (same encoding path as the titles)
        if args.model == 'lstm':
            if args.cuda:
                body_inputs = autograd.Variable(
                    torch.FloatTensor(body_embeddings).cuda())
                body_inputs = body_inputs.view(
                    body_length, body_num_questions, -1)
                body_hidden = (
                    autograd.Variable(torch.zeros(
                        1, body_num_questions, args.hidden_size).cuda()),
                    autograd.Variable(torch.zeros(
                        1, body_num_questions, args.hidden_size).cuda()))
            else:
                body_inputs = autograd.Variable(
                    torch.FloatTensor(body_embeddings))
                body_inputs = body_inputs.view(
                    body_length, body_num_questions, -1)
                body_hidden = (
                    autograd.Variable(torch.zeros(
                        1, body_num_questions, args.hidden_size)),
                    autograd.Variable(torch.zeros(
                        1, body_num_questions, args.hidden_size)))
        else:
            if args.cuda:
                body_inputs = autograd.Variable(
                    torch.FloatTensor(body_embeddings).cuda())
            else:
                body_inputs = autograd.Variable(
                    torch.FloatTensor(body_embeddings))
            body_inputs = body_inputs.transpose(0, 1).transpose(1, 2)

        if args.model == 'lstm':
            body_out, body_hidden = model(body_inputs, body_hidden)
        else:
            body_out = model(body_inputs)
            body_out = F.tanh(body_out)
            body_out = body_out.transpose(1, 2).transpose(0, 1)

        # average all words of each question from body_out
        average_body_out = average_questions(body_out, bodies, padding_id)

        # average body and title representations of the questions as found
        # by the encoder, e.g. 560 x 100
        hidden = (average_title_out + average_body_out) * 0.5

        # first row is the query; the remaining rows are the candidates
        query = hidden[0].unsqueeze(0)
        examples = hidden[1:]
        cos_similarity = F.cosine_similarity(query, examples, dim=1)
        cos_similarity_np = cos_similarity.cpu().data.numpy()
        # candidate labels sorted from most to least similar
        ranked_similarities = np.argsort(-1 * cos_similarity_np)
        positive_similarity = qlabels[ranked_similarities]
        similarities.append(positive_similarity)

    evaluator = Evaluation(similarities)
    metrics = [
        epoch,
        evaluator.MAP(),
        evaluator.MRR(),
        evaluator.Precision(1),
        evaluator.Precision(5)
    ]
    print "precision at 1: " + str(evaluator.Precision(1))
    print "precision at 5: " + str(evaluator.Precision(5))
    print "MAP: " + str(evaluator.MAP())
    print "MRR: " + str(evaluator.MRR())
    with open(os.path.join(sys.path[0], args.results_file),
              'a') as evaluate_file:
        writer = csv.writer(evaluate_file, dialect='excel')
        writer.writerow(metrics)
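# ---------------------------------------------------------------------------
# How the ranked label lists feed the metrics (illustrative numbers only).
# If a query's candidates score [0.9, 0.2, 0.7] with labels [1, 0, 1],
# argsort of the negated scores gives the ranking [0, 2, 1], so the label
# list handed to Evaluation is [1, 1, 0]: P@1 = 1.0 and the average
# precision is (1/1 + 2/2) / 2 = 1.0.
# ---------------------------------------------------------------------------


def _ranking_example():
    """Mirrors the argsort logic in evaluation(); not part of the pipeline."""
    scores = np.array([0.9, 0.2, 0.7])
    labels = np.array([1, 0, 1])
    ranked = np.argsort(-1 * scores)  # [0, 2, 1]
    return labels[ranked]             # [1, 1, 0]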