# Imports used by this evaluation excerpt. Corpus, FileLoader, make_batch,
# location_of_correct, get_mrr and get_fraction_correct_at are project-local
# helpers; cosine_similarity is assumed to be sklearn's pairwise version.
import os

import numpy as np
import torch
import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from torch.autograd import Variable


def evaluate(model_path, corpus_path, pairs_path, batch_size=100):
    model = torch.load(model_path)
    model = model.cuda()
    model.eval()

    corpus = Corpus([(corpus_path, os.path.dirname(corpus_path))])
    pairs_batch_loader = FileLoader(
        [(pairs_path, os.path.dirname(pairs_path))], batch_size)

    code = []
    nl = []
    for data in tqdm.tqdm(pairs_batch_loader):
        data = map(corpus.get, data)
        batch = (make_batch(model.embedding_layer, data[0][0]),
                 make_batch(model.embedding_layer, data[1][0]))
        batch = [x.cuda() for x in batch]
        batch = (Variable(batch[0], volatile=True),
                 Variable(batch[1], volatile=True))
        # embed code and NL
        repr_left = model(batch[0])
        repr_right = model(batch[1])
        # accumulate for evaluation
        code.extend(repr_left.cpu().data.numpy())
        nl.extend(repr_right.cpu().data.numpy())

    code = np.array(code)
    nl = np.array(nl)
    # rank every code snippet against every NL query by cosine similarity;
    # the correct snippet for query i sits on the diagonal
    sim_mat = cosine_similarity(nl, code)
    ans_locs = location_of_correct(sim_mat)

    summary = {}
    mr = np.mean(ans_locs)
    mrr = get_mrr(ans_locs)
    summary["mrr"] = mrr

    cutoffs = [1, 5, 10]
    fracs = []
    for c in cutoffs:
        frac = get_fraction_correct_at(ans_locs, c)
        fracs.append(frac)

    print("Num obs: {}".format(code.shape[0]))
    print("Mean Rank: {}".format(mr))
    print("MRR: {}".format(mrr))
    for c, f in zip(cutoffs, fracs):
        print("Fraction Correct@{}: {}".format(c, f))
        summary["success@{}".format(c)] = f
    return summary
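# --- hedged sketch (not from the source): the ranking helpers called by
# evaluate() are not defined in this excerpt. The versions below are one
# plausible implementation consistent with how they are used (sim_mat rows
# are NL queries, columns are code snippets, correct match on the diagonal,
# 1-based ranks). Treat the exact semantics as assumptions.
def location_of_correct(sim_mat):
    # 1-based rank of the diagonal (correct) entry within each row;
    # higher similarity means a better (smaller) rank
    order = np.argsort(-sim_mat, axis=1)
    return np.array([int(np.where(order[i] == i)[0][0]) + 1
                     for i in range(sim_mat.shape[0])])


def get_mrr(ans_locs):
    # mean reciprocal rank over all queries
    return np.mean(1.0 / np.asarray(ans_locs, dtype='float64'))


def get_fraction_correct_at(ans_locs, cutoff):
    # fraction of queries whose correct answer ranks within the cutoff
    return float(np.mean(np.asarray(ans_locs) <= cutoff))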
def train(iter_cnt, model, corpus, args, optimizer):
    train_writer = FileWriter(args.run_path + "/train", flush_secs=5)

    pos_file_path = "{}.pos.txt".format(args.train)
    neg_file_path = "{}.neg.txt".format(args.train)
    pos_batch_loader = FileLoader(
        [(pos_file_path, os.path.dirname(args.train))], args.batch_size)
    neg_batch_loader = FileLoader(
        [(neg_file_path, os.path.dirname(args.train))], args.batch_size)
    # alternative negative samplers, kept for reference:
    # neg_batch_loader = RandomLoader(
    #     corpus=corpus,
    #     exclusive_set=zip(pos_batch_loader.data_left,
    #                       pos_batch_loader.data_right),
    #     batch_size=args.batch_size)
    # neg_batch_loader = CombinedLoader(
    #     neg_batch_loader_1, neg_batch_loader_2, args.batch_size)

    use_content = bool(args.use_content)

    embedding_layer = model.embedding_layer
    criterion = model.compute_loss

    start = time.time()
    tot_loss = 0.0
    tot_cnt = 0
    for batch, labels in tqdm(
            pad_iter(corpus, embedding_layer, pos_batch_loader,
                     neg_batch_loader, use_content, pad_left=False)):
        iter_cnt += 1
        model.zero_grad()
        labels = labels.type(torch.LongTensor)

        # with --use_content each side is a pair of sequences; flatten the
        # nested batch into a single list of tensors
        if args.use_content:
            batch = [y for x in batch for y in x]

        if args.cuda:
            batch = [x.cuda() for x in batch]
            labels = labels.cuda()
        batch = map(Variable, batch)
        labels = Variable(labels)

        if not use_content:
            repr_left = model(batch[0])
            repr_right = model(batch[1])
        else:
            repr_left = model(batch[0]) + model(batch[1])
            repr_right = model(batch[2]) + model(batch[3])

        output = model.compute_similarity(repr_left, repr_right)
        loss = criterion(output, labels)
        loss.backward()

        # unused diagnostic: total absolute change in the embedding weights
        # from this step (.copy() so the snapshot is not aliased to the live
        # weights when they are already on the CPU)
        prev_emb = embedding_layer.embedding.weight.cpu().data.numpy().copy()
        optimizer.step()
        current_emb = embedding_layer.embedding.weight.cpu().data.numpy()
        diff = np.sum(np.absolute(current_emb - prev_emb))

        tot_loss += loss.data[0] * output.size(0)
        tot_cnt += output.size(0)

        if iter_cnt % 100 == 0:
            outputManager.say("\r" + " " * 50)
            outputManager.say("\r{} loss: {:.4f} eps: {:.0f} ".format(
                iter_cnt, tot_loss / tot_cnt,
                tot_cnt / (time.time() - start)))
            s = summary.scalar('loss', tot_loss / tot_cnt)
            train_writer.add_summary(s, iter_cnt)

    outputManager.say("\n")
    train_writer.close()
    # if model.criterion.startswith('classification'):
    #     print model.output_op.weight.min().data[0], \
    #           model.output_op.weight.max().data[0]
    return iter_cnt
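# --- hedged usage sketch (not from the source): train() is written to be
# driven once per epoch from an outer loop, threading iter_cnt through so
# the summary-writer step counter keeps increasing across epochs. The
# optimizer choice and names like args.max_epoch and args.lr are assumptions.
#
#     optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
#     iter_cnt = 0
#     for epoch in range(args.max_epoch):
#         iter_cnt = train(iter_cnt, model, corpus, args, optimizer)
#         evaluate(iter_cnt, args.eval, model, corpus, args)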
def evaluate(iter_cnt, filepath, model, corpus, args, logging=True):
    if logging:
        valid_writer = FileWriter(args.run_path + "/valid", flush_secs=5)

    pos_file_path = "{}.pos.txt".format(filepath)
    neg_file_path = "{}.neg.txt".format(filepath)
    pos_batch_loader = FileLoader(
        [(pos_file_path, os.path.dirname(args.eval))], args.batch_size)
    neg_batch_loader = FileLoader(
        [(neg_file_path, os.path.dirname(args.eval))], args.batch_size)

    batchify = lambda bch: make_batch(model.embedding_layer, bch)

    model.eval()
    criterion = model.compute_loss

    auc_meter = AUCMeter()
    # scores[0] accumulates negative-pair scores, scores[1] positive-pair
    scores = [np.asarray([], dtype='float32') for i in range(2)]
    for loader_id, loader in tqdm(
            enumerate((neg_batch_loader, pos_batch_loader))):
        for data in tqdm(loader):
            data = map(corpus.get, data)
            if not args.eval_use_content:
                batch = (batchify(data[0][0]), batchify(data[1][0]))
            else:
                batch = (map(batchify, data[0]), map(batchify, data[1]))
                batch = [y for x in batch for y in x]
            # loader_id doubles as the label: 0 for negatives, 1 for positives
            labels = torch.ones(batch[0].size(1)).type(
                torch.LongTensor) * loader_id
            if args.cuda:
                batch = [x.cuda() for x in batch]
                labels = labels.cuda()
            if not args.eval_use_content:
                batch = (Variable(batch[0], volatile=True),
                         Variable(batch[1], volatile=True))
            else:
                batch = (Variable(batch[0], volatile=True),
                         Variable(batch[1], volatile=True),
                         Variable(batch[2], volatile=True),
                         Variable(batch[3], volatile=True))
            labels = Variable(labels)

            if not args.eval_use_content:
                repr_left = model(batch[0])
                repr_right = model(batch[1])
            else:
                repr_left = model(batch[0]) + model(batch[1])
                repr_right = model(batch[2]) + model(batch[3])

            output = model.compute_similarity(repr_left, repr_right)
            if model.criterion.startswith('classification'):
                assert output.size(1) == 2
                output = nn.functional.log_softmax(output)
                current_scores = -output[:, loader_id].data.cpu().squeeze(
                ).numpy()
                output = output[:, 1]
            else:
                assert output.size(1) == 1
                current_scores = output.data.cpu().squeeze().numpy()
            auc_meter.add(output.data, labels.data)
            scores[loader_id] = np.append(scores[loader_id], current_scores)

    auc_score = auc_meter.value()
    auc10_score = auc_meter.value(0.1)
    auc05_score = auc_meter.value(0.05)
    auc02_score = auc_meter.value(0.02)
    auc01_score = auc_meter.value(0.01)
    if model.criterion.startswith('classification'):
        avg_score = (scores[1].mean() + scores[0].mean()) * 0.5
    else:
        avg_score = scores[1].mean() - scores[0].mean()

    outputManager.say(
        "\r[{}] auc(.01): {:.3f} auc(.02): {:.3f} auc(.05): {:.3f}"
        " auc(.1): {:.3f} auc: {:.3f}"
        " scores: {:.2f} ({:.2f} {:.2f})\n".format(
            os.path.basename(filepath).split('.')[0], auc01_score,
            auc02_score, auc05_score, auc10_score, auc_score, avg_score,
            scores[1].mean(), scores[0].mean()))

    if logging:
        s = summary.scalar('auc', auc_score)
        valid_writer.add_summary(s, iter_cnt)
        s = summary.scalar('auc (fpr<0.1)', auc10_score)
        valid_writer.add_summary(s, iter_cnt)
        s = summary.scalar('auc (fpr<0.05)', auc05_score)
        valid_writer.add_summary(s, iter_cnt)
        s = summary.scalar('auc (fpr<0.02)', auc02_score)
        valid_writer.add_summary(s, iter_cnt)
        s = summary.scalar('auc (fpr<0.01)', auc01_score)
        valid_writer.add_summary(s, iter_cnt)
        valid_writer.close()
    return auc05_score
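# --- hedged sketch (not from the source): AUCMeter above is a project-local
# meter whose value(max_fpr) appears to return the area under the ROC curve
# restricted to fpr <= max_fpr. A minimal stand-in using sklearn, rescaled to
# [0, 1] by dividing by max_fpr; the normalization convention is an
# assumption.
from sklearn.metrics import roc_curve


def partial_auc(targets, scores, max_fpr=1.0):
    fpr, tpr, _ = roc_curve(targets, scores)
    keep = fpr <= max_fpr
    # close the curve at max_fpr with a linearly interpolated TPR point
    fpr_part = np.append(fpr[keep], max_fpr)
    tpr_part = np.append(tpr[keep], np.interp(max_fpr, fpr, tpr))
    return np.trapz(tpr_part, fpr_part) / max_fpr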
        fb.write(json_data + '\n')
        # (tail of the preceding function; its definition is outside this
        # excerpt)


if __name__ == "__main__":
    parse = argparse.ArgumentParser()
    parse.add_argument('--dataset', default='ENZYMES', type=str,
                       help='dataset')
    args = parse.parse_args()

    config_file = osp.join(osp.dirname(osp.abspath(__file__)), 'config',
                           '%s.ini' % args.dataset)
    config = Config(config_file)
    set_seed(config.seed)
    G_data = FileLoader(args.dataset, config).load_data()

    training_process_data_file = generate_result_file_name(
        args.dataset, config)
    check_dir(training_process_data_file)
    train_fb = open(training_process_data_file, 'a+', encoding='utf-8')

    for fold_idx in range(config.fold):
        G_data.use_fold_data(fold_idx)
        train_graphs, test_graphs = G_data.train_graphs, G_data.test_graphs
        print('start training ------> fold', fold_idx + 1)
        print('train sample number: {} test sample number: {}'.format(
            len(train_graphs), len(test_graphs)))
        app_run(train_graphs, test_graphs, args.dataset, fold_idx, train_fb,
                config)
        print()
    train_fb.close()  # release the result-file handle once all folds finish
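# --- hedged sketch (not from the source): check_dir() is not defined in this
# excerpt; from how it is called on a result-file path right before open(),
# one plausible contract is "create the file's parent directory if missing".
import os
import os.path as osp


def check_dir(file_path):
    parent = osp.dirname(file_path)
    if parent and not osp.exists(parent):
        os.makedirs(parent)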
    model = Net(config, convolution_method)
    trainer = Trainer(config, model, G_data)
    trainer.train(acc_file, fold_idx)
    # (tail of app_run; its definition is outside this excerpt)


if __name__ == "__main__":
    parse = argparse.ArgumentParser()
    parse.add_argument('--dataset', default='NCI109', type=str,
                       help='dataset')
    parse.add_argument('--convolution', default='GCN', type=str,
                       help='GCN, GAT or GraphSage')
    args = parse.parse_args()

    config_file = osp.join(osp.dirname(osp.abspath(__file__)), 'config',
                           '%s.ini' % args.dataset)
    config = Config(config_file)
    set_seed(config.seed)
    G_data = FileLoader(args.dataset, config).load_data()

    acc_file = osp.join(osp.dirname(osp.abspath(__file__)), 'result',
                        args.convolution, args.dataset + '_result.txt')
    check_dir(acc_file)

    for fold_idx in range(config.fold):
        print('start training ------> fold', fold_idx + 1)
        start = time.time()
        app_run(config, G_data, fold_idx, acc_file, args.convolution)
        print('Total time cost in this fold: {:.2f}s'.format(
            time.time() - start))
        print()

    calculate_final_result(args.dataset, acc_file, args.convolution)
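# --- hedged sketch (not from the source): set_seed() is not defined in this
# excerpt; the usual implementation seeds every RNG the run touches so that
# the per-fold results are reproducible.
import random

import numpy as np
import torch


def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)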
def train(iter_cnt, model, domain_d, corpus, args, optimizer_encoder,
          optimizer_domain_d):
    train_writer = FileWriter(args.run_path + "/train", flush_secs=5)

    pos_file_path = "{}.pos.txt".format(args.train)
    neg_file_path = "{}.neg.txt".format(args.train)
    # for adversarial training, use only the natural-language portion of the
    # inputs from each domain
    train_corpus_path = os.path.dirname(args.train) + "/nl.tsv.gz"
    cross_train_corpus_path = os.path.dirname(args.cross_train) + "/nl.tsv.gz"

    use_content = bool(args.use_content)

    pos_batch_loader = FileLoader(
        [(pos_file_path, os.path.dirname(args.train))], args.batch_size)
    neg_batch_loader = FileLoader(
        [(neg_file_path, os.path.dirname(args.train))], args.batch_size)
    cross_loader = TwoDomainLoader(
        [(train_corpus_path, os.path.dirname(train_corpus_path))],
        [(cross_train_corpus_path,
          os.path.dirname(cross_train_corpus_path))],
        args.batch_size * 2)

    embedding_layer = model.embedding_layer
    criterion1 = model.compute_loss
    criterion2 = domain_d.compute_loss

    start = time.time()
    task_loss = 0.0
    task_cnt = 0
    domain_loss = 0.0
    dom_cnt = 0
    total_loss = 0.0
    total_cnt = 0
    for batch, labels, domain_batch, domain_labels in tqdm(
            cross_pad_iter(corpus, embedding_layer, pos_batch_loader,
                           neg_batch_loader, cross_loader, use_content,
                           pad_left=False)):
        iter_cnt += 1
        if args.use_content:
            # flatten the nested batch into a single list of tensors
            batch = [y for x in batch for y in x]
            domain_batch = [x for x in domain_batch]
        if args.cuda:
            batch = [x.cuda() for x in batch]
            labels = labels.cuda()
            if not use_content:
                domain_batch = domain_batch.cuda()
            else:
                domain_batch = [x.cuda() for x in domain_batch]
            domain_labels = domain_labels.cuda()
        batch = map(Variable, batch)
        labels = Variable(labels)
        if not use_content:
            domain_batch = Variable(domain_batch)
        else:
            domain_batch = map(Variable, domain_batch)
        domain_labels = Variable(domain_labels)

        model.zero_grad()
        domain_d.zero_grad()

        if not use_content:
            repr_left = model(batch[0])
            repr_right = model(batch[1])
        else:
            repr_left = model(batch[0]) + model(batch[1])
            repr_right = model(batch[2]) + model(batch[3])

        output = model.compute_similarity(repr_left, repr_right)
        loss1 = criterion1(output, labels)
        task_loss += loss1.data[0] * output.size(0)
        task_cnt += output.size(0)

        if not use_content:
            domain_output = domain_d(model(domain_batch))
        else:
            domain_output = domain_d(model(domain_batch[0])) + domain_d(
                model(domain_batch[1]))
        loss2 = criterion2(domain_output, domain_labels)
        domain_loss += loss2.data[0] * domain_output.size(0)
        dom_cnt += domain_output.size(0)

        # combined adversarial objective: task loss minus the weighted
        # domain-discriminator loss
        loss = loss1 - args.lambda_d * loss2
        total_loss += loss.data[0]
        total_cnt += 1

        loss.backward()
        optimizer_encoder.step()
        optimizer_domain_d.step()

        if iter_cnt % 100 == 0:
            outputManager.say("\r" + " " * 50)
            outputManager.say(
                "\r{} tot_loss: {:.4f} task_loss: {:.4f}"
                " domain_loss: {:.4f} eps: {:.0f} ".format(
                    iter_cnt, total_loss / total_cnt, task_loss / task_cnt,
                    domain_loss / dom_cnt,
                    (task_cnt + dom_cnt) / (time.time() - start)))
            s = summary.scalar('total_loss', total_loss / total_cnt)
            train_writer.add_summary(s, iter_cnt)
            s = summary.scalar('domain_loss', domain_loss / dom_cnt)
            train_writer.add_summary(s, iter_cnt)
            s = summary.scalar('task_loss', task_loss / task_cnt)
            train_writer.add_summary(s, iter_cnt)

    outputManager.say("\n")
    train_writer.close()
    return iter_cnt
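# --- hedged note (not from the source): the combined objective above,
#     loss = loss1 - args.lambda_d * loss2,
# pushes the encoder to confuse the domain discriminator through a single
# backward pass. The same negated term also reaches optimizer_domain_d; many
# adversarial codebases instead update the discriminator on +domain_loss, or
# fold the sign flip into a gradient-reversal layer. A minimal GRL sketch in
# this code's legacy autograd style (class name and lambd are illustrative):
class GradReverse(torch.autograd.Function):
    def __init__(self, lambd):
        super(GradReverse, self).__init__()
        self.lambd = lambd

    def forward(self, x):
        # identity on the forward pass
        return x.view_as(x)

    def backward(self, grad_output):
        # flip and scale the gradient on the way back
        return grad_output * -self.lambd

# usage:
#     domain_output = domain_d(GradReverse(args.lambda_d)(model(domain_batch)))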