def main():
    ###############################################################################
    # Load data
    ###############################################################################
    dictionary = data.Dictionary()
    train_corpus = data.Corpus(dictionary)
    dev_corpus = data.Corpus(dictionary)
    test_corpus = data.Corpus(dictionary)

    task_names = ['snli', 'multinli'] if args.task == 'allnli' else [args.task]
    for task in task_names:
        skip_first_line = True if task == 'sick' else False
        train_corpus.parse(task, args.data, 'train.txt', args.tokenize,
                           num_examples=args.max_example, skip_first_line=skip_first_line)
        if task == 'multinli':
            dev_corpus.parse(task, args.data, 'dev_matched.txt', args.tokenize)
            dev_corpus.parse(task, args.data, 'dev_mismatched.txt', args.tokenize)
            test_corpus.parse(task, args.data, 'test_matched.txt', args.tokenize,
                              is_test_corpus=False)
            test_corpus.parse(task, args.data, 'test_mismatched.txt', args.tokenize,
                              is_test_corpus=False)
        else:
            dev_corpus.parse(task, args.data, 'dev.txt', args.tokenize,
                             skip_first_line=skip_first_line)
            test_corpus.parse(task, args.data, 'test.txt', args.tokenize,
                              is_test_corpus=False, skip_first_line=skip_first_line)

    print('train set size = ', len(train_corpus.data))
    print('development set size = ', len(dev_corpus.data))
    print('test set size = ', len(test_corpus.data))
    print('vocabulary size = ', len(dictionary))

    # save the dictionary object to use during testing
    helper.save_object(dictionary, args.save_path + args.task + '_dictionary.pkl')

    embeddings_index = helper.load_word_embeddings(args.word_vectors_directory,
                                                   args.word_vectors_file, dictionary.word2idx)
    print('number of OOV words = ', len(dictionary) - len(embeddings_index))

    # ###############################################################################
    # # Build the model
    # ###############################################################################
    model = SentenceClassifier(dictionary, embeddings_index, args)
    optim_fn, optim_params = helper.get_optimizer(args.optimizer)
    optimizer = optim_fn(filter(lambda p: p.requires_grad, model.parameters()), **optim_params)
    best_acc = 0

    if args.cuda:
        model = model.cuda()

    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc = checkpoint['best_acc']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # ###############################################################################
    # # Train the model
    # ###############################################################################
    train = Train(model, optimizer, dictionary, embeddings_index, args, best_acc)
    bestmodel = train.train_epochs(train_corpus, dev_corpus, args.start_epoch, args.epochs)
    test_batches = helper.batchify(test_corpus.data, args.batch_size)
    if 'multinli' in task_names:
        print('Skipping evaluating best model. Evaluate using the test script.')
    else:
        test_accuracy, test_f1 = evaluate(bestmodel, test_batches, dictionary)
        print('accuracy: %.2f%%' % test_accuracy)
        print('f1: %.2f%%' % test_f1)
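# ------------------------------------------------------------------------------
# Hypothetical sketch (not part of the original script): the command-line flags
# that main() reads, reconstructed from the args.* attributes used above. The
# flag names mirror those attributes; the defaults and help strings here are
# assumptions for illustration only, not the repo's actual values.
import argparse

parser = argparse.ArgumentParser(description='NLI sentence classifier (sketch)')
parser.add_argument('--task', default='snli', help="snli, multinli, allnli or sick")
parser.add_argument('--data', default='data/', help='root directory of the corpora')
parser.add_argument('--tokenize', action='store_true', help='tokenize the input text')
parser.add_argument('--max_example', type=int, default=-1, help='cap on the number of training examples')
parser.add_argument('--save_path', default='output/', help='where the dictionary and checkpoints are saved')
parser.add_argument('--word_vectors_directory', default='glove/', help='directory of pretrained embeddings')
parser.add_argument('--word_vectors_file', default='glove.840B.300d.txt', help='embedding file name')
parser.add_argument('--optimizer', default='adam', help='optimizer spec passed to helper.get_optimizer')
parser.add_argument('--batch_size', type=int, default=32, help='batch size')
parser.add_argument('--start_epoch', type=int, default=0, help='first epoch (overridden when resuming)')
parser.add_argument('--epochs', type=int, default=20, help='number of training epochs')
parser.add_argument('--resume', default='', help='path of a checkpoint to resume from')
parser.add_argument('--cuda', action='store_true', help='train on GPU')
args = parser.parse_args()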
    if outfile:
        target_names = ['entailment', 'neutral', 'contradiction']
        with open(outfile, 'w') as f:
            f.write('pairID,gold_label' + '\n')
            for item in output:
                f.write(str(item[0]) + ',' + target_names[item[1]] + '\n')
    else:
        return 100. * n_correct / n_total, 100. * f1_score(numpy.asarray(y_true),
                                                           numpy.asarray(y_preds),
                                                           average='weighted')


if __name__ == "__main__":
    dictionary = helper.load_object(args.save_path + 'dictionary.p')
    embeddings_index = helper.load_word_embeddings(args.word_vectors_directory,
                                                   args.word_vectors_file, dictionary.word2idx)
    model = SentenceClassifier(dictionary, embeddings_index, args)
    if args.cuda:
        model = model.cuda()
    helper.load_model_states_from_checkpoint(model, args.save_path + 'model_best.pth.tar',
                                             'state_dict', args.cuda)
    print('vocabulary size = ', len(dictionary))

    task_names = ['snli', 'multinli'] if args.task == 'allnli' else [args.task]
    for task in task_names:
        if task == 'multinli' and args.test != 'train':
            for partition in ['_matched', '_mismatched']:
                test_corpus = data.Corpus(dictionary)
                test_corpus.parse(args.data + task + '/', args.test + partition + '.txt',
                                  args.tokenize, is_test_corpus=True)
                print('[' + partition[1:] + '] dataset size = ', len(test_corpus.data))
                test_batches = helper.batchify(test_corpus.data, args.batch_size)
                if args.test == 'test':
ancestor4goAll3 = pickle.load(open('GOANCESTORS_full3ont.pickle', "rb"))


if __name__ == "__main__":
    filefullpath = args.scoreOutput + args.nameExpression + str(
        args.pairStartIndex) + "." + str(args.pairEndIndex) + ".txt"

    # print("loading dictionary/embedding")
    dictionary = helper.load_object(args.save_path + 'gene_dictionary.p')
    embeddings_index = helper.load_word_embeddings(args.word_vectors_directory,
                                                   args.word_vectors_file, dictionary.word2idx)

    print("loading model")
    # print (args)
    model = SentenceClassifier(dictionary, embeddings_index, args, select_method='max')
    if args.cuda:
        model = model.cuda()
    helper.load_model_states_from_checkpoint(
        model, args.save_path + 'model_best.pth.tar', 'state_dict', args.cuda)
    print('vocabulary size = ', len(dictionary))

    annotationBP = pickle.load(open(args.goAnnotationFile + "goBP.cPickle", "rb"))
    annotationCC = pickle.load(open(args.goAnnotationFile + "goCC.cPickle", "rb"))
    annotationMF = pickle.load(open(args.goAnnotationFile + "goMF.cPickle", "rb"))
    annotationAll3 = pickle.load(open(args.goAnnotationFile + "go3ontology.cPickle", "rb"))
    print('train set size = ', len(train_corpus.data))
    print('development set size = ', len(dev_corpus.data))
    print('test set size = ', len(test_corpus.data))
    print('vocabulary size = ', len(dictionary))

    # save the dictionary object to use during testing
    helper.save_object(dictionary, args.save_path + 'dictionary.p')

    embeddings_index = helper.load_word_embeddings(args.word_vectors_directory,
                                                   args.word_vectors_file, dictionary.word2idx)
    print('number of OOV words = ', len(dictionary) - len(embeddings_index))

    # ###############################################################################
    # # Build the model
    # ###############################################################################
    model = SentenceClassifier(dictionary, embeddings_index, args)
    optim_fn, optim_params = helper.get_optimizer(args.optimizer)
    optimizer = optim_fn(filter(lambda p: p.requires_grad, model.parameters()), **optim_params)
    best_acc = 0

    # for training on multiple GPUs, use CUDA_VISIBLE_DEVICES=0,1 to specify which GPUs to use
    if 'CUDA_VISIBLE_DEVICES' in os.environ:
        cuda_visible_devices = [int(x) for x in os.environ['CUDA_VISIBLE_DEVICES'].split(',')]
        if len(cuda_visible_devices) > 1:
            model = torch.nn.DataParallel(model, device_ids=cuda_visible_devices)
    if args.cuda:
        model = model.cuda()

    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
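# ------------------------------------------------------------------------------
# Minimal sketch (an assumption, not the repo's actual helper): one common way
# helper.get_optimizer could map an optimizer spec string such as "adam" or
# "sgd,lr=0.1" to a torch.optim constructor plus keyword arguments, matching how
# optim_fn and optim_params are consumed above. The real helper may differ.
import torch.optim as optim

def get_optimizer(spec):
    parts = spec.split(',')
    known = {'sgd': optim.SGD, 'adam': optim.Adam, 'adagrad': optim.Adagrad,
             'adadelta': optim.Adadelta, 'rmsprop': optim.RMSprop}
    optim_fn = known[parts[0].lower()]
    # remaining "key=value" pairs become numeric keyword arguments, e.g. lr=0.1
    optim_params = {k: float(v) for k, v in (kv.split('=') for kv in parts[1:])}
    return optim_fn, optim_params

# usage mirroring the training scripts above:
# optim_fn, optim_params = get_optimizer('sgd,lr=0.1')
# optimizer = optim_fn(filter(lambda p: p.requires_grad, model.parameters()), **optim_params)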