Example no. 1
def main():
    ###############################################################################
    # Load data
    ###############################################################################

    dictionary = data.Dictionary()
    train_corpus = data.Corpus(dictionary)
    dev_corpus = data.Corpus(dictionary)
    test_corpus = data.Corpus(dictionary)

    task_names = ['snli', 'multinli'] if args.task == 'allnli' else [args.task]
    for task in task_names:
        skip_first_line = (task == 'sick')
        train_corpus.parse(task,
                           args.data,
                           'train.txt',
                           args.tokenize,
                           num_examples=args.max_example,
                           skip_first_line=skip_first_line)
        if task == 'multinli':
            dev_corpus.parse(task, args.data, 'dev_matched.txt', args.tokenize)
            dev_corpus.parse(task, args.data, 'dev_mismatched.txt',
                             args.tokenize)
            test_corpus.parse(task,
                              args.data,
                              'test_matched.txt',
                              args.tokenize,
                              is_test_corpus=False)
            test_corpus.parse(task,
                              args.data,
                              'test_mismatched.txt',
                              args.tokenize,
                              is_test_corpus=False)
        else:
            dev_corpus.parse(task,
                             args.data,
                             'dev.txt',
                             args.tokenize,
                             skip_first_line=skip_first_line)
            test_corpus.parse(task,
                              args.data,
                              'test.txt',
                              args.tokenize,
                              is_test_corpus=False,
                              skip_first_line=skip_first_line)

    print('train set size = ', len(train_corpus.data))
    print('development set size = ', len(dev_corpus.data))
    print('test set size = ', len(test_corpus.data))
    print('vocabulary size = ', len(dictionary))

    # save the dictionary object to use during testing
    helper.save_object(dictionary,
                       args.save_path + args.task + '_dictionary.pkl')

    embeddings_index = helper.load_word_embeddings(args.word_vectors_directory,
                                                   args.word_vectors_file,
                                                   dictionary.word2idx)
    print('number of OOV words = ', len(dictionary) - len(embeddings_index))

    ###############################################################################
    # Build the model
    ###############################################################################

    model = SentenceClassifier(dictionary, embeddings_index, args)
    optim_fn, optim_params = helper.get_optimizer(args.optimizer)
    optimizer = optim_fn(filter(lambda p: p.requires_grad, model.parameters()),
                         **optim_params)
    best_acc = 0

    if args.cuda:
        model = model.cuda()

    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc = checkpoint['best_acc']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    ###############################################################################
    # Train the model
    ###############################################################################

    train = Train(model, optimizer, dictionary, embeddings_index, args,
                  best_acc)
    bestmodel = train.train_epochs(train_corpus, dev_corpus, args.start_epoch,
                                   args.epochs)
    test_batches = helper.batchify(test_corpus.data, args.batch_size)
    if 'multinli' in task_names:
        print(
            'Skipping evaluating best model. Evaluate using the test script.')
    else:
        test_accuracy, test_f1 = evaluate(bestmodel, test_batches, dictionary)
        print('accuracy: %.2f%%' % test_accuracy)
        print('f1: %.2f%%' % test_f1)
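
The helper.save_object call above writes the dictionary to disk so that the test scripts in the later examples can rebuild exactly the same vocabulary. The helper module itself is not part of this listing; a minimal sketch, assuming save_object/load_object are thin pickle wrappers (hypothetical implementation), could look like this:

import pickle

def save_object(obj, path):
    # Serialize an arbitrary Python object (here: the Dictionary) to disk.
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

def load_object(path):
    # Restore an object written by save_object; used by the test scripts below.
    with open(path, 'rb') as f:
        return pickle.load(f)
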
Example no. 2
    if outfile:
        target_names = ['entailment', 'neutral', 'contradiction']
        with open(outfile, 'w') as f:
            f.write('pairID,gold_label\n')
            for item in output:
                f.write(str(item[0]) + ',' + target_names[item[1]] + '\n')
    else:
        return 100. * n_correct / n_total, 100. * f1_score(numpy.asarray(y_true), numpy.asarray(y_preds),
                                                           average='weighted')
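
# A note on the two branches above: with an outfile, the predictions are written
# as a two-column CSV (a 'pairID,gold_label' header followed by one
# '<pairID>,<label>' row per example, the integer prediction in item[1] being
# mapped back to a label string via target_names); without an outfile, the
# function instead returns overall accuracy and weighted F1, both as percentages.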


if __name__ == "__main__":
    dictionary = helper.load_object(args.save_path + 'dictionary.p')
    embeddings_index = helper.load_word_embeddings(args.word_vectors_directory, args.word_vectors_file,
                                                   dictionary.word2idx)
    model = SentenceClassifier(dictionary, embeddings_index, args)
    if args.cuda:
        model = model.cuda()
    helper.load_model_states_from_checkpoint(model, args.save_path + 'model_best.pth.tar', 'state_dict', args.cuda)
    print('vocabulary size = ', len(dictionary))

    task_names = ['snli', 'multinli'] if args.task == 'allnli' else [args.task]
    for task in task_names:
        if task == 'multinli' and args.test != 'train':
            for partition in ['_matched', '_mismatched']:
                test_corpus = data.Corpus(dictionary)
                test_corpus.parse(args.data + task + '/', args.test + partition + '.txt', args.tokenize,
                                  is_test_corpus=True)
                print('[' + partition[1:] + '] dataset size = ', len(test_corpus.data))
                test_batches = helper.batchify(test_corpus.data, args.batch_size)
                if args.test == 'test':
Example no. 3
ancestor4goAll3 = pickle.load(open('GOANCESTORS_full3ont.pickle', "rb"))

if __name__ == "__main__":

    filefullpath = args.scoreOutput + args.nameExpression + str(
        args.pairStartIndex) + "." + str(args.pairEndIndex) + ".txt"

    print("loading dictionary/embedding")
    dictionary = helper.load_object(args.save_path + 'gene_dictionary.p')
    embeddings_index = helper.load_word_embeddings(args.word_vectors_directory,
                                                   args.word_vectors_file,
                                                   dictionary.word2idx)
    print("loading model")
    # print (args)
    model = SentenceClassifier(dictionary,
                               embeddings_index,
                               args,
                               select_method='max')
    if args.cuda:
        model = model.cuda()
    helper.load_model_states_from_checkpoint(
        model, args.save_path + 'model_best.pth.tar', 'state_dict', args.cuda)
    print('vocabulary size = ', len(dictionary))

    annotationBP = pickle.load(
        open(args.goAnnotationFile + "goBP.cPickle", "rb"))
    annotationCC = pickle.load(
        open(args.goAnnotationFile + "goCC.cPickle", "rb"))
    annotationMF = pickle.load(
        open(args.goAnnotationFile + "goMF.cPickle", "rb"))
    annotationAll3 = pickle.load(
        open(args.goAnnotationFile + "go3ontology.cPickle", "rb"))
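
Both test scripts above restore the trained weights through helper.load_model_states_from_checkpoint, which is not included in the listing. A minimal sketch, assuming the helper simply wraps torch.load and load_state_dict and remaps tensors to the CPU when CUDA is not requested (the parameter names here are guesses), might be:

import os
import torch

def load_model_states_from_checkpoint(model, filename, tag, from_gpu=True):
    # Read the checkpoint dict and copy the weights stored under `tag`
    # (e.g. 'state_dict') into the given model.
    assert os.path.exists(filename), 'checkpoint not found: ' + filename
    if from_gpu:
        checkpoint = torch.load(filename)
    else:
        # Remap CUDA tensors to CPU storage when running without a GPU.
        checkpoint = torch.load(filename,
                                map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint[tag])
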
Example no. 4
print('train set size = ', len(train_corpus.data))
print('development set size = ', len(dev_corpus.data))
print('test set size = ', len(test_corpus.data))
print('vocabulary size = ', len(dictionary))

# save the dictionary object to use during testing
helper.save_object(dictionary, args.save_path + 'dictionary.p')

embeddings_index = helper.load_word_embeddings(args.word_vectors_directory, args.word_vectors_file, dictionary.word2idx)
print('number of OOV words = ', len(dictionary) - len(embeddings_index))

###############################################################################
# Build the model
###############################################################################

model = SentenceClassifier(dictionary, embeddings_index, args)
optim_fn, optim_params = helper.get_optimizer(args.optimizer)
optimizer = optim_fn(filter(lambda p: p.requires_grad, model.parameters()), **optim_params)
best_acc = 0

# For training on multiple GPUs, use CUDA_VISIBLE_DEVICES=0,1 to specify which GPUs to use.
if 'CUDA_VISIBLE_DEVICES' in os.environ:
    cuda_visible_devices = [int(x) for x in os.environ['CUDA_VISIBLE_DEVICES'].split(',')]
    if len(cuda_visible_devices) > 1:
        model = torch.nn.DataParallel(model, device_ids=cuda_visible_devices)
if args.cuda:
    model = model.cuda()
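
# Example invocation for the multi-GPU branch above, assuming this script is
# saved as train.py (the real entry-point name is not shown in this listing):
#
#   CUDA_VISIBLE_DEVICES=0,1 python train.py --cuda ...
#
# With more than one visible device the model is wrapped in
# torch.nn.DataParallel, which replicates it and splits each batch across the
# listed GPUs.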

if args.resume:
    if os.path.isfile(args.resume):
        print("=> loading checkpoint '{}'".format(args.resume))