###############################################################################
# Load data
###############################################################################

# load the train, dev and test datasets
train_corpus = data.Corpus(args.tokenize)
dev_corpus = data.Corpus(args.tokenize)
test_corpus = data.Corpus(args.tokenize)

task_names = ['snli', 'multinli'] if args.task == 'allnli' else [args.task]
for task in task_names:
    if 'IMDB' in task:
        ###############################################################################
        # Load Learning to Skim paper's Pickle file
        ###############################################################################
        train_d, dev_d, test_d = helper.get_splited_imdb_data(
            args.output_base_path + 'IMDB/' + 'imdb.p', SAG=args.SAG)
        train_corpus.parse(train_d, task, args.max_example)
        dev_corpus.parse(dev_d, task, args.max_example)
        test_corpus.parse(test_d, task, args.max_example)
    else:
        train_corpus.parse(args.data + task + '/train.txt', task,
                           args.max_example)
        if task == 'multinli':
            dev_corpus.parse(args.data + task + '/dev_matched.txt', task,
                             args.tokenize)
            test_corpus.parse(args.data + task + '/test_matched.txt', task,
                              args.tokenize)
        else:
            dev_corpus.parse(args.data + task + '/dev.txt', task,
                             args.tokenize)
            test_corpus.parse(args.data + task + '/test.txt', task,
                              args.tokenize)
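
# Illustrative sketch (an assumption, not code from the repository): once
# parsing is done, the corpora are typically chunked into mini-batches with
# helper.batchify, as the evaluation code below does; args.batch_size and
# args.eval_batch_size are assumed to be defined as in that code.
train_batches = helper.batchify(train_corpus.data, args.batch_size)
dev_batches = helper.batchify(dev_corpus.data, args.eval_batch_size)
print('train set size = ', len(train_corpus.data))
print('dev set size = ', len(dev_corpus.data))
print('test set size = ', len(test_corpus.data))

###############################################################################
# Load trained model and evaluate on the test set
###############################################################################
# model and model_path are expected to be defined before this point.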
    if args.cuda:
        torch.cuda.set_device(args.gpu)
        model = model.cuda()
    print('loading model')
    helper.load_model(model, model_path, 'state_dict', args.cuda)

    print('vocabulary size = ', len(dictionary))

    task_names = ['snli', 'multinli'] if args.task == 'allnli' else [args.task]
    for task in task_names:
        test_corpus = data.Corpus(args.tokenize)
        if 'IMDB' in args.task:
            ###############################################################################
            # Load Learning to Skim paper's Pickle file
            ###############################################################################
            train_d, dev_d, test_d = helper.get_splited_imdb_data(
                args.output_base_path + task + '/' + 'imdb.p')
            test_corpus.parse(test_d, task, args.max_example)

            # Although the task is IMDB, the selected text saved by the Theano
            # budget model is in 'RT' format:
            # test_corpus.parse(args.output_base_path + task + '/' + args.test + '.txt', 'RT', args.max_example)

        elif task == 'multinli' and args.test != 'train':
            for partition in ['_matched', '_mismatched']:
                test_corpus.parse(args.data + task + '/' + args.test + partition + '.txt', task, args.max_example)
                print('[' + partition[1:] + '] dataset size = ', len(test_corpus.data))
                test_batches = helper.batchify(test_corpus.data, args.eval_batch_size)
                if args.test == 'test':
                    evaluate(model, test_batches, dictionary, args.save_path + args.task + partition + '.csv')
                else:
                    test_accuracy, test_f1 = evaluate(model, test_batches, dictionary)
                    print('[' + partition[1:] + '] accuracy: %.2f%%' % test_accuracy)
                    print('[' + partition[1:] + '] f1: %.2f%%' % test_f1)
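
# Assumed behaviour of evaluate(), inferred from the call sites above and
# below (an inference, not the repository's documented interface):
#   evaluate(model, batches, dictionary)           -> returns (accuracy, f1)
#   evaluate(model, batches, dictionary, out_path) -> writes predictions to a CSV

###############################################################################
# Build the BCN model, load the best checkpoint and evaluate on the test set
###############################################################################
# dictionary, embeddings_index and class_distributions are expected to be
# defined before this point.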
    model = BCN(dictionary, embeddings_index, class_distributions, args)
    if args.cuda and torch.cuda.is_available():
        torch.cuda.set_device(args.gpu)
        model = model.cuda()
    helper.load_model_states_from_checkpoint(
        model, args.save_path + 'model_best.pth.tar', 'state_dict', args.cuda)
    print('vocabulary size = ', len(dictionary))

    task_names = ['snli', 'multinli'] if args.task == 'allnli' else [args.task]
    for task in task_names:
        test_corpus = data.Corpus(args.tokenize)
        if 'imdb' in args.task:
            ###############################################################################
            # Load Learning to Skim paper's Pickle file
            ###############################################################################
            train_d, dev_d, test_d = helper.get_splited_imdb_data(
                args.save_path + 'data/' + 'imdb.p')
            test_corpus.parse(test_d, task, args.max_example)

        elif task == 'multinli' and args.test != 'train':
            for partition in ['_matched', '_mismatched']:
                test_corpus.parse(
                    args.data + task + '/' + args.test + partition + '.txt',
                    task, args.max_example)
                print('[' + partition[1:] + '] dataset size = ',
                      len(test_corpus.data))
                test_batches = helper.batchify(test_corpus.data,
                                               args.batch_size)
                if args.test == 'test':
                    evaluate(model, test_batches, dictionary,
                             args.save_path + args.task + partition + '.csv')
                else:
                    test_accuracy, test_f1 = evaluate(
                        model, test_batches, dictionary)
                    print('[' + partition[1:] + '] accuracy: %.2f%%' %
                          test_accuracy)
                    print('[' + partition[1:] + '] f1: %.2f%%' % test_f1)

###############################################################################
# Load data
###############################################################################

# load the train, dev and test datasets
train_corpus = data.Corpus(args.tokenize)
train_corpus_temp = data.Corpus(args.tokenize)
dev_corpus = data.Corpus(args.tokenize)
test_corpus = data.Corpus(args.tokenize)
ori_train_size = -1  # size of the original training split, set after parsing below

task_names = ['snli', 'multinli'] if args.task == 'allnli' else [args.task]
for task in task_names:
    if 'IMDB' in task:
        ###############################################################################
        # Load Learning to Skim paper's Pickle file
        ###############################################################################
        train_d, dev_d, test_d = helper.get_splited_imdb_data(
            args.output_base_path + task + '/' + 'imdb.p', SAG=args.SAG)
        train_corpus_temp.parse(train_d, task, args.max_example)
        dev_corpus.parse(dev_d, task, args.max_example)
        test_corpus.parse(test_d, task, args.max_example)
        ori_train_size = len(train_corpus_temp.data)

    else:
        train_corpus_temp.parse(args.output_base_path + task + '/train.txt',
                                task, args.max_example)
        ori_train_size = len(train_corpus_temp.data)
        if task == 'multinli':
            dev_corpus.parse(args.output_base_path + task + '/dev_matched.txt',
                             task, args.tokenize)
            test_corpus.parse(
                args.output_base_path + task + '/test_matched.txt', task,
                args.tokenize)