Example #1
def main():
    args = get_args()
    folds = util.get_folds()

    data_file = os.path.join(args.data_language_dir, 'processed.pckl')

    _, _, type_testloader, _ = get_data_loaders_with_folds('types',
                                                           data_file,
                                                           folds,
                                                           batch_size=1)

    word_freqs, word_ranks = get_word_freqs_and_ranks(data_file, folds)

    two_stage_state_folder = os.path.join(args.checkpoint_language_dir,
                                          two_stage_model_name(args))
    generator = load_generator(two_stage_state_folder)
    generator.eval()
    adaptor = Adaptor.load(two_stage_state_folder)

    token_model = get_model('tokens', args)
    type_model = get_model('types', args)

    results = calculate_surprisals(type_testloader, type_model, token_model,
                                   generator, adaptor, word_freqs, word_ranks)

    results_file = os.path.join(args.results_folder, 'entropy_freq.csv')
    util.overwrite_csv(results_file, results)
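calculate_surprisals is defined elsewhere in the project; for orientation, here is a minimal sketch of the per-word surprisal computation such a function presumably performs. The get_token_logprob interface and the nats-to-bits conversion are assumptions, not the project's actual code.

import math

def word_surprisal_sketch(word_logprob):
    # Surprisal is the negative log-probability of a word; dividing by
    # log(2) converts from nats to bits, the usual reporting unit.
    return -word_logprob / math.log(2)

# Hypothetical usage against the two-stage model loaded above:
#   logprob = adaptor.get_token_logprob(word)   # assumed interface
#   surprisal = word_surprisal_sketch(logprob)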
Example #2
def main():
    args = get_args()
    folds = util.get_folds()

    token_trainloader, token_devloader, _, token_alphabet = \
        get_data_loaders_with_folds('tokens', args.data_file, folds,
                                    args.batch_size, max_train_tokens=args.max_train_tokens)

    print('Train size: %d Dev size: %d' %
          (len(token_trainloader.dataset), len(token_devloader.dataset)))

    generator, adaptor, two_stage_dev_loss, training_time = \
        initiate_two_stage_training(token_trainloader, token_devloader, token_alphabet, args)

    print('Getting generator training loss')
    generator_train_loss = evaluate_generator(token_trainloader, generator)
    print('Getting generator dev loss')
    generator_dev_loss = evaluate_generator(token_devloader, generator)

    print('Generator training loss: %.4f Dev loss: %.4f' %
          (generator_train_loss, generator_dev_loss))

    two_stage_train_loss = evaluate_adaptor(token_trainloader, generator,
                                            adaptor)

    print('Two-stage model training loss: %.4f Dev loss: %.4f' %
          (two_stage_train_loss, two_stage_dev_loss))

    save_two_stage_training_results(generator, args, two_stage_train_loss, two_stage_dev_loss,
                                    generator_dev_loss, training_time,
                                    len(token_trainloader.dataset), len(token_devloader.dataset))
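evaluate_generator and evaluate_adaptor are not shown; a minimal sketch of the averaged loss they presumably compute, assuming a PyTorch-style dataloader yielding (x, y) batches and a model exposing a get_loss method (both assumptions):

import torch

def evaluate_sketch(dataloader, model):
    # Accumulate the mean batch loss with gradients disabled; the real
    # evaluate_generator may weight by tokens rather than batches.
    model.eval()
    total, n_batches = 0.0, 0
    with torch.no_grad():
        for x, y in dataloader:
            total += model.get_loss(x, y).item()  # assumed interface
            n_batches += 1
    return total / max(n_batches, 1)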
Example #3
def main():
    args = get_args()
    folds = util.get_folds()

    data = load_data(args.data_file)
    _, sentence_data, alphabet, _ = data
    dev_sentences = sentence_data[folds[1][0]]
    test_sentences = sentence_data[folds[2][0]]
    _, dev_loader, test_loader, _ = get_data_loaders_with_folds('tokens', args.data_file, folds,
                                                                args.batch_size)

    generator = load_generator(args.two_stage_state_folder)
    adaptor = Adaptor.load(args.two_stage_state_folder)

    run_experiments(dev_sentences,
                    generator,
                    adaptor,
                    alphabet,
                    dev_loader,
                    args,
                    test=False)
    run_experiments(test_sentences,
                    generator,
                    adaptor,
                    alphabet,
                    test_loader,
                    args,
                    test=True)
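util.get_folds is never shown, but its use here (folds[1][0] selects the dev sentences, folds[2][0] the test sentences) suggests it returns a (train_folds, dev_folds, test_folds) triple of fold indices. A hypothetical reconstruction under that assumption; the real fold count and split may differ:

def get_folds_sketch(n_folds=10):
    # Reserve the last two folds for dev and test; everything else
    # trains. Only the returned shape is implied by the calling code.
    folds = list(range(n_folds))
    return [folds[:-2], [folds[-2]], [folds[-1]]]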
Example #4
def get_word_freqs_and_ranks(data_file, folds):
    _, _, token_testloader, _ = get_data_loaders_with_folds('tokens',
                                                            data_file,
                                                            folds,
                                                            batch_size=1)
    word_freqs = calculate_word_freqs(token_testloader.dataset)
    word_ranks = get_word_ranks(token_testloader.dataset)
    return word_freqs, word_ranks
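calculate_word_freqs and get_word_ranks are straightforward to sketch with collections.Counter, assuming the dataset iterates over word tokens (the real helpers may iterate differently or break frequency ties another way):

from collections import Counter

def word_freqs_and_ranks_sketch(tokens):
    # Count occurrences, then assign rank 1 to the most frequent word,
    # rank 2 to the next, and so on.
    freqs = Counter(tokens)
    ranks = {word: rank
             for rank, (word, _) in enumerate(freqs.most_common(), start=1)}
    return dict(freqs), ranks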
Example #5
def main():
    # pylint: disable=all
    args = get_args()
    folds = util.get_folds()

    trainloader, devloader, _, alphabet = \
        get_data_loaders_with_folds('tokens', args.data_file, folds, args.batch_size,
                                    max_train_tokens=args.max_train_tokens)
    print('Train size: %d Dev size: %d' %
          (len(trainloader.dataset), len(devloader.dataset)))

    beta_limit = len(trainloader.dataset) * 2
    if args.beta_limit is not None:
        beta_limit = args.beta_limit

    print('Tuning alpha and beta')
    tuning_results = tune_alpha_and_beta(trainloader, devloader, alphabet, args,
                                         args.no_iterations, beta_limit)
    print('Writing tuning results to', args.results_file)
    util.write_csv(args.results_file, tuning_results)
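tune_alpha_and_beta is defined elsewhere; alpha and beta are presumably the adaptor's hyperparameters. A minimal grid-search sketch of the shape such tuning could take, where train_fn and eval_fn are placeholder callables and the real routine may search adaptively rather than exhaustively:

def tune_sketch(train_fn, eval_fn, alphas, betas):
    # Train one two-stage model per (alpha, beta) pair and record its
    # dev loss so the caller can write all rows to a CSV.
    rows = []
    for alpha in alphas:
        for beta in betas:
            model = train_fn(alpha, beta)
            rows.append({'alpha': alpha, 'beta': beta,
                         'dev_loss': eval_fn(model)})
    return rows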
Example #6
def main():
    args = get_args()
    folds = util.get_folds()
    datasets = ['types', 'tokens']

    model_paths = util.get_dirs(args.eval_path)

    dataloaders = {
        dataset: get_data_loaders_with_folds(dataset, args.data_file, folds,
                                             args.batch_size)
        for dataset in datasets
    }
    for dataset, dataloader in dataloaders.items():
        trainloader, devloader, testloader, _ = dataloader
        print('Dataset: %s Train size: %d Dev size: %d Test size: %d' %
              (dataset, len(trainloader.dataset), len(devloader.dataset),
               len(testloader.dataset)))

    results = eval_all(model_paths, dataloaders)
    util.overwrite_csv(args.results_file, results)
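eval_all is not shown; given that it receives checkpoint directories and the per-dataset loaders, a plausible minimal sketch follows (load_fn and eval_fn are placeholder callables, not the project's API):

def eval_all_sketch(model_paths, dataloaders, load_fn, eval_fn):
    # Score every checkpoint on every dataset's test split and collect
    # one result row per (model, dataset) pair.
    rows = []
    for path in model_paths:
        model = load_fn(path)
        for dataset, (_, _, testloader, _) in dataloaders.items():
            rows.append({'model': path, 'dataset': dataset,
                         'test_loss': eval_fn(testloader, model)})
    return rows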
Example #7
def main():
    args = get_args()
    folds = util.get_folds()

    trainloader, devloader, _, alphabet = get_data_loaders_with_folds(
        args.dataset, args.data_file, folds,
        args.batch_size, max_train_tokens=args.max_train_tokens)

    print('Train size: %d Dev size: %d' %
          (len(trainloader.dataset), len(devloader.dataset)))

    model = get_model(alphabet, args)
    train(trainloader, devloader, model, args.eval_batches,
          args.wait_iterations, args.dataset)

    train_loss = evaluate(trainloader, model)
    dev_loss = evaluate(devloader, model)

    print('Final Training loss: %.4f Dev loss: %.4f' %
          (train_loss, dev_loss))

    save_checkpoints(model, train_loss, dev_loss, len(trainloader.dataset),
                     len(devloader.dataset), args.generator_path)
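The train call above passes eval_batches and wait_iterations, which suggests evaluation-based early stopping. A minimal sketch of such a loop, reusing evaluate from this example; the optimizer argument and the get_loss interface are assumptions:

def train_sketch(trainloader, devloader, model, optimizer,
                 eval_batches, wait_iterations):
    # Evaluate on dev every eval_batches batches; stop once dev loss
    # has failed to improve for wait_iterations consecutive checks.
    best_dev, patience, batches = float('inf'), 0, 0
    while patience < wait_iterations:
        for x, y in trainloader:
            optimizer.zero_grad()
            loss = model.get_loss(x, y)  # assumed interface
            loss.backward()
            optimizer.step()
            batches += 1
            if batches % eval_batches == 0:
                dev_loss = evaluate(devloader, model)
                if dev_loss < best_dev:
                    best_dev, patience = dev_loss, 0
                else:
                    patience += 1
                if patience >= wait_iterations:
                    return best_dev
    return best_dev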