def plot_laplace(args, vocab, train_toks, val_toks):
    alpha_ = [1e-5 * 10 ** i for i in range(7)]  # 1e-05 through 10, log-spaced
    train_per = []
    val_per = []

    for i in range(len(alpha_)):
        # The language model assumes a thresholded vocab.
        print("--------- For alpha =", alpha_[i], "----------")
        lm = util.BigramLanguageModel(vocab, args.unk, args.smoothing,
                                      alpha=alpha_[i], beta=args.beta)

        # Estimate parameters.
        lm.train(train_toks)

        train_ppl = lm.test(train_toks)
        val_ppl = lm.test(val_toks)
        print('Train perplexity: %f\nVal perplexity: %f' % (train_ppl, val_ppl))
        train_per.append(train_ppl)
        val_per.append(val_ppl)

    plt.close('all')
    plt.figure(figsize=(20, 5))
    plt.plot(np.arange(len(alpha_)), train_per, '-ro', label='Train Perplexity', linewidth=1)
    plt.plot(np.arange(len(alpha_)), val_per, '-bo', label='Val Perplexity', linewidth=1)
    plt.xticks(np.arange(len(alpha_)), labels=alpha_)
    plt.legend(loc="upper left")
    plt.xlabel('alpha')
    plt.ylabel('Perplexity')
    # Save before show(); calling show() first can leave an empty figure on disk.
    plt.savefig('Laplace_train_val_perp.pdf', bbox_inches='tight')
    plt.show()
    print('-' * 79)
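
# A minimal sketch (an assumption, not util's implementation) of the add-alpha
# (Laplace) bigram estimate that BigramLanguageModel is presumed to compute when
# Laplace smoothing is selected:
#     P(w | w_prev) = (count(w_prev, w) + alpha) / (count(w_prev) + alpha * |V|)
# 'bigram_counts', 'unigram_counts', and 'vocab_size' are hypothetical names.
def add_alpha_prob(bigram_counts, unigram_counts, w_prev, w, alpha, vocab_size):
    numerator = bigram_counts.get((w_prev, w), 0) + alpha
    denominator = unigram_counts.get(w_prev, 0) + alpha * vocab_size
    return numerator / denominator
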
def plot_interpolation(args, vocab, train_toks, val_toks):
    beta_ = np.arange(0.1, 1.0, 0.1)  # 0.1 through 0.9
    train_per = []
    val_per = []

    for i in range(len(beta_)):
        # The language model assumes a thresholded vocab.
        print("--------- beta =", beta_[i], "----------")
        lm = util.BigramLanguageModel(vocab, args.unk, args.smoothing,
                                      alpha=args.alpha, beta=beta_[i])

        # Estimate parameters.
        lm.train(train_toks)

        train_ppl = lm.test(train_toks)
        val_ppl = lm.test(val_toks)
        print('Train perplexity: %f\nVal perplexity: %f' % (train_ppl, val_ppl))
        train_per.append(train_ppl)
        val_per.append(val_ppl)

    plt.close('all')
    plt.figure(figsize=(20, 5))
    plt.plot(beta_, train_per, '-ro', label='Train Perplexity', linewidth=1)
    plt.plot(beta_, val_per, '-bo', label='Val Perplexity', linewidth=1)
    plt.legend(loc="upper right")
    plt.xlabel('beta')
    plt.ylabel('Perplexity')
    # Save before show(); calling show() first can leave an empty figure on disk.
    plt.savefig('interpolation.pdf', bbox_inches='tight')
    plt.show()
    print('-' * 79)
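
# Similarly, a hedged sketch of the linearly interpolated estimate that beta is
# presumed to weight under the 'interpolation' smoothing mode:
#     P(w | w_prev) = beta * P_MLE(w | w_prev) + (1 - beta) * P_MLE(w)
# The argument names are hypothetical; only the formula is being illustrated.
def interpolated_prob(bigram_counts, unigram_counts, total_tokens, w_prev, w, beta):
    p_bigram = (bigram_counts.get((w_prev, w), 0) / unigram_counts[w_prev]
                if unigram_counts.get(w_prev, 0) > 0 else 0.0)
    p_unigram = unigram_counts.get(w, 0) / total_tokens
    return beta * p_bigram + (1 - beta) * p_unigram
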
# Example 3
def main(args):
    tokenizer = util.Tokenizer(tokenize_type=args.tok, lowercase=True)

    # TODO: you have to pass this test.
    util.test_ngram_counts(tokenizer)

    with open(args.train_file) as f:
        train_toks = tokenizer.tokenize(f.read())
    num_train_toks = int(args.train_fraction * len(train_toks))
    print('-' * 79)
    print('Using %d tokens for training (%g%% of %d)' %
          (num_train_toks, 100 * args.train_fraction, len(train_toks)))
    train_toks = train_toks[:num_train_toks]
    with open(args.val_file) as f:
        val_toks = tokenizer.tokenize(f.read())

    train_ngram_counts = tokenizer.count_ngrams(train_toks)

    # Explore n-grams in the training corpus before preprocessing.
    util.show_ngram_information(train_ngram_counts, args.k, args.figure_file,
                                args.quiet)

    # Get vocab and threshold.
    print('Using vocab size %d (excluding UNK) (original %d)' % (min(
        args.vocab, len(train_ngram_counts[0])), len(train_ngram_counts[0])))
    vocab = [
        tup[0] for tup, _ in train_ngram_counts[0].most_common(args.vocab)
    ]
    train_toks = tokenizer.threshold(train_toks, vocab, args.unk)
    val_toks = tokenizer.threshold(val_toks, vocab, args.unk)

    # The language model assumes a thresholded vocab.
    lm = util.BigramLanguageModel(vocab,
                                  args.unk,
                                  args.smoothing,
                                  alpha=args.alpha,
                                  beta=args.beta)

    # Estimate parameters.
    lm.train(train_toks)

    train_ppl = lm.test(train_toks)
    val_ppl = lm.test(val_toks)
    print('Train perplexity: %f\nVal Perplexity: %f' % (train_ppl, val_ppl))
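
# main() reads many attributes off 'args'. A hedged sketch of the argparse setup
# it appears to assume: the flag names mirror the attribute accesses above, but
# the defaults shown here are illustrative, not taken from the original repo.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_file', type=str, required=True)
    parser.add_argument('--val_file', type=str, required=True)
    parser.add_argument('--tok', type=str, default='basic')
    parser.add_argument('--train_fraction', type=float, default=1.0)
    parser.add_argument('--vocab', type=int, default=10000)
    parser.add_argument('--unk', type=str, default='<unk>')
    parser.add_argument('--smoothing', type=str, default='laplace')
    parser.add_argument('--alpha', type=float, default=0.001)
    parser.add_argument('--beta', type=float, default=0.5)
    parser.add_argument('--k', type=int, default=10)
    parser.add_argument('--figure_file', type=str, default='ngrams.pdf')
    parser.add_argument('--quiet', action='store_true')
    main(parser.parse_args())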
# Example 4
def main(args):
    tokenizer = util.Tokenizer(tokenize_type=args.tok, lowercase=True)

    # TODO: you have to pass this test.
    util.test_ngram_counts(tokenizer)

    with open(args.train_file) as f:
        train_toks = tokenizer.tokenize(f.read())

    # Best-so-far trackers for the commented-out hyperparameter searches below.
    minValidation = float('inf')
    minA = float('inf')
    minB = float('inf')
    minTF = float('inf')

    ###Problem 6###
    # for frac in range(1,11):
    #     args.train_fraction = float(frac/10)

    num_train_toks = int(args.train_fraction * len(train_toks))
    print('-' * 79)
    print('Using %d tokens for training (%g%% of %d)' %
          (num_train_toks, 100 * args.train_fraction, len(train_toks)))
    train_toks = train_toks[:num_train_toks]
    with open(args.val_file) as f:
        val_toks = tokenizer.tokenize(f.read())

    train_ngram_counts = tokenizer.count_ngrams(train_toks)

    # Explore n-grams in the training corpus before preprocessing.
    util.show_ngram_information(train_ngram_counts, args.k, args.figure_file,
                                args.quiet)

    # Get vocab and threshold.
    print('Using vocab size %d (excluding UNK) (original %d)' % (min(
        args.vocab, len(train_ngram_counts[0])), len(train_ngram_counts[0])))
    vocab = [
        tup[0] for tup, _ in train_ngram_counts[0].most_common(args.vocab)
    ]
    train_toks = tokenizer.threshold(train_toks, vocab, args.unk)
    val_toks = tokenizer.threshold(val_toks, vocab, args.unk)

    ###Problem 5###
    # for alpha in range(-5,3):
    #     args.alpha = 10**alpha
    ###Problem 7###
    # for beta in range(1,10):
    #     args.beta = 0.1*beta

    # The language model assumes a thresholded vocab.
    lm = util.BigramLanguageModel(vocab,
                                  args.unk,
                                  args.smoothing,
                                  alpha=args.alpha,
                                  beta=args.beta)
    # Estimate parameters.
    lm.train(train_toks)

    train_ppl = lm.test(train_toks)
    val_ppl = lm.test(val_toks)
    # if val_ppl < minValidation:
    #     minValidation = val_ppl
    #     minA = args.alpha
    #     minB = args.beta
    #     minTF = args.train_fraction
    print('Train perplexity: %f\nVal Perplexity: %f' % (train_ppl, val_ppl))
    # print("\tAlpha: " + str(minA))
    # print("\tBeta: " + str(minB))
    # print("\tTrain Fraction: " + str(minTF))
    f = open("TrainFraction2.csv", "a")
    f.write(str(args.beta) + "," + str(train_ppl) + "," + str(val_ppl) + "\n")
    f.close()
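
# For reference, a minimal sketch of the perplexity that lm.test() is assumed to
# report: the exponentiated average negative log-likelihood of the bigram model
# over the token stream. 'prob_fn(w_prev, w)' is a hypothetical stand-in for
# whichever smoothed estimate the model actually uses.
import math

def perplexity(tokens, prob_fn):
    log_prob = 0.0
    for w_prev, w in zip(tokens, tokens[1:]):
        log_prob += math.log(prob_fn(w_prev, w))
    return math.exp(-log_prob / (len(tokens) - 1))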