def test_d2_1_gp():
    """Check perplexity of two order-3 Laplace models, in and out of domain.

    Each tokenized corpus is cut at 80%: the leading part is fed through
    ``padded_everygram_pipeline`` to fit a ``Laplace`` model, and the
    remaining sentences are wrapped in ``<s>`` / ``</s>`` markers and
    flattened into one token stream. ``evaluate.get_perplexity`` is then
    called with each model on the first 2500 tokens of each stream, and the
    integer part of the result is compared with a fixed reference value.

    NOTE(review): ``padded_everygram_pipeline`` and ``Laplace`` are presumably
    the ``nltk.lm`` implementations -- confirm against the file's imports.
    """
    order = 3
    sample_size = 2500

    # Module-level corpora are only read here, so no ``global`` is required.
    food_corpus_tk = train.tokenize_corpus(food_corpus)
    natr_corpus_tk = train.tokenize_corpus(natr_corpus)

    # Compute each 80% cut once and reuse it for both sides of the split.
    food_cut = int(0.8 * len(food_corpus_tk))
    natr_cut = int(0.8 * len(natr_corpus_tk))

    food_train, food_vocab = padded_everygram_pipeline(
        order, food_corpus_tk[:food_cut])
    natr_train, natr_vocab = padded_everygram_pipeline(
        order, natr_corpus_tk[:natr_cut])

    # Flatten the held-out sentences in linear time. The previous
    # ``sum(list_of_lists, [])`` re-copied the accumulated list on every
    # addition, which is quadratic in the number of held-out tokens.
    food_test = [
        token
        for sentence in food_corpus_tk[food_cut:]
        for token in ['<s>'] + sentence + ['</s>']
    ]
    natr_test = [
        token
        for sentence in natr_corpus_tk[natr_cut:]
        for token in ['<s>'] + sentence + ['</s>']
    ]

    food_lm = Laplace(order)
    natr_lm = Laplace(order)
    food_lm.fit(food_train, food_vocab)
    natr_lm.fit(natr_train, natr_vocab)

    eq_(int(evaluate.get_perplexity(food_lm, food_test[:sample_size])), 7318)
    eq_(int(evaluate.get_perplexity(food_lm, natr_test[:sample_size])), 7309)
    eq_(int(evaluate.get_perplexity(natr_lm, natr_test[:sample_size])), 5222)
    eq_(int(evaluate.get_perplexity(natr_lm, food_test[:sample_size])), 5354)
def test_d1_1_tk():
    """Spot-check ``train.tokenize_corpus`` on both corpora.

    For each tokenized corpus, the item at index 5 of entry 25 is compared
    with a fixed reference token.
    """
    # Read-only access to the module-level corpora; tokenize food first,
    # then natr, as separate calls.
    food_tokens, natr_tokens = (
        train.tokenize_corpus(corpus) for corpus in (food_corpus, natr_corpus)
    )

    eq_(food_tokens[25][5], 'Monday')
    eq_(natr_tokens[25][5], 'are')
def test_d1_2_pad():
    """Check the output of ``train.pad_corpus`` on both tokenized corpora.

    Verifies that entry 35 of the padded food corpus starts with ``<s>``,
    that entry 35 of the padded natr corpus ends with ``</s>``, the padded
    lengths of entry 45 in each corpus, and that padding made food entry 45
    exactly two items longer than its unpadded counterpart.
    """
    tokenized = [
        train.tokenize_corpus(corpus) for corpus in (food_corpus, natr_corpus)
    ]
    food_tokens, natr_tokens = tokenized
    food_padded, natr_padded = [train.pad_corpus(tokens) for tokens in tokenized]

    # Boundary markers on entry 35.
    eq_(food_padded[35][0], '<s>')
    eq_(natr_padded[35][-1], '</s>')

    # Padded lengths of entry 45.
    eq_(len(food_padded[45]), 14)
    eq_(len(natr_padded[45]), 19)

    # Padding adds exactly two items to the entry.
    padded_len = len(food_padded[45])
    raw_len = len(food_tokens[45])
    eq_(padded_len - raw_len, 2)
def test_d1_5_es():
    """Check ``train.estimate`` against n-gram counts from the training split.

    Both corpora go through tokenize -> pad -> split -> ``count_ngrams(..., 3)``;
    ``train.estimate`` is then called on the resulting counts and compared
    with fixed reference values.

    NOTE(review): the second and third arguments to ``train.estimate`` look
    like a target word and its context -- confirm against ``train.estimate``.
    """
    corpora = (food_corpus, natr_corpus)

    # Run each stage for food first, then natr, as in the rest of the suite.
    tokenized = [train.tokenize_corpus(corpus) for corpus in corpora]
    padded = [train.pad_corpus(tokens) for tokens in tokenized]
    splits = [train.split_corpus(sentences) for sentences in padded]

    # Only the training part of each split is used here.
    food_tr, _food_te = splits[0]
    natr_tr, _natr_te = splits[1]

    food_counts, _food_vocab = train.count_ngrams(food_tr, 3)
    natr_counts, _natr_vocab = train.count_ngrams(natr_tr, 3)

    eq_(train.estimate(food_counts, ['palm'], ['producer', 'of']), 0.25)
    eq_(train.estimate(natr_counts, ['basis'], ['tested', 'the']), 0.5)
def test_d1_3_spc():
    """Check the sizes and contents produced by ``train.split_corpus``.

    Both corpora are tokenized and padded, then split into two parts. The
    test compares the length of every part with a fixed reference count and
    spot-checks one item in the second part of each split.
    """
    food_padded = None
    natr_padded = None

    # Tokenize both corpora before padding either of them.
    food_tokens = train.tokenize_corpus(food_corpus)
    natr_tokens = train.tokenize_corpus(natr_corpus)
    food_padded = train.pad_corpus(food_tokens)
    natr_padded = train.pad_corpus(natr_tokens)

    food_tr, food_te = train.split_corpus(food_padded)
    natr_tr, natr_te = train.split_corpus(natr_padded)

    # Sizes of each part, checked in the order food-train, food-test,
    # natr-train, natr-test.
    size_checks = (
        (food_tr, 4888),
        (food_te, 1222),
        (natr_tr, 2610),
        (natr_te, 653),
    )
    for part, expected_size in size_checks:
        eq_(len(part), expected_size)

    # Spot-check individual items in the second part of each split.
    eq_(food_te[3][5], 'by')
    eq_(natr_te[1][2], 'Project')
def test_d1_4_cn():
    """Check ``train.count_ngrams`` on the training split of both corpora.

    Both corpora go through tokenize -> pad -> split, and the first part of
    each split is passed to ``train.count_ngrams(..., 3)``. The test compares
    the number of keys in each returned n-gram mapping, one specific count
    per corpus, the size of each returned vocabulary, and one entry of each
    vocabulary after sorting.
    """
    food_tokens, natr_tokens = (
        train.tokenize_corpus(corpus) for corpus in (food_corpus, natr_corpus)
    )
    food_padded, natr_padded = (
        train.pad_corpus(tokens) for tokens in (food_tokens, natr_tokens)
    )

    food_tr, _food_te = train.split_corpus(food_padded)
    natr_tr, _natr_te = train.split_corpus(natr_padded)

    food_counts, food_vocab = train.count_ngrams(food_tr, 3)
    natr_counts, natr_vocab = train.count_ngrams(natr_tr, 3)

    # Number of distinct keys in each n-gram mapping.
    eq_(len(food_counts.keys()), 181387)
    eq_(len(natr_counts.keys()), 105612)

    # Individual counts looked up by tuple key.
    eq_(food_counts[('sold', 'the')], 2)
    eq_(natr_counts[('extracting', 'the')], 2)

    # Vocabulary sizes.
    eq_(len(food_vocab), 12728)
    eq_(len(natr_vocab), 8972)

    # One entry of each vocabulary in sorted order.
    food_sorted = sorted(food_vocab)
    eq_(food_sorted[3200], 'ANALYSTS')
    natr_sorted = sorted(natr_vocab)
    eq_(natr_sorted[3210], 'NGX')
def test_d3_1_vary():
    """Check ``train.vary_ngram`` for n-gram orders 2 and 3 on the food corpus.

    The tokenized food corpus is sliced into a training part (first 80%) and
    a test part (from 80% up to 85%). ``train.vary_ngram`` is called with
    both parts and the list of orders; the integer part of the value stored
    under each order in the result is compared with a fixed reference.
    """
    food_tokens = train.tokenize_corpus(food_corpus)

    train_end = int(0.8 * len(food_tokens))
    test_end = int(0.85 * len(food_tokens))
    training_part = food_tokens[:train_end]
    testing_part = food_tokens[train_end:test_end]

    orders = [2, 3]
    results = train.vary_ngram(training_part, testing_part, orders)

    # Expected integer value per order, checked in ascending order.
    for order, expected in ((2, 7387), (3, 7428)):
        eq_(int(results[order]), expected)