def test_count_grams_bad_kwarg(self):
    """An unknown keyword given to ``count_ngrams`` must raise ``TypeError``.

    The message is expected to name ``ngrams()`` as the callee that
    rejected the keyword.
    """
    corpus = ("the cow jumped over the blue moon . "
              "blue river jumped over the rainbow .")
    vocabulary = build_vocabulary(2, corpus.split())
    sentences = ["blue moon".split()]
    expected_error_msg = "ngrams() got an unexpected keyword argument 'dummy_kwarg'"

    with self.assertRaises(TypeError) as caught:
        count_ngrams(2, vocabulary, sentences, dummy_kwarg="TEST")

    self.assertEqual(expected_error_msg, str(caught.exception))
def test_count_ngrams(self):
    """Check selected bigram counts for a single character-sequence text."""
    char_vocab = build_vocabulary(2, 'abcdead')
    bigram_table = count_ngrams(2, char_vocab, ['abcfdezgadbew']).ngrams[2]

    # (context, following symbol, expected count)
    expectations = (
        (("a", ), 'b', 0),
        (("a", ), 'd', 1),
        (("<s>", ), 'a', 1),
    )
    for context, symbol, expected_count in expectations:
        self.assertEqual(bigram_table[context][symbol], expected_count)
def test_count_ngrams_kwargs(self):
    """A custom ``left_pad_symbol`` keyword shows up as a bigram context."""
    corpus = ("the cow jumped over the blue moon . "
              "blue river jumped over the rainbow .")
    vocabulary = build_vocabulary(2, corpus.split())
    sentences = [
        "blue moon".split(),
        "over the rainbow".split(),
    ]

    ngram_counter = count_ngrams(2, vocabulary, sentences,
                                 left_pad_symbol="TEST")

    padded_context = ngram_counter.ngrams[2][("TEST", )]
    self.assertEqual(padded_context["blue"], 1)
def test_count_ngrams_multiple_texts(self):
    """Check bigram counts when two training texts are passed to ``count_ngrams``."""
    corpus = ("the cow jumped over the blue moon . "
              "blue river jumped over the rainbow .")
    vocabulary = build_vocabulary(2, corpus.split())
    first_text = ['zabcfdegadbew']
    second_text = ["blue moon".split(), "over the rainbow".split()]

    bigram_table = count_ngrams(2, vocabulary, first_text, second_text).ngrams[2]

    # (context, following word, expected count)
    expectations = (
        (("blue", ), 'river', 0),
        (("blue", ), '<UNK>', 1),
        (("over", ), 'the', 1),
    )
    for context, word, expected_count in expectations:
        self.assertEqual(bigram_table[context][word], expected_count)
def get_model_entropy(model, train_loader, eval_loader, vocab_size, params=None, order=1):
    """Fit a ``LaplaceUnigramModel`` on ``train_loader`` and score ``eval_loader``.

    Args:
        model: Name of the model to fit. Only ``'unigram'`` is supported.
        train_loader: Training texts. Unpacked into ``build_vocabulary`` and
            passed as a single text to ``count_ngrams``.
        eval_loader: Data handed to the fitted model's ``get_entropy``.
        vocab_size: Forwarded to ``LaplaceUnigramModel``.
        params: Optional weights. When given they are normalised by their
            sum; the normalised values are not consumed by the unigram model.
        order: Accepted for interface compatibility; not used (order is 1).

    Returns:
        The value returned by ``LaplaceUnigramModel.get_entropy(eval_loader)``.

    Raises:
        ValueError: If ``model`` is not ``'unigram'``.
    """
    if model != 'unigram':
        raise ValueError("Model not implemented: %s" % model)

    # Test against None explicitly: the truth value of a multi-element
    # numpy array is ambiguous and raises ValueError.
    if params is not None:
        params = params / np.sum(params, keepdims=True)

    vocab = build_vocabulary(1, *train_loader)
    # Padding is disabled on both sides for the order-1 counts.
    counter = count_ngrams(1, vocab, train_loader, pad_left=False, pad_right=False)
    unigram_model = LaplaceUnigramModel(vocab_size, counter)
    return unigram_model.get_entropy(eval_loader)
def _init_pos_lm(corpus_file):
    """Build an order-3 ``MLENgramModel`` from the lines of ``corpus_file``.

    The corpus is read twice through ``iter_file``: once as a flat token
    stream to build the vocabulary, and once sentence by sentence to count
    n-grams.
    """
    def iter_sentences(path):
        # Each line is one sentence; tokens are separated by a single space.
        for raw_line in iter_file(path):
            yield raw_line.split(' ')

    def iter_tokens(path):
        # Flatten the sentences into one stream of tokens.
        for sentence in iter_sentences(path):
            for token in sentence:
                yield token

    # Build the vocabulary.
    # Tokens whose frequency is below this cutoff are not treated as vocabulary.
    # The removal is logical only: the token's frequency count is still kept.
    min_count = 1
    vocabulary = build_vocabulary(min_count, iter_tokens(corpus_file))

    # Count the n-grams.
    ngram_order = 3
    counts = count_ngrams(ngram_order, vocabulary, iter_sentences(corpus_file))

    # Turn the n-gram counts into scores.
    return MLENgramModel(counts)
from nltk.corpus import gutenberg

# Fit a LaplaceNgramModel on bigram counts from the LB training file.
# NOTE(review): the vocabulary is built from lower-cased words while the
# bigram counts are taken from the original-case sentences -- confirm that
# this mismatch is intended.
sents = gutenberg.sents('/Users/purnendu/Desktop/Nat_Lang_HW3/LB-Train.txt')
words_train_LB = words = [w.lower() for s in sents for w in s]
vocab = build_vocabulary(3, words)
bigram_counts = count_ngrams(2, vocab, sents)
LB_model = LaplaceNgramModel(bigram_counts)

# Report the model's perplexity on the lower-cased words of the LB test file.
sents_test = gutenberg.sents(
    '/Users/purnendu/Desktop/Nat_Lang_HW3/LB-Test.txt')
words_test = [w.lower() for s in sents_test for w in s]
# Single-argument print: same output as the two-item print statement.
print("%s %s" % ("1.b) perplexity of LB on LB-Test) : ",
                 LB_model.perplexity(words_test)))

#perplexity of MB on MB-test
# --- Lincoln texts: raw strings ------------------------------------------------
Gettysburg = inputfile_Gettysburg.read()

inputfile_firstInaugral = open("firstInaugral.txt")
firstInaugral = inputfile_firstInaugral.read()

lincolnTotal = Gettysburg + firstInaugral

# Opened but not read here.
inputfile_secondInaugral = open("secondInaugral.txt")
#inputfile_secondInaugral.read()

# --- Lincoln texts: training / test word lists ---------------------------------
LB_Train_Corpus = sents_gettysburg + sents_firstInaugral
train_words_lb = [w for s in LB_Train_Corpus for w in s]
test_words_lb = [w for s in sents_secondInaugral for w in s]

# Remove rare words from the corpus
vocab = build_vocabulary(5, train_words_lb)
# NOTE(review): these map each word to the result of `word in vocab` rather
# than to a filtered word list -- confirm that this is what is wanted.
LB_Train = map(lambda token: token in vocab, train_words_lb)
LB_Test = map(lambda token: token in vocab, test_words_lb)

# LaplaceNgramModel over bigram counts from the Lincoln training sentences.
bigram_counts = count_ngrams(2, vocab, LB_Train_Corpus)
LB = LaplaceNgramModel(bigram_counts)

#***************************************************
# --- Mandela texts: sentences and raw strings ----------------------------------
sents_nm_freedom = gutenberg.sents("mandelaFreedom.txt")
sents_nm_prepared = gutenberg.sents("mandelaPrepared.txt")
sents_nm_anc = gutenberg.sents("mandelaANC.txt")

inputfile_mandelaFreedom = open("mandelaFreedom.txt")
mandelaFreedom = inputfile_mandelaFreedom.read()

inputfile_mandelaPrepared = open("mandelaPrepared.txt")
mandelaPrepared = inputfile_mandelaPrepared.read()

mandelaTotal = mandelaFreedom + mandelaPrepared

# Opened but not read here.
inputfile_mandelaANC = open("mandelaANC.txt")
# MB_Test=inputfile_mandelaANC.read()
from nltk.model import build_vocabulary
from nltk.model import count_ngrams
from nltk.model import MLENgramModel
from nltk.model import LidstoneNgramModel

# load doc into memory
# The context manager closes the dataset file as soon as it has been read,
# instead of leaving the handle open until garbage collection.
with open('datasets/WW_Dataset.txt', 'r') as dataset_file:
    raw = dataset_file.read()
print(raw[:75])

tokens = word_tokenize(raw)
print(len(tokens))

lines = line_tokenize(raw)
test_lines = lines[3:5]
# NOTE(review): if line_tokenize returns plain strings, this flattens the
# two lines into single characters rather than words -- confirm.
test_words = [w for s in test_lines for w in s]
print(test_words[:5])

corpus = [w.lower() for w in tokens]
text = nltk.Text(tokens)
words = [w.lower() for w in tokens]
print(words[:10])

# Distinct lower-cased words; `vocab` is rebound to the model vocabulary below.
vocab = sorted(set(words))
print(len(vocab))

# 95% / 5% split point over the token sequence.
spl = int(95*len(corpus)/100)
train = text[:spl]
test = text[spl:]

# LidstoneNgramModel over bigram counts.
# NOTE(review): `text` is a flat token sequence and the perplexity input is a
# raw string; if this API expects sentences of word tokens, both would be
# iterated character by character -- confirm against the nltk.model docs.
vocab = build_vocabulary(2, words)
bigram_counts = count_ngrams(2, vocab, text)
bigram_model = LidstoneNgramModel(3,bigram_counts)
#ex_score = bigram_model.score("yawned", ["he"])
print(bigram_model.perplexity("stopped and took the penny up and when the cripple nearer drew quoth andrew under halfacrown what a man finds is all his own and so my friend goodday to show and proud still in that dear vagrants looked up by reason bound as if in habit sympathy their spirit spare more oft love for thee that say stand in all works or their congenial powers that fear as pleasures round the stationary blasts of their sorrow heart those higher years that did i meditate to me and evil sweet respect to paint that musings of vice as high and words of excellence had with beatitude that pure gains and sedate"))
# Train on 95% of the corpus and test on the rest
# spl = 95*len(corpus)/100
# NOTE(review): no split is applied here -- `train` is bound to the whole corpus.
train = corpus
test = secondInaugral

# Remove rare words from the corpus: items counted fewer than 5 times in
# `train` are replaced by the "*unknown*" placeholder in both sequences.
fdist = nltk.FreqDist(w for w in train)
vocabulary = set(word for word, freq in fdist.iteritems() if freq >= 5)
train = list(w if w in vocabulary else "*unknown*" for w in train)
test = list(w if w in vocabulary else "*unknown*" for w in test)

# First LaplaceNgramModel over bigram counts.
# NOTE(review): built from test_words1 / test_sents rather than from the
# filtered `train` sequence above -- confirm that this is intended.
vocab = build_vocabulary(5, test_words1)
print(len(vocab))
bigram_counts = count_ngrams(2, vocab, test_sents)
print("count")
# print sorted(bigram_counts.ngrams[2].conditions())
bigram_model = LaplaceNgramModel(bigram_counts)
print(bigram_model.ngram_counter == bigram_counts)
# lm = NgramModel(3, brown.words(categories='news'), estimator)
# Single-argument print: same output as the two-item print statement.
print("perplexity(test) = %s" % (bigram_model.perplexity(test),))
print(sents1)

# Second LaplaceNgramModel, built the same way from the *_nm inputs.
vocabnm = build_vocabulary(5, test_words_nm)
print(len(vocabnm))
bigram_counts_nm = count_ngrams(2, vocabnm, test_nm_sents)
print("count")
# print sorted(bigram_counts.ngrams[2].conditions())
bigram_model_nm = LaplaceNgramModel(bigram_counts_nm)
print(bigram_model_nm.ngram_counter == bigram_counts_nm)
# Train on 95% of the corpus and test on the rest
# spl = 95*len(corpus)/100
# NOTE(review): no split is applied here -- `train` is bound to the whole corpus.
train = corpus
test = secondInaugral

# Remove rare words from the corpus: items counted fewer than 5 times in
# `train` are replaced by the "*unknown*" placeholder in both sequences.
fdist = nltk.FreqDist(w for w in train)
vocabulary = set(word for word, freq in fdist.iteritems() if freq >= 5)
train = list(w if w in vocabulary else "*unknown*" for w in train)
test = list(w if w in vocabulary else "*unknown*" for w in test)

# First LaplaceNgramModel over bigram counts.
# NOTE(review): built from test_words1 / test_sents rather than from the
# filtered `train` sequence above -- confirm that this is intended.
vocab = build_vocabulary(5, test_words1)
print(len(vocab))
bigram_counts = count_ngrams(2, vocab, test_sents)
print("count")
# print sorted(bigram_counts.ngrams[2].conditions())
bigram_model = LaplaceNgramModel(bigram_counts)
print(bigram_model.ngram_counter == bigram_counts)
# lm = NgramModel(3, brown.words(categories='news'), estimator)
# Single-argument print: same output as the two-item print statement.
print("perplexity(test) = %s" % (bigram_model.perplexity(test),))
print(sents1)

# Second LaplaceNgramModel, built the same way from the *_nm inputs.
vocabnm = build_vocabulary(5, test_words_nm)
print(len(vocabnm))
bigram_counts_nm = count_ngrams(2, vocabnm, test_nm_sents)
print("count")
# print sorted(bigram_counts.ngrams[2].conditions())
bigram_model_nm = LaplaceNgramModel(bigram_counts_nm)
print(bigram_model_nm.ngram_counter == bigram_counts_nm)
# --- Lincoln texts: raw strings ------------------------------------------------
Gettysburg = inputfile_Gettysburg.read()

inputfile_firstInaugral = open("firstInaugral.txt")
firstInaugral = inputfile_firstInaugral.read()

lincolnTotal = Gettysburg + firstInaugral

# Opened but not read here.
inputfile_secondInaugral = open("secondInaugral.txt")
#inputfile_secondInaugral.read()

# --- Lincoln texts: training / test word lists ---------------------------------
LB_Train_Corpus = sents_gettysburg + sents_firstInaugral
train_words_lb = [w for s in LB_Train_Corpus for w in s]
test_words_lb = [w for s in sents_secondInaugral for w in s]

# Remove rare words from the corpus
vocab = build_vocabulary(5, train_words_lb)
# NOTE(review): these map each word to the result of `word in vocab` rather
# than to a filtered word list -- confirm that this is what is wanted.
LB_Train = map(lambda token: token in vocab, train_words_lb)
LB_Test = map(lambda token: token in vocab, test_words_lb)

# LaplaceNgramModel over bigram counts from the Lincoln training sentences.
bigram_counts = count_ngrams(2, vocab, LB_Train_Corpus)
LB = LaplaceNgramModel(bigram_counts)

#***************************************************
# --- Mandela texts: sentences and raw strings ----------------------------------
sents_nm_freedom = gutenberg.sents("mandelaFreedom.txt")
sents_nm_prepared = gutenberg.sents("mandelaPrepared.txt")
sents_nm_anc = gutenberg.sents("mandelaANC.txt")

inputfile_mandelaFreedom = open("mandelaFreedom.txt")
mandelaFreedom = inputfile_mandelaFreedom.read()

inputfile_mandelaPrepared = open("mandelaPrepared.txt")
mandelaPrepared = inputfile_mandelaPrepared.read()

mandelaTotal = mandelaFreedom + mandelaPrepared

# Opened but not read here.
inputfile_mandelaANC = open("mandelaANC.txt")
# MB_Test=inputfile_mandelaANC.read()