def test_count_ngrams_kwargs(self):
    """A custom ``left_pad_symbol`` is expected to appear as a bigram context.

    With ``left_pad_symbol="TEST"``, the first word of the first sentence
    ("blue") should be counted once after the context ``("TEST",)``.
    """
    corpus = ("the cow jumped over the blue moon . "
              "blue river jumped over the rainbow .")
    vocabulary = build_vocabulary(2, corpus.split())
    sentences = ["blue moon".split(), "over the rainbow".split()]

    ngram_counter = count_ngrams(
        2, vocabulary, sentences, left_pad_symbol="TEST")

    observed = ngram_counter.ngrams[2][("TEST", )]["blue"]
    self.assertEqual(observed, 1)
def test_count_ngrams(self):
    """Check character bigram counts for a single text.

    The vocabulary is built from 'abcdead' with cutoff 2, then bigrams are
    counted over 'abcfdezgadbew'. The expectations cover a pair that is
    expected to be absent ('a' -> 'b'), a pair seen once ('a' -> 'd') and
    the start-of-sentence padding ('<s>' -> 'a').
    """
    vocabulary = build_vocabulary(2, 'abcdead')
    bigram_table = count_ngrams(2, vocabulary, ['abcfdezgadbew']).ngrams[2]

    # (context, word, expected count), checked in this order.
    expectations = [
        (("a", ), 'b', 0),
        (("a", ), 'd', 1),
        (("<s>", ), 'a', 1),
    ]
    for context, word, expected in expectations:
        self.assertEqual(bigram_table[context][word], expected)
def test_count_grams_bad_kwarg(self):
    """An unknown keyword argument must make ``count_ngrams`` raise TypeError.

    The exact error message is checked as well.
    """
    corpus = ("the cow jumped over the blue moon . "
              "blue river jumped over the rainbow .")
    vocabulary = build_vocabulary(2, corpus.split())
    sentences = ["blue moon".split()]
    expected_error_msg = "ngrams() got an unexpected keyword argument 'dummy_kwarg'"

    with self.assertRaises(TypeError) as raised:
        count_ngrams(2, vocabulary, sentences, dummy_kwarg="TEST")

    # Checked after the context manager exits, once the exception is captured.
    self.assertEqual(expected_error_msg, str(raised.exception))
def main(in_file_name, min_len=150, max_len=300):
    """Build the training matrices from a verse file and write them to disk.

    Steps:
      1. Tokenise ``LYRICS_FILE`` on spaces and keep the words counted at
         least 10 times by ``build_vocabulary``.
      2. Load the word vectors for that vocabulary with ``load_word_vectors``.
      3. Read ``in_file_name`` (one verse per line), map every
         whitespace-separated token through ``word_to_index``, keep verses
         whose length lies in ``[min_len, max_len]`` and right-pad them with
         the ``'<pad>'`` index up to ``max_len``.
      4. Build ``y`` as ``X`` shifted left by one position with the ``'</s>'``
         index appended.
      5. Write ``X``, ``y`` and ``word_vectors`` to ``./data/data.hdf5`` and
         the index words, one per line, to ``data/index_words.txt``.

    Args:
        in_file_name: Path of the text file containing one verse per line.
        min_len: Minimum verse length (in tokens) to keep. Defaults to 150.
        max_len: Maximum verse length to keep; also the padded row width.
            Defaults to 300.

    Raises:
        ValueError: If ``in_file_name`` contains no verses at all.
    """
    # Flatten the lyrics into a list of non-empty, space-separated tokens.
    # The context manager closes the handle (the original leaked it).
    with open(LYRICS_FILE, 'r') as lyrics_file:
        lyrics = lyrics_file.read()
    text = [tok
            for line in lyrics.split('\n')
            for tok in line.split(' ')
            if tok != '']

    # Keep only the words whose count reaches 10. A real list is built so that
    # len() works whether map/filter are eager (Python 2) or lazy (Python 3).
    counts = build_vocabulary(1, text)
    vocab = [word for word, count in counts.items() if count >= 10]
    print("The vocabulary has %d words in it" % len(vocab))

    word_to_index, word_vectors, index_words = load_word_vectors(
        WORD_VECTOR_FILE, vocab)

    # One list of word indices per verse (line) of the input file.
    with open(in_file_name, 'r') as inf:
        X = [[word_to_index(tok) for tok in verse.split()] for verse in inf]

    lengths = [len(verse) for verse in X]
    if not lengths:
        # Fail with a clear message instead of an obscure formatting error.
        raise ValueError("no verses found in %s" % in_file_name)
    print("The mean length of a verse is %d words" % np.mean(lengths))
    print("The maximum length of a verse is %d words" % max(lengths))
    print("The minimum length of a verse is %d words" % min(lengths))

    pad_index = word_to_index('<pad>')
    end_index = word_to_index('</s>')

    kept = [verse for verse in X if min_len <= len(verse) <= max_len]
    padded = [verse + (max_len - len(verse)) * [pad_index] for verse in kept]
    # Targets are the inputs shifted by one token, closed by the end marker.
    shifted = [verse[1:] + [end_index] for verse in padded]

    y = np.array(shifted, dtype=np.int32)
    X = np.array(padded, dtype=np.int32)
    if not padded:
        # np.array([]) is one-dimensional; keep a 2-D (0 x max_len) matrix so
        # the shape report below still works when no verse is in range.
        X = X.reshape(0, max_len)
        y = y.reshape(0, max_len)
    print("The training matrix is %dx%d" % X.shape)

    with h5py.File('./data/data.hdf5', 'w') as f:
        f['X'] = X
        f['y'] = y
        f['word_vectors'] = word_vectors

    # Closing the file guarantees the word list is flushed to disk.
    with open('data/index_words.txt', 'w+') as index_words_file:
        for word in index_words:
            index_words_file.write(word + '\n')
def test_count_ngrams_multiple_texts(self):
    """Bigram counts are checked after passing two texts to ``count_ngrams``.

    The first text is a single character string, the second a list of
    tokenised sentences. The expectations cover an unseen pair
    ('blue' -> 'river'), a word expected to be counted as '<UNK>' after
    'blue', and a pair seen once ('over' -> 'the').
    """
    corpus = ("the cow jumped over the blue moon . "
              "blue river jumped over the rainbow .")
    vocabulary = build_vocabulary(2, corpus.split())
    char_text = ['zabcfdegadbew']
    word_text = ["blue moon".split(), "over the rainbow".split()]

    bigram_table = count_ngrams(2, vocabulary, char_text, word_text).ngrams[2]

    # (context, word, expected count), checked in this order.
    expectations = [
        (("blue", ), 'river', 0),
        (("blue", ), '<UNK>', 1),
        (("over", ), 'the', 1),
    ]
    for context, word, expected in expectations:
        self.assertEqual(bigram_table[context][word], expected)
def get_model_entropy(model, train_loader, eval_loader, vocab_size,
                      params=None, order=1):
    """Fit a Laplace unigram model on the training data and return its entropy.

    Args:
        model: Name of the model to evaluate; only ``'unigram'`` is implemented.
        train_loader: Training texts. They are unpacked as separate texts into
            ``build_vocabulary`` and passed as a whole to ``count_ngrams``.
        eval_loader: Evaluation data handed to the model's ``get_entropy``.
        vocab_size: Vocabulary size passed to ``LaplaceUnigramModel``.
        params: Optional weights; when given they are normalised to sum to 1.
            The unigram path below does not consume them.
        order: Unused here; kept so existing callers keep working.

    Returns:
        The value returned by ``LaplaceUnigramModel.get_entropy(eval_loader)``.

    Raises:
        ValueError: If ``model`` is anything other than ``'unigram'``.
    """
    if model != 'unigram':
        raise ValueError("Model not implemented: %s" % model)

    # Compare against None explicitly: the truth value of a multi-element
    # ndarray is ambiguous and ``if params`` raised ValueError for such input.
    if params is not None:
        params = params / np.sum(params, keepdims=True)

    vocab = build_vocabulary(1, *train_loader)
    # Padding is disabled: unigram counts need no sentence boundary symbols.
    counter = count_ngrams(1, vocab, train_loader,
                           pad_left=False, pad_right=False)
    unigram_model = LaplaceUnigramModel(vocab_size, counter)
    return unigram_model.get_entropy(eval_loader)
def _init_pos_lm(corpus_file):
    """Build an MLE n-gram model of order 3 from ``corpus_file``.

    The corpus is read twice through ``iter_file``: once as a flat stream of
    space-separated tokens to build the vocabulary, and once as a stream of
    space-split lines (sentences) to count the n-grams.
    """
    def iter_tokens(path):
        # Every space-separated token of every line, one at a time.
        for line in iter_file(path):
            for token in line.split(' '):
                yield token

    def iter_sentences(path):
        # Every line as a list of space-separated tokens.
        for line in iter_file(path):
            yield line.split(' ')

    # Build the vocabulary.
    # Words whose frequency is below this value are not treated as vocabulary.
    # The removal is only logical: the frequency of such a word is still kept.
    min_count = 1
    vocabulary = build_vocabulary(min_count, iter_tokens(corpus_file))

    # Count the n-grams.
    ngram_order = 3
    counts = count_ngrams(ngram_order, vocabulary, iter_sentences(corpus_file))

    # Turn the n-gram counts into scores.
    return MLENgramModel(counts)
from nltk.model import count_ngrams, LaplaceNgramModel
from nltk.corpus import gutenberg

# Earlier experiment with a different input file, kept for reference:
#text = open('/Users/purnendu/Desktop/Nat_Lang_HW3/Lin.txt').read()
#utext = unicode(text, "utf8")

# Training sentences and the lower-cased flat word list derived from them.
sents = gutenberg.sents('/Users/purnendu/Desktop/Nat_Lang_HW3/LB-Train.txt')
words_train_LB = words = [w.lower() for s in sents for w in s]

# Vocabulary with a cutoff of 3, built from the lower-cased words.
vocab = build_vocabulary(3, words)
#print(sents[:6])

# Bigram counts feeding a Laplace n-gram model.
# NOTE(review): the vocabulary is lower-cased while `sents` keeps its original
# casing - confirm this mismatch is intended.
bigram_counts = count_ngrams(2, vocab, sents)
#print(bigram_counts.unigrams)
LB_model = LaplaceNgramModel(bigram_counts)
#ex_score = LB_model.score("administration", ["of"])
#print ex_score

# Held-out sentences.
sents_test = gutenberg.sents(
    '/Users/purnendu/Desktop/Nat_Lang_HW3/LB-Test.txt')
def test_build_vocabulary_no_texts(self):
    """A vocabulary built without any text must not contain 'a' or 'z'."""
    vocab = build_vocabulary(2)
    # unittest assertions (unlike bare ``assert``) are not stripped under
    # ``python -O`` and report both operands on failure.
    self.assertNotIn("a", vocab)
    self.assertNotIn("z", vocab)
def test_build_vocabulary_multiple_texts(self):
    """Membership is checked for a vocabulary built from two texts.

    With a cutoff of 2, 'c' (once in each text) is expected to be in the
    vocabulary while 'g' (once overall) is expected to be left out.
    """
    vocab = build_vocabulary(2, 'zabcfdegadbew', "abcdeadbe")
    # unittest assertions (unlike bare ``assert``) are not stripped under
    # ``python -O`` and report both operands on failure.
    self.assertIn("a", vocab)
    self.assertIn("c", vocab)
    self.assertNotIn("g", vocab)
def test_build_vocabulary(self):
    """Membership is checked for a vocabulary built from one text.

    With a cutoff of 2, 'a' (seen twice) is expected to be in the vocabulary
    while 'c' (seen once) is expected to be left out.
    """
    vocab = build_vocabulary(2, 'zabcfdegadbew')
    # unittest assertions (unlike bare ``assert``) are not stripped under
    # ``python -O`` and report both operands on failure.
    self.assertIn("a", vocab)
    self.assertNotIn("c", vocab)
sents_secondInaugral = gutenberg.sents("secondInaugral.txt")

# Raw text of the two training speeches. Each handle is closed as soon as its
# content has been read (the original left them open).
with open("Gettysburg.txt") as inputfile_Gettysburg:
    Gettysburg = inputfile_Gettysburg.read()
with open("firstInaugral.txt") as inputfile_firstInaugral:
    firstInaugral = inputfile_firstInaugral.read()
lincolnTotal = Gettysburg + firstInaugral

# Deliberately left open: its read() is commented out here, so code outside
# this section may still need the handle.
inputfile_secondInaugral = open("secondInaugral.txt")
#inputfile_secondInaugral.read()

# Training sentences and the flat word lists for training and test.
LB_Train_Corpus = sents_gettysburg + sents_firstInaugral
train_words_lb = [w for s in LB_Train_Corpus for w in s]
test_words_lb = [w for s in sents_secondInaugral for w in s]

# Remove rare words from the corpus
vocab = build_vocabulary(5, train_words_lb)
# NOTE(review): these produce one membership flag per word, not the words
# themselves - confirm that is what later code expects.
LB_Train = map(lambda x: x in vocab, train_words_lb)
LB_Test = map(lambda x: x in vocab, test_words_lb)

# Laplace n-gram model built from the bigram counts of the training sentences.
bigram_counts = count_ngrams(2, vocab, LB_Train_Corpus)
LB = LaplaceNgramModel(bigram_counts)

#***************************************************
sents_nm_freedom = gutenberg.sents("mandelaFreedom.txt")
sents_nm_prepared = gutenberg.sents("mandelaPrepared.txt")
sents_nm_anc = gutenberg.sents("mandelaANC.txt")

# Raw text of the Mandela speeches, read the same way as above.
with open("mandelaFreedom.txt") as inputfile_mandelaFreedom:
    mandelaFreedom = inputfile_mandelaFreedom.read()
with open("mandelaPrepared.txt") as inputfile_mandelaPrepared:
    mandelaPrepared = inputfile_mandelaPrepared.read()
mandelaTotal = mandelaFreedom + mandelaPrepared
from nltk.model import (
    build_vocabulary,
    count_ngrams,
    MLENgramModel,
    LidstoneNgramModel,
)

# Load the whole document into memory; the context manager closes the handle
# (the original left it open).
with open('datasets/WW_Dataset.txt', 'r') as dataset_file:
    raw = dataset_file.read()
print(raw[:75])

tokens = word_tokenize(raw)
print(len(tokens))

lines = line_tokenize(raw)
test_lines = lines[3:5]
# NOTE(review): if `test_lines` holds plain strings this flattens them into
# single characters rather than words - confirm.
test_words = [w for s in test_lines for w in s]
print(test_words[:5])

corpus = [w.lower() for w in tokens]
text = nltk.Text(tokens)
words = [w.lower() for w in tokens]
print(words[:10])

# Number of distinct lower-cased words; `vocab` is rebuilt further down.
vocab = sorted(set(words))
print(len(vocab))

# 95% / 5% split of the token sequence.
spl = int(95*len(corpus)/100)
train = text[:spl]
test = text[spl:]

vocab = build_vocabulary(2, words)
# NOTE(review): `text` is a flat sequence of tokens with their original casing,
# while the vocabulary is built from lower-cased words - confirm this is the
# input shape count_ngrams expects.
bigram_counts = count_ngrams(2, vocab, text)
bigram_model = LidstoneNgramModel(3, bigram_counts)
#ex_score = bigram_model.score("yawned", ["he"])
# NOTE(review): perplexity receives one untokenised string - confirm.
print(bigram_model.perplexity(
    "stopped and took the penny up and when the cripple nearer drew quoth andrew under halfacrown what a man finds is all his own and so my friend goodday to show and proud still in that dear vagrants looked up by reason bound as if in habit sympathy their spirit spare more oft love for thee that say stand in all works or their congenial powers that fear as pleasures round the stationary blasts of their sorrow heart those higher years that did i meditate to me and evil sweet respect to paint that musings of vice as high and words of excellence had with beatitude that pure gains and sedate"))
corpus = [word.lower() for word in corpus1.split()]

# Train on the whole corpus; the 95% / 5% split below is disabled.
# spl = 95*len(corpus)/100
train = corpus
test = secondInaugral

# Remove rare words from the corpus: samples counted fewer than 5 times are
# replaced by the "*unknown*" marker.
fdist = nltk.FreqDist(w for w in train)
vocabulary = set(
    sample for sample, count in fdist.iteritems() if count >= 5)
train = map(
    lambda item: "*unknown*" if item not in vocabulary else item, train)
test = map(
    lambda item: "*unknown*" if item not in vocabulary else item, test)

# Laplace bigram model for the first corpus.
vocab = build_vocabulary(5, test_words1)
print(len(vocab))
bigram_counts = count_ngrams(2, vocab, test_sents)
print("count")
# print sorted(bigram_counts.ngrams[2].conditions())
bigram_model = LaplaceNgramModel(bigram_counts)
print(bigram_model.ngram_counter == bigram_counts)
# lm = NgramModel(3, brown.words(categories='news'), estimator)

# Label and value are joined with a single space so the printed line is the
# same whether `print` is a statement or a function.
print(" ".join(["perplexity(test) =", str(bigram_model.perplexity(test))]))
print(sents1)

# Vocabulary and bigram counts for the second corpus.
vocabnm = build_vocabulary(5, test_words_nm)
print(len(vocabnm))
bigram_counts_nm = count_ngrams(2, vocabnm, test_nm_sents)
print("count")
# print sorted(bigram_counts.ngrams[2].conditions())
corpus = [word.lower() for word in corpus1.split()]

# Train on the whole corpus; the 95% / 5% split below is disabled.
# spl = 95*len(corpus)/100
train = corpus
test = secondInaugral

# Remove rare words from the corpus: samples counted fewer than 5 times are
# replaced by the "*unknown*" marker.
fdist = nltk.FreqDist(w for w in train)
vocabulary = set(
    sample for sample, count in fdist.iteritems() if count >= 5)
train = map(
    lambda item: "*unknown*" if item not in vocabulary else item, train)
test = map(
    lambda item: "*unknown*" if item not in vocabulary else item, test)

# Laplace bigram model for the first corpus.
vocab = build_vocabulary(5, test_words1)
print(len(vocab))
bigram_counts = count_ngrams(2, vocab, test_sents)
print("count")
# print sorted(bigram_counts.ngrams[2].conditions())
bigram_model = LaplaceNgramModel(bigram_counts)
print(bigram_model.ngram_counter == bigram_counts)
# lm = NgramModel(3, brown.words(categories='news'), estimator)

# Label and value are joined with a single space so the printed line is the
# same whether `print` is a statement or a function.
print(" ".join(["perplexity(test) =", str(bigram_model.perplexity(test))]))
print(sents1)

# Vocabulary and bigram counts for the second corpus.
vocabnm = build_vocabulary(5, test_words_nm)
print(len(vocabnm))
bigram_counts_nm = count_ngrams(2, vocabnm, test_nm_sents)
print("count")
# print sorted(bigram_counts.ngrams[2].conditions())
sents_secondInaugral = gutenberg.sents("secondInaugral.txt")

# Raw text of the two training speeches. Each handle is closed as soon as its
# content has been read (the original left them open).
with open("Gettysburg.txt") as inputfile_Gettysburg:
    Gettysburg = inputfile_Gettysburg.read()
with open("firstInaugral.txt") as inputfile_firstInaugral:
    firstInaugral = inputfile_firstInaugral.read()
lincolnTotal = Gettysburg + firstInaugral

# Deliberately left open: its read() is commented out here, so code outside
# this section may still need the handle.
inputfile_secondInaugral = open("secondInaugral.txt")
#inputfile_secondInaugral.read()

# Training sentences and the flat word lists for training and test.
LB_Train_Corpus = sents_gettysburg + sents_firstInaugral
train_words_lb = [w for s in LB_Train_Corpus for w in s]
test_words_lb = [w for s in sents_secondInaugral for w in s]

# Remove rare words from the corpus
vocab = build_vocabulary(5, train_words_lb)
# NOTE(review): these produce one membership flag per word, not the words
# themselves - confirm that is what later code expects.
LB_Train = map(lambda x: x in vocab, train_words_lb)
LB_Test = map(lambda x: x in vocab, test_words_lb)

# Laplace n-gram model built from the bigram counts of the training sentences.
bigram_counts = count_ngrams(2, vocab, LB_Train_Corpus)
LB = LaplaceNgramModel(bigram_counts)

#***************************************************
sents_nm_freedom = gutenberg.sents("mandelaFreedom.txt")
sents_nm_prepared = gutenberg.sents("mandelaPrepared.txt")
sents_nm_anc = gutenberg.sents("mandelaANC.txt")

# Raw text of the Mandela speeches, read the same way as above.
with open("mandelaFreedom.txt") as inputfile_mandelaFreedom:
    mandelaFreedom = inputfile_mandelaFreedom.read()
with open("mandelaPrepared.txt") as inputfile_mandelaPrepared:
    mandelaPrepared = inputfile_mandelaPrepared.read()
mandelaTotal = mandelaFreedom + mandelaPrepared