def cross_validate(result_file, root_dir, folds, read_func, sent_num):
    # remove any existing result file
    if os.path.isfile(result_file):
        os.remove(result_file)
    print 'cross-validation started'
    for i, fold in enumerate(folds):
        print 'fold %d/%d started' % (i + 1, len(folds))
        # read test
        test = read_func(root_dir, fold, sent_num, local_idx=True, full_info=True)
        # read train
        train_files = reduce(lambda a, b: a + b, folds[:i] + folds[i + 1:])
        train = read_func(root_dir, train_files, sent_num)
        # open result file
        fw = open(result_file, 'ab')
        # train and compute
        count = 1
        for key in test.iterkeys():
            lm = NgramModel(3, train[key])
            for s in test[key]:
                if len(s[-1]) > 0:
                    e = lm.entropy(s[-1])
                    # s[0] is xml id, s[1] is div index, key is global index,
                    # s[2] is turn index, s[3] is local index
                    fw.write(','.join(map(str, (s[0], s[1], key, s[2], s[3], e))) + '\n')
            # print progress
            sys.stdout.write('\rkey = %d/%d done' % (count, len(test)))
            sys.stdout.flush()
            count += 1
        fw.close()
        print 'fold %d/%d done' % (i + 1, len(folds))
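
# A self-contained sketch (not from the original file) of the scoring step
# inside the loop above, assuming the NLTK 2.x nltk.model.ngram API these
# snippets target (removed in NLTK 3). The toy tokens are made up.
from nltk.model.ngram import NgramModel
from nltk.probability import LidstoneProbDist

est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
train_tokens = 'the cat sat on the mat'.split()
toy_lm = NgramModel(3, train_tokens, estimator=est)
# entropy of a test sentence under the trigram model, in bits
print(toy_lm.entropy('the cat sat'.split()))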
def test_MLEEstimator(self):
    est = MLEEstimator()
    dapos_model = NGram(3, estimator=est)
    dapos_model.set_index(AuxiliaryIndex)
    nltk_model = NgramModel(3, self.corpus, estimator=MLEProbDist)
    phrase = 'Stop being stunned'.split()
    x = dapos_model.prob(phrase)
    y = nltk_model.prob(phrase[2], phrase[:2])
    # the custom MLE estimator should agree with NLTK's model
    self.assertAlmostEqual(x, y)
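
# A hand check (not from the original test) of the MLE trigram probability
# being compared above, using only nltk.probability on a made-up corpus.
from nltk import ngrams
from nltk.probability import ConditionalFreqDist

toy = 'stop being stunned stop being calm'.split()
cfd = ConditionalFreqDist(((w1, w2), w3) for w1, w2, w3 in ngrams(toy, 3))
# MLE: P(stunned | stop, being) = count(stop being stunned) / count(stop being *)
print(cfd[('stop', 'being')].freq('stunned'))  # 0.5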
def train():
    # parse XML and load up words
    print("Loading words from XML files...")
    sentences = []
    files = glob.glob("data/*.xml")
    i = 0
    for file in files:
        if i > 0 and i % 500 == 0:
            print("%d/%d files loaded, #-sentences: %d" % (i, len(files), len(sentences)))
            break
        dir, file = file.split("/")
        reader = XMLCorpusReader(dir, file)
        sentences.extend(nltk.sent_tokenize(" ".join(reader.words())))
        i += 1
    words = []
    for sentence in sentences:
        words.append(nltk.word_tokenize(sentence))
    # build a trigram language model with Lidstone smoothing
    # over the words array
    print("Building language model...")
    est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    langModel = NgramModel(3, words, estimator=est)
    # langModel = NgramModel(3, words)
    # cPickle.dump(langModel, open("lm.bin", 'wb'))
    return langModel
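
# A sketch (not from the original file) of what the Lidstone estimator used
# above computes: P(w) = (c(w) + 0.2) / (N + 0.2 * B), shown directly on a
# toy frequency distribution with the real nltk.probability API.
from nltk.probability import FreqDist, LidstoneProbDist

fd = FreqDist('a a a b'.split())
pd = LidstoneProbDist(fd, 0.2, bins=fd.B())
print(pd.prob('a'))  # (3 + 0.2) / (4 + 0.2 * 2) = 0.7272...
print(pd.prob('b'))  # (1 + 0.2) / (4 + 0.2 * 2) = 0.2727...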
def build_lm(self, corpus, order=2):
    """Create a reasonable English language model on your training data."""
    self._lm_order = order
    if order > 0:
        tokens = []
        sentence_count = 0
        for e_sent, f_sent in corpus:
            if sentence_count % 100 == 0:
                print("LM Sentence %i" % sentence_count)
            sentence_count += 1
            # Each sentence starts with an empty string
            tokens += [''] + e_sent
        estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.1)
        self._lm = NgramModel(order, tokens, pad_left=False,
                              pad_right=False, estimator=estimator)
    else:
        self._lm = StubLanguageModel()
def leftmodel(sequencesinfamilylist):
    print 'Learning left model...'
    model_left = NgramModel(3, sequencesinfamilylist, pad_left=False,
                            pad_right=True, estimator=lid_estimator)
    print 'Done learning left model.'
    return model_left
def build_model(word_string):
    words = word_string.replace('\n', ' ').replace('\t', ' ')
    # split_delim = "|".join(["\%s" % s for s in string.punctuation + " "])
    # words = re.split(split_delim, words)
    # runs of letters or runs of punctuation become tokens; re.escape keeps
    # the punctuation characters safe inside the character class
    words = re.findall('[a-zA-Z]+|[%s]+' % re.escape(string.punctuation), words)
    words = [w.strip() for w in words]
    est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    model = NgramModel(6, words, estimator=est)
    return model
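
# A hypothetical usage of build_model above (not from the original file);
# the sample string is a made-up stand-in, and generate() is the NLTK 2.x
# NgramModel sampling method.
sample = "The quick brown fox jumps over the lazy dog. " * 20
m = build_model(sample)
print(" ".join(m.generate(15)))  # 15 tokens sampled from the 6-gram model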
def get_similarity(q, idx, tf_idf):
    COUNT_WORDS_IN_TEXT = 100
    # collect the candidate documents, keyed by text id
    docs_for_sim = {}
    for info in tf_idf:
        for ds in info[1]:
            docs_for_sim[ds.text_id] = ds
    rank_lm = []
    est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    q_sequence = wordpunct_tokenize(q)
    for ds in docs_for_sim.values():
        sequence = wordpunct_tokenize(ds.text)
        # score the query by its entropy under a trigram model of the document
        lm = NgramModel(3, sequence, estimator=est)
        ds.weight = lm.entropy(q_sequence)
        # if len(ds.text) > COUNT_WORDS_IN_TEXT:
        rank_lm += [ds]
    return sorted(rank_lm, key=lambda ds: ds.weight)
def store_name_brand_attribute_features(review_data, filepath, sheetnum, colnum, storepath):
    # Build a unigram language model of a certain product category's reviews
    lm = NgramModel(1, review_data, estimator=None)  # TODO: needs an initialized estimator
    # Read the full review dataset
    data = tp.seg_fil_excel(filepath, sheetnum, colnum)
    ep = entropy_perplexity(lm, data)
    p = open(storepath, 'w')
    for j in ep:
        p.write(str(j[0]) + '\t' + str(j[1]) + '\n')
    p.close()
def buildmodel(textfn, onlywholewords=False):
    """Takes a filename for some input text, tokenizes, and builds a
    bigram model."""
    text = open(textfn, "r").read()
    words = nltk.word_tokenize(text)
    if onlywholewords:
        import string
        isletter = lambda c: c in string.ascii_letters
        words = [word for word in words if all(map(isletter, word))]
    model = NgramModel(2, words)
    return model
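
# A hypothetical usage of buildmodel (not from the original file);
# "sample.txt" is a stand-in path, not a file from the original project.
model = buildmodel("sample.txt", onlywholewords=True)
print(model.generate(20))  # 20 words sampled from the bigram model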
def build_model(self, text):
    c = wordpunct_tokenize(text)
    m = NgramModel(1, c, self.discount)
    return m
class Inflector(object):
    """A simple inflector based on a lemma bigram model."""

    def __init__(self, training_prefix):
        l_sentences = []
        f_sentences = []
        c_sentences = []
        trees = []
        # The set of possible inflections for each lemma.
        self.inflections = defaultdict(set)
        with utf8open(training_prefix + ".lemma") as lemma_file, \
             utf8open(training_prefix + ".form") as form_file, \
             utf8open(training_prefix + ".tree") as tree_file:
            for lemma_line, form_line, tree_line in izip(lemma_file, form_file, tree_file):
                l_sentence = lemma_line.split()
                f_sentence = form_line.split()
                c_sentence = []
                for lemma, form in izip(l_sentence, f_sentence):
                    c_sentence.append("{}~{}".format(lemma, form))
                    self.inflections[lemma].add(form)
                l_sentences.append(l_sentence)
                f_sentences.append(f_sentence)
                c_sentences.append(c_sentence)
                trees.append(DepTree(tree_line))
        self.lr_model = NgramModel(2, c_sentences, pad_left=True,
                                   estimator=lidstone_estimator)
        self.dp_model = DependencyNgramModel(2, l_sentences, f_sentences, trees)

    def inflect(self, testing_prefix, dp_weight=0.5):
        """Return a list containing inflected versions of the sentences
        described by the files under *testing_prefix*."""
        lr_weight = 1 - dp_weight
        inflected = []
        with utf8open(testing_prefix + ".lemma") as lemma_file, \
             utf8open(testing_prefix + ".tree") as tree_file:
            for lemma_line, tree_line in izip(lemma_file, tree_file):
                l_sentence = lemma_line.split()
                tree = DepTree(tree_line)
                ngrams = dep_ngrams(2, l_sentence, l_sentence, tree)
                forms = []
                last_lemma = None
                for lemma, dep_ngram in izip(l_sentence, ngrams):
                    if not self.inflections[lemma]:
                        # We've never seen this lemma before, so just
                        # output it as-is and move on.
                        forms.append(lemma)
                        continue
                    best_form = None
                    best_score = float("-inf")
                    for form in self.inflections[lemma]:
                        if last_lemma is None:
                            context = [""]
                        else:
                            context = ["{}~{}".format(last_lemma, forms[-1])]
                        # interpolate the linear (lemma~form bigram) model
                        # with the dependency model
                        score = (lr_weight * self.lr_model.prob(
                                     "{}~{}".format(lemma, form), context)
                                 + dp_weight * self.dp_model.prob(form, dep_ngram[:-1]))
                        if score > best_score:
                            best_form = form
                            best_score = score
                    forms.append(best_form)
                    last_lemma = lemma
                inflected.append(" ".join(forms))
        return inflected
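
# A hypothetical usage sketch (not from the original file): "train" and
# "test" stand in for prefixes of the train.lemma/train.form/train.tree and
# test.lemma/test.tree files the class reads.
inflector = Inflector("train")
for sentence in inflector.inflect("test", dp_weight=0.5):
    print(sentence)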
from random import randint
from nltk.tokenize import word_tokenize
from nltk.corpus import gutenberg
from nltk.probability import LidstoneProbDist
from nltk.model.ngram import NgramModel
import numpy as np

est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

noOfFiles = 3
fileids = ['bryant-stories.txt', 'carroll-alice.txt', 'shakespeare-hamlet.txt']
Authors = ['Bryant', 'Carroll', 'Shakespeare']
# offset past the title sentence at the start of each file
lenFirstSent = [len(gutenberg.sents(fileids[i])[0]) - 1 for i in range(noOfFiles)]
C = [gutenberg.words(fileids[i])[lenFirstSent[i]:] for i in range(noOfFiles)]
lenC = [len(C[i]) for i in range(noOfFiles)]

unigram = [NgramModel(1, C[i], estimator=est) for i in range(noOfFiles)]
bigram = [NgramModel(2, C[i], True, True, estimator=est) for i in range(noOfFiles)]
trigram = [NgramModel(3, C[i], True, True, estimator=est) for i in range(noOfFiles)]


def generateText(model, train):
    # seed generation with 20 words drawn at random from the training text
    pos = []
    for i in range(20):
        pos.append(train[randint(0, len(train) - 1)])
    return model.generate(50, set(pos))
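
# A hypothetical call (not in the original script): sample from the Carroll
# trigram model, seeded with words from its own training text.
print(" ".join(generateText(trigram[1], C[1])))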
from nltk.corpus import brown, shakespeare
from nltk.probability import LidstoneProbDist
from nltk.model.ngram import NgramModel

NGRAM_MODEL_N = 3
# TRAIN = brown.words(categories='lore')  # just a list of strings
# the shakespeare XML corpus reader needs a single play named explicitly
TRAIN = shakespeare.words('hamlet.xml')
ESTIMATOR = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

lm = NgramModel(NGRAM_MODEL_N, TRAIN, estimator=ESTIMATOR)
print lm
print lm.generate(40)
print 'done'
from nltk.model.ngram import NgramModel
import re

corpus = {"bill": [], "resolution": []}
for b in Bill.objects.filter(congress__gte=109):
    title = b.title_no_number + " ###"
    if title.startswith("To "):
        continue
    title = re.sub(r" \d\d\d\d", " 2015", title)
    title = re.sub(r"\.$", "", title)
    corpus[b.noun].append(title.split(" "))

# Generate a few separate models.
models = {
    ("bill", 2): NgramModel(2, corpus["bill"]),
    ("bill", 3): NgramModel(3, corpus["bill"]),
    ("resolution", 2): NgramModel(2, corpus["resolution"]),
    ("resolution", 3): NgramModel(3, corpus["resolution"]),
}

def make_random_bill_title(bill_type):
    # Generate a sentence, one word at a time, alternating between the
    # bigram and trigram models.
    sentence = []
    while True:
        model = models[(bill_type, 2 if (len(sentence) % 2) == 0 else 3)]
        wd = model.choose_random_word(sentence)
        if wd == "###":
            if len(sentence) > 6:
                # finished
folds = tmp

# remove the existing result file
res_file_name = 'results_swbd_nltk_CV.txt'
if os.path.isfile(res_file_name):
    os.remove(res_file_name)

SAMPLE_SENTS_NUM = 100
for fold_idx, fold in enumerate(folds):  # takes 37 min
    # print progress
    print 'fold %d/%d started' % (fold_idx + 1, len(folds))
    # read test sents
    test_sents = read_firstN_sents(cur, fold, SAMPLE_SENTS_NUM, include_loc=True)
    # read the train sents
    train_convIDs = reduce(lambda a, b: a + b, folds[:fold_idx] + folds[fold_idx + 1:])
    train_sents = read_firstN_sents(cur, train_convIDs, SAMPLE_SENTS_NUM)
    # open the file to write the results
    fw = open(res_file_name, 'ab')
    # train and compute
    key_count = 1
    for key in test_sents.iterkeys():
        lm = NgramModel(3, train_sents[key])
        for s in test_sents[key]:
            e = lm.entropy(s[2])
            fw.write(','.join((str(s[0]), str(key), str(s[1]), str(e))) + '\n')
        # print progress
        sys.stdout.write('\rkey = %d/%d done' % (key_count, len(test_sents)))
        sys.stdout.flush()
        key_count += 1
    fw.close()
    # print progress
    print '\nfold %d/%d done\n' % (fold_idx + 1, len(folds))
import nltk
from nltk.text import Text
from nltk.probability import LidstoneProbDist
from nltk.model.ngram import NgramModel

f = open("poem.txt")
# f = open("howl.txt")
data = f.read()

def removeNonAscii(s):
    return "".join(i for i in s if ord(i) < 128)

data = removeNonAscii(data)
tokens = nltk.word_tokenize(data)
t = Text(tokens)
# t.generate(30)

estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
trigram_model = NgramModel(3, t, estimator=estimator)
token_array = trigram_model.generate(150)[10:]

# drop a leading punctuation token, if any
first_token = token_array[0].strip()
if first_token in [".", ",", "?", "(", ")"]:
    token_array = token_array[1:]

joined = " ".join(token_array)
joined = joined.replace(" . ", ".\n")
joined = joined.replace(". ", ".\n")
joined = joined.replace(" ? ", "?\n")
joined = joined.replace(" , ", ",\n")
joined = joined.replace(" ) ", ")\n")
joined = joined.replace(" ( ", "(")
# An n-gram is a sequence of n words; we count n-grams in a text and
# calculate a conditional probability distribution like:
#
# $$
# P(X_i \mid X_{i-1}, \ldots, X_{i-n+1})
# $$

# <codecell>

from nltk.corpus import brown
from nltk.model.ngram import NgramModel
from nltk.probability import WittenBellProbDist, LidstoneProbDist

train_words = brown.words()[:-500]
test_words = brown.words()[-500:]
lm = NgramModel(2, train_words, lambda fd, b: LidstoneProbDist(fd, 0.2))

# <codecell>

lm.entropy(test_words)

# <markdowncell>

# ### Counting
#
# For example, how many words in a corpus are not in WordNet?

# <codecell>

from nltk.corpus import wordnet
from nltk.probability import ConditionalFreqDist
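
# <codecell>

# An added sketch (not from the original notebook) answering the question
# above: count the Brown word types for which WordNet has no synsets.
brown_types = set(w.lower() for w in brown.words())
unknown = sum(1 for w in brown_types if not wordnet.synsets(w))
print(unknown)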