def train():
    """Build and return a trigram language model from the XML corpus.

    Reads the XML files under ``data/``, joins each file's words into
    text, splits that into sentences, word-tokenizes them, and fits a
    trigram ``NgramModel`` with Lidstone smoothing (gamma = 0.2).

    Returns:
        NgramModel: the trained trigram language model.
    """
    # parse XML and load up words
    print("Loading words from XML files...")
    sentences = []
    files = glob.glob("data/*.xml")
    for i, path in enumerate(files):
        if i > 0 and i % 500 == 0:
            print("%d/%d files loaded, #-sentences: %d"
                  % (i, len(files), len(sentences)))
            # NOTE(review): this break caps loading at 500 files —
            # looks like a debugging leftover; confirm it is intended.
            break
        # rsplit on the last separator is robust to nested directories
        # (plain split("/") broke on paths with more than one slash) and
        # avoids shadowing the builtins `dir` and `file`.
        dirname, basename = path.rsplit("/", 1)
        reader = XMLCorpusReader(dirname, basename)
        sentences.extend(nltk.sent_tokenize(" ".join(reader.words())))
    words = [nltk.word_tokenize(sentence) for sentence in sentences]
    # Build a trigram language model. The estimator below is Lidstone
    # smoothing with gamma=0.2 (the old comment incorrectly said
    # Good-Turing).
    print("Building language model...")
    est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    langModel = NgramModel(3, words, estimator=est)
    # langModel = NgramModel(3, words)
    # cPickle.dump(langModel, open("lm.bin", 'wb'))
    return langModel
def cross_validate(result_file, root_dir, folds, read_func, sent_num): if os.path.isfile(result_file): os.system('rm ' + result_file) print 'cross-validation started' for i, fold in enumerate(folds): print 'fold %d/%d started' % (i+1, len(folds)) # read test test = read_func(root_dir, fold, sent_num, local_idx = True, full_info = True) # read train train_files = reduce(lambda a,b: a+b, folds[:i] + folds[i+1:]) train = read_func(root_dir, train_files, sent_num) # open result file fw = open(result_file, 'ab') # train and compute count = 1 for key in test.iterkeys(): lm = NgramModel(3, train[key]) for s in test[key]: if len(s[-1]) > 0: e = lm.entropy(s[-1]) # s[0] is xml id # s[1] is div index # key is global index # s[2] is turn index # s[3] is local index fw.write(','.join(map(str, (s[0], s[1], key, s[2], s[3], e))) + '\n') # print progress sys.stdout.write('\rkey = %d/%d done' % (count, len(test))) sys.stdout.flush() count += 1 print 'fold %d/%d done' % (i+1, len(folds))
def build_lm(self, corpus, order=2):
    """
    Create a reasonable English language model on your training data.
    """
    self._lm_order = order
    if order <= 0:
        # Non-positive order: fall back to the stub model.
        self._lm = StubLanguageModel()
        return
    tokens = []
    for idx, (e_sent, f_sent) in enumerate(corpus):
        if idx % 100 == 0:
            print("LM Sentence %i" % idx)
        # Each sentence starts with an empty string
        tokens += [''] + e_sent
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.1)
    self._lm = NgramModel(order, tokens,
                          pad_left=False, pad_right=False,
                          estimator=estimator)
def store_name_brand_attribute_features(review_data, filepath, sheetnum, colnum, storepath):
    """Compute entropy/perplexity features and write them to a file.

    Builds a unigram model over ``review_data``, scores the sentences
    read from the Excel sheet, and writes one tab-separated
    ``value1<TAB>value2`` pair per line to ``storepath``.
    """
    lm = NgramModel(1, review_data, estimator=None)
    data = tp.seg_fil_excel(filepath, sheetnum, colnum)
    ep = entropy_perplexity(lm, data)
    # `with` closes the output even if a write raises (the original
    # leaked the handle on error).
    with open(storepath, 'w') as out:
        for j in ep:
            out.write(str(j[0]) + '\t' + str(j[1]) + '\n')
def leftmodel(sequencesinfamilylist): print 'Learning left model...' model_left = NgramModel(3, sequencesinfamilylist, pad_left=False, pad_right=True, estimator=lid_estimator) print 'Done learning left model.' return model_left
def build_model(word_string):
    """Tokenize ``word_string`` into letter runs and punctuation runs,
    then fit a 6-gram model with Lidstone smoothing (gamma = 0.2)."""
    flattened = word_string.replace('\n', ' ').replace('\t', ' ')
    # Letter runs or punctuation runs, whitespace discarded.
    raw_tokens = re.findall('[a-zA-Z]+|[%s]+' % string.punctuation, flattened)
    tokens = [tok.strip() for tok in raw_tokens]
    smoother = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    return NgramModel(6, tokens, estimator=smoother)
def store_name_brand_attribute_features(review_data, filepath, sheetnum, colnum, storepath):
    """Compute entropy/perplexity features for a review dataset and
    store them as tab-separated lines in ``storepath``."""
    # Building an ngram language model of a certain product category review
    lm = NgramModel(1, review_data, estimator=None)  # Need initiallized
    # Read full review dataset
    data = tp.seg_fil_excel(filepath, sheetnum, colnum)
    ep = entropy_perplexity(lm, data)
    # `with` closes the output even if a write raises (the original
    # leaked the handle on error).
    with open(storepath, 'w') as out:
        for j in ep:
            out.write(str(j[0]) + '\t' + str(j[1]) + '\n')
def buildmodel(textfn, onlywholewords=False):
    """Takes a filename for some input text, tokenizes, and builds a
    bigram model.

    Args:
        textfn: path of the text file to read.
        onlywholewords: if True, keep only tokens made entirely of
            ASCII letters (drops punctuation, numbers, mixed tokens).

    Returns:
        NgramModel: a bigram model over the tokenized text.
    """
    # `with` guarantees the handle is closed; the original leaked the
    # open file.
    with open(textfn, "r") as fh:
        text = fh.read()
    words = nltk.word_tokenize(text)
    if onlywholewords:
        import string
        letters = set(string.ascii_letters)
        words = [word for word in words
                 if all(c in letters for c in word)]
    model = NgramModel(2, words)
    return model
from random import randint
from nltk.tokenize import word_tokenize
import numpy as np

# Lidstone-smoothed estimator (gamma = 0.2) shared by all models below.
est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

noOfFiles = 3
fileids = ['bryant-stories.txt', 'carroll-alice.txt', 'shakespeare-hamlet.txt']
Authors = ['Bryant', 'Carroll', 'Shakespeare']
# Length of each text's first (title) sentence minus one, used below to
# skip that header material.
lenFirstSent = [
    len(gutenberg.sents(fileids[i])[0]) - 1 for i in range(noOfFiles)
]
C = [gutenberg.words(fileids[i])[lenFirstSent[i]:] for i in range(noOfFiles)]
lenC = [len(C[i]) for i in range(noOfFiles)]
unigram = [NgramModel(1, C[i], estimator=est) for i in range(noOfFiles)]
bigram = [
    NgramModel(2, C[i], True, True, estimator=est) for i in range(noOfFiles)
]
trigram = [
    NgramModel(3, C[i], True, True, estimator=est) for i in range(noOfFiles)
]


def generateText(model, train):
    """Seed the generator with 20 random words from ``train`` and emit
    50 words from ``model``."""
    pos = []
    for i in range(20):
        # randint is inclusive of both endpoints, so the upper bound
        # must be len(train) - 1; the original randint(0, len(train))
        # could raise IndexError.
        pos.append(train[randint(0, len(train) - 1)])
    return model.generate(50, set(pos))
SAMPLE_SENTS_NUM = 100 for fold_idx, fold in enumerate(folds): # takes 37 min # print progress print 'fold %d/%d started' % (fold_idx + 1, len(folds)) # read test sents test_sents = read_firstN_sents(cur, fold, SAMPLE_SENTS_NUM, include_loc=True) # read the train sents train_convIDs = reduce(lambda a, b: a + b, folds[:fold_idx] + folds[fold_idx + 1:]) train_sents = read_firstN_sents(cur, train_convIDs, SAMPLE_SENTS_NUM) # open the file to write the results fw = open(res_file_name, 'ab') # train and compute key_count = 1 for key in test_sents.iterkeys(): lm = NgramModel(3, train_sents[key]) for s in test_sents[key]: e = lm.entropy(s[2]) fw.write(','.join((str(s[0]), str(key), str(s[1]), str(e))) + '\n') # print progress sys.stdout.write('\rkey = %d/%d done' % (key_count, len(test_sents))) sys.stdout.flush() key_count += 1 # print progress print '\nfold %d/%d done\n' % (fold_idx + 1, len(folds))
from nltk.model.ngram import NgramModel import re corpus = {"bill": [], "resolution": []} for b in Bill.objects.filter(congress__gte=109): title = b.title_no_number + " ###" if title.startswith("To "): continue title = re.sub(r" \d\d\d\d", " 2015", title) title = re.sub(r"\.$", "", title) corpus[b.noun].append(title.split(" ")) # Generate a few separate models. models = { ("bill", 2): NgramModel(2, corpus["bill"]), ("bill", 3): NgramModel(3, corpus["bill"]), ("resolution", 2): NgramModel(2, corpus["resolution"]), ("resolution", 3): NgramModel(3, corpus["resolution"]), } def make_random_bill_title(bill_type): # Generate a sentence, one word at a time. sentence = [] while True: model = models[(bill_type, 2 if (len(sentence) % 2) == 0 else 3)] wd = model.choose_random_word(sentence) if wd == "###": if len(sentence) > 6: # finished
data = f.read()


def removeNonAscii(s):
    """Return ``s`` with every non-ASCII character removed."""
    return "".join(i for i in s if ord(i) < 128)


data = removeNonAscii(data)
tokens = nltk.word_tokenize(data)
t = Text(tokens)
# t.generate(30)
estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
trigram_model = NgramModel(3, t, estimator=estimator)
# Generate 150 tokens and drop the first 10 (warm-up).
token_array = trigram_model.generate(150)[10:]
# BUG FIX: the original read `token_array[0].strip` without calling it,
# which bound the method object, so the punctuation check below could
# never match. `.strip()` returns the stripped string as intended.
first_token = token_array[0].strip()
if first_token in [".", ",", "?", "(", ")"]:
    token_array = token_array[1:]
# Re-join and normalize punctuation spacing into line breaks.
joined = " ".join(token_array)
joined = joined.replace(" . ", ".\n")
joined = joined.replace(". ", ".\n")
joined = joined.replace(" ? ", "?\n")
joined = joined.replace(" , ", ",\n")
joined = joined.replace(" ) ", ")\n")
joined = joined.replace(" ( ", "(")
def build_model(self, text):
    """Build a unigram model over the word/punctuation tokens of
    ``text`` using this instance's discount estimator."""
    tokens = wordpunct_tokenize(text)
    return NgramModel(1, tokens, self.discount)