import cPickle as pickle
import random

import nltk
from nltk.model import NgramModel
from nltk.probability import LidstoneProbDist

print "... loading text"
text_train = list(nltk.corpus.gutenberg.words('austen-emma.txt'))
print len(set(text_train))
text_test = list(nltk.corpus.gutenberg.words('austen-sense.txt'))

#with open('./../datasets/t5_train') as f:
#    text_train = (' '.join(pickle.load(f))).split(' . ')
#    random.shuffle(text_train)
#    text_train = (' . '.join(text_train)).split(' ')
#
#with open('./../datasets/t5_test') as f:
#    text_test = (' '.join(pickle.load(f))).split(' . ')
#    random.shuffle(text_test)
#    text_test = (' . '.join(text_test)).split(' ')

print "... training model"
estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
lm = NgramModel(3, text_train, estimator=estimator)

print "... results"
print lm.generate(50, ['dog'])
print lm.perplexity(text_test)
print lm.entropy(text_test)
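# Note: nltk.model.NgramModel (used throughout these examples) was removed after
# NLTK 3.0. A minimal sketch of the same train/generate/perplexity workflow with
# the newer nltk.lm package; assumes NLTK >= 3.4 and Python 3, and is
# illustrative rather than a drop-in replacement for the script above.
import nltk
from nltk.lm import Lidstone
from nltk.lm.preprocessing import padded_everygram_pipeline, pad_both_ends

train_sents = nltk.corpus.gutenberg.sents('austen-emma.txt')
train_data, vocab = padded_everygram_pipeline(3, train_sents)

lm = Lidstone(0.2, 3)          # Lidstone smoothing with gamma=0.2, trigram order
lm.fit(train_data, vocab)

print(lm.generate(50, text_seed=['dog']))

# perplexity()/entropy() expect an iterable of n-grams, not raw tokens
test_sents = nltk.corpus.gutenberg.sents('austen-sense.txt')
test_ngrams = [ng for sent in test_sents
               for ng in nltk.ngrams(pad_both_ends(sent, n=3), 3)]
print(lm.perplexity(test_ngrams))
print(lm.entropy(test_ngrams))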
class Ngrammodel:

    def __init__(self):
        self.sentences = []
        self.ngramModel = None

    def loadSentences(self, corpus):
        print 'In Load Sentences'
        lines = UnicodeHelper.readlinesSingleColumn(corpus)
        print 'lines loaded'
        self.sentences = [tuple(line.split()) for line in lines]

    def trainNgramModel(self, n):
        self.ngramModel = NgramModel(n, self.sentences)

    def sanityCheck(self):
        print 'here'
        for sentence in self.sentences:
            print sentence

    def logprob(self, word, context):
        return self.ngramModel.logprob(word, context)

    def prob(self, word, context):
        return self.ngramModel.prob(word, context)

    def testCompletion(self, context, word):
        print "Prob:", self.ngramModel.prob(word, context)
        print "Log Prob:", self.ngramModel.logprob(word, context)
def demo_generate(text):
    print "len of tokens=", len(text)
    while True:
        N = raw_input("Select a number N for the N-gram model (2, 3, or 4 only):")
        N = int(N)
        if N in [2, 3, 4]:
            break
    if N == 2:
        bi = nltk.bigrams(text)
        cfd = nltk.ConditionalFreqDist(bi)
    else:
        from nltk.model import NgramModel
        from nltk.probability import LidstoneProbDist, WittenBellProbDist
        estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
        lm = NgramModel(N, text, estimator)
        # for w in lm.generate(20, context=('I')): print w,
    while 1:
        inp = raw_input('Enter a Chinese word such as "目前" (type 0 to exit):')
        print "inp='" + inp + "'"
        if inp == '0':
            break
        inp = inp.decode('big5')
        if N == 2:
            generate_model(cfd, inp)
        else:
            # generate() expects the seed context as a sequence of tokens
            for w in lm.generate(20, context=[inp]):
                print w,
            print "\n"
            # print a second sample for the same seed
            for w in lm.generate(20, context=[inp]):
                print w,
            print "\n"
import re
import sys
import random

from nltk.model import NgramModel
from nltk.probability import GoodTuringProbDist, LidstoneProbDist


def main(argv):
    AllWords = []
    OutFile = argv[1]
    GenQty = int(argv[2])
    print "Will try to generate " + str(GenQty) + " sentences!"

    sentLen = dict()
    lineQty = 0
    for line in sys.stdin:
        line = re.sub(r"</?s>", "", line)
        line = line.rstrip("\n")
        elems = re.split("\s+", line)
        AllWords.extend(elems)
        lineQty = lineQty + 1
        slen = len(elems)
        if not slen in sentLen:
            sentLen[slen] = 0
        sentLen[slen] = sentLen[slen] + 1

    print (sentLen)
    print str(len(AllWords)) + "\n"

    Estim = lambda fdist, bins: GoodTuringProbDist(fdist)
    # Estim = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    N = 3
    print "Words are read, now let's compute the " + str(N) + "-gram model.\n"
    model = NgramModel(N, AllWords, estimator=Estim)
    print "" + str(N) + "-gram model is computed.\n"

    outf = open(OutFile, "w")
    for i in range(1, GenQty + 1):
        # Sample a sentence length from the empirical length distribution.
        RandSum = random.randint(1, lineQty)
        sum = 0
        RandLen = -1
        for k in sentLen.keys():
            sum = sum + sentLen[k]
            if sum >= RandSum:
                RandLen = k
                break
        if RandLen == -1:
            print ("Internal error! Cannot select len for sent: " + str(i))
            sys.exit(1)

        text_words = model.generate(RandLen)
        # Concatenate all words generated in a string separating them by a space.
        text = " ".join([word for word in text_words])
        # Sometimes, we got more than one space, have no idea why
        text = re.sub(r"\s+", " ", "<s> " + text + " </s>")
        outf.write(text + "\n")
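# A more compact way to draw a sentence length from the empirical length
# distribution built above (purely illustrative; sample_length is not part of
# the original script and assumes the same sentLen dict and `import random`):
def sample_length(sentLen):
    # expanding each length by its count and choosing uniformly reproduces
    # the empirical distribution, same as the cumulative-sum loop above
    expanded = [length for length, cnt in sentLen.items() for _ in range(cnt)]
    return random.choice(expanded)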
def calcWordProb(self):
    word_seq = ['foo', 'foo', 'foo', 'foo', 'bar', 'baz']
    text = ("They become more expensive already. Mine is like 25. "
            "So horrible and they did less things than I did last time.")
    text = nltk.word_tokenize(text.translate(None, ',.'))
    print text
    #estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    #lm = NgramModel(2, word_seq, estimator)
    est = lambda freqdist, bins: LidstoneProbDist(freqdist, 0.2, bins)
    model = NgramModel(3, text, True, True, est, 21)
    # prob() expects the n-1 preceding tokens as context, not the whole text
    print model.prob("more", ["They", "become"])
def generateNgramModel(corpusPath, corpusName):
    corpusdir = 'corpora/'  # Directory of corpus.
    generatedCorpus = PlaintextCorpusReader(corpusPath, corpusName)
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    # uses bigrams just cause they BETTER
    ngrammodel = NgramModel(2, generatedCorpus.sents(), True, False, estimator)
    return ngrammodel
def hammertime(corpus, ngramss=0, numGen=100):
    tokens = list(word_tokenize(corpus))
    print tokens[0:900]
    # estimator for smoothing the N-gram model
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    if ngramss <= 0:
        ngramss = random.randint(2, 4)
    model = NgramModel(ngramss, tokens, False, False, estimator)
    # Apply the language model to generate numGen words in sequence
    text_words = model.generate(numGen)
    # Concatenate all words generated in a string separating them by a space.
    text = ' '.join([word for word in text_words])
    return text
def train_model(fdist, listObj, n):
    """
    @n - size of ngram
    @fdist - frequency distribution
    @listObj - ngram data list
    """
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    # train on the ngram data list, not on the frequency distribution
    lm = NgramModel(n, listObj, estimator=estimator)
    return lm
class CorpusText(nltk.Text):

    def concordance(self, word, width=79, lines=25):
        """
        Return a string concordance for ``word`` with the specified context window.
        Word matching is not case-sensitive.

        :seealso: ``ConcordanceIndex``
        (nltk default is to print concordance)
        """
        if '_concordance_index' not in self.__dict__:
            print "Building concordance index..."
            self._concordance_index = CorpusConcordanceIndex(self.tokens,
                                                             key=lambda s: s.lower())
        # return self._concordance_index.get_concordance_as_str(word, width, lines)
        return self._concordance_index.get_concordance_as_matrix(word, width, lines)

    def similar(self, word, num=20):
        """
        Returns as a string similar words
        """
        if '_word_context_index' not in self.__dict__:
            print 'Building word-context index...'
            self._word_context_index = nltk.ContextIndex(self.tokens,
                                                         filter=lambda x: x.isalpha(),
                                                         key=lambda s: s.lower())
        # words = self._word_context_index.similar_words(word, num)
        word = word.lower()
        wci = self._word_context_index._word_to_contexts
        if word in wci.conditions():
            contexts = set(wci[word])
            fd = FreqDist(w for w in wci.conditions() for c in wci[w]
                          if c in contexts and not w == word)
            words = fd.keys()[:num]
            return tokenwrap(words)
        else:
            print "No matches"

    def generate(self, length=100, context=()):
        """
        Return random text, generated using a trigram language model.

        :param length: The length of text to generate (default=100)
        :type length: int
        :seealso: NgramModel
        """
        if '_trigram_model' not in self.__dict__:
            print "Building ngram index..."
            estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
            self._trigram_model = NgramModel(3, self, estimator=estimator)
        text = self._trigram_model.generate(length, context=context)
        return tokenwrap(text)

    def get_adjacent_tokens(self, word, window=5, lines=25):
        ### todo: should this go here??? look into fixing nltk.ContentIndex
        ## to-do: figure out what to do about capitalization
        ## assert word == word.lower()
        result = []
        indices = [i for i, w in enumerate(self.tokens) if w.lower() == word]
        if indices:
            lines = min(lines, len(indices))
            print "Displaying %s of %s matches:" % (lines, len(indices))
            for i in indices:
                if lines <= 0:
                    break
                ind_a = max(0, i - window)
                ind_b = min(len(self.tokens), i + window)
                adjacent = self.tokens[ind_a:ind_b]
                result.append(adjacent)
                lines -= 1
        else:
            print "No matches"
        return result
class LanguageModel:

    STUPID_K = 0.4

    def __init__(self, corpus):
        """Initialize your data structures in the constructor."""
        self.trigramCounts = collections.defaultdict(lambda: 0)
        self.bigramCounts = collections.defaultdict(lambda: 0)
        self.unigramCounts = collections.defaultdict(lambda: 0)
        self.total = 0
        self.trilm = None
        self.bilm = None
        self.unilm = None
        self.train(corpus)

    def train(self, corpus):
        """Trains a language model using a trigram model with stupid backoff
        to a bigram model with stupid backoff to a unigram model with
        plus one smoothing"""
        for sentence in corpus.corpus:
            for i in xrange(0, len(sentence.data)):
                token = sentence.data[i].word
                self.unigramCounts[token] += 1
                self.total += 1
                if i + 1 < len(sentence.data):
                    next = sentence.data[i + 1].word
                    self.bigramCounts[(token, next)] += 1
                    if i + 2 < len(sentence.data):
                        third = sentence.data[i + 2].word
                        self.trigramCounts[(token, next, third)] += 1
        train_tokens = brown.words()
        estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
        self.trilm = NgramModel(3, train_tokens, True, False, estimator)
        self.bilm = NgramModel(2, train_tokens, True, False, estimator)
        self.unilm = NgramModel(1, train_tokens, True, False, estimator)

    def score(self, sentence):
        score = 0.0
        for i in xrange(2, len(sentence)):
            token = sentence[i]
            prev = sentence[i - 1]
            first = sentence[i - 2]
            tricount = self.trigramCounts[(first, prev, token)]
            # begin with trigram model
            if tricount > 0:
                score += self.trilm.prob(token, [prev, first])
                # score += math.log(tricount)
                # score -= math.log(self.bigramCounts[(first, prev)])
                score -= self.bilm.prob(first, [prev])
                # continue
            # back off to bigram model
            biCount = self.bigramCounts[(prev, token)]
            if biCount > 0:
                # score += math.log(biCount)
                score += self.bilm.prob(token, [prev])
                score += math.log(self.STUPID_K)
                # score -= math.log(self.unigramCounts[prev])
                score -= self.unilm.prob(prev, [])
                # continue
            # back off to unigram model with +1 smoothing
            # count = self.unigramCounts[token]
            score += math.log(2 * self.STUPID_K)
            score += self.unilm.prob(token, [])
            # score += math.log(count + 1.0)
            # score -= math.log(self.total + len(self.unigramCounts))
        return score

    def n_most_likely(self, sentences, n):
        """Given a list of string sentences, returns the n most likely"""
        # m = (float("-inf"), "")
        scores = []
        for s in sentences:
            prob = self.score(s)
            scores.append((s, prob))
        scores = sorted(scores, key=itemgetter(1, 0), reverse=True)
        sents = []
        for tup in scores[:n]:
            sents.append(tup[0])
        return sents
    if line not in useful:
        useful.append(line)

print "\ntotal useful sents: " + str(len(useful))

# train trigram model
corpus_tokens = []
print "Adding brown"
for word in brown.words():
    word = word.lower()
    corpus_tokens.append(word)
print "Adding gutenberg"
for word in gutenberg.words():
    word = word.lower()
    corpus_tokens.append(word)

print "Training Trigram Model"
lm = NgramModel(3, corpus_tokens, True, False,
                lambda f, b: LidstoneProbDist(f, 0.01, f.B() + 1))

tweet_entropies = []
count = 1
for sent in useful:
    sent = sent.split()
    percentage = 100 * count / len(useful)
    print "\rChecking entropy : " + str(count) + " of " + str(len(useful)) + " " + str(percentage) + "%",
    entropy = lm.entropy(sent)
    tweet_entropies.append((" ".join(sent), entropy))
    count += 1

# keep the 80% of tweets with the lowest entropy under the trigram model
tweet_entropies.sort(key=lambda x: x[1])
threshold = int(len(tweet_entropies) * 0.8)
list_of_tweets = tweet_entropies[:threshold]
print "\n",
#m = NgramModel(1, [str(i) for i in [1,2,3,4,5]])
#print m.prob('1', [])

# Tokens contains the vocabulary of the Brown corpus
tokens = set(brown.words())
# flatten the vocabulary into a character sequence, with '\t' as a word separator
words = []
for word in tokens:
    words.extend([char for char in word.lower()])
    words.extend(['\t'])
#tokens = list(genesis.words('english-kjv.txt'))
#tokens.extend(list(reuters.words(categories = 'trade')))

# estimator for smoothing the N-gram model
estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.02)

# N-gram language model with 3-grams
#model = NgramModel(3, tokens, estimator)
model = NgramModel(3, words, estimator)

# Apply the language model to generate 50 characters in sequence
text_words = model.generate(50)

# Concatenate everything generated in a string separating items by a space.
text = ' '.join([word for word in text_words])

# print the text
print text
print model.prob('e', ['a', 't'])
# Tokens contains the words from the tokenized Wikipedia plaintext file
tokens = tokenize_file("simple_wikipedia_plaintext.txt")
#tokens = brown.words(categories='news')
#print tokens[1:100]
#tokens = list(genesis.words('english-kjv.txt'))
#tokens.extend(list(reuters.words(categories = 'trade')))
#tokens.extend(list(brown.words(categories='news')))
#tokens.extend(list(reuters.words(categories = 'earn')))

# estimator for smoothing the N-gram model
est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

# N-gram language model of order N, padded at both ends
model = NgramModel(N, tokens, pad_left=True, pad_right=True, estimator=est)
#model = NgramModel(N, tokens, estimator=est)

# Apply the language model to generate 50 words in sequence
#text_words = model.generate(50)
# Concatenate all words generated in a string separating them by a space.
#text = ' '.join([word for word in text_words])
# print the text
#print text

sentence = "This is a sample sentence."
print sentence
print "p:", sentence_probability(sentence, model)
print "p_m:", sentence_probability_modified(sentence, model)
class Text(object):
    """
    A wrapper around a sequence of simple (string) tokens, which is
    intended to support initial exploration of texts (via the
    interactive console).  Its methods perform a variety of analyses
    on the text's contexts (e.g., counting, concordancing, collocation
    discovery), and display the results.  If you wish to write a
    program which makes use of these analyses, then you should bypass
    the ``Text`` class, and use the appropriate analysis function or
    class directly instead.

    A ``Text`` is typically initialized from a given document or
    corpus.  E.g.:

    >>> import nltk.corpus
    >>> from nltk.text import Text
    >>> moby = Text(nltk.corpus.gutenberg.words('melville-moby_dick.txt'))
    """
    # This defeats lazy loading, but makes things faster.  This
    # *shouldn't* be necessary because the corpus view *should* be
    # doing intelligent caching, but without this it's running slow.
    # Look into whether the caching is working correctly.
    _COPY_TOKENS = True

    def __init__(self, tokens, name=None):
        """
        Create a Text object.

        :param tokens: The source text.
        :type tokens: sequence of str
        """
        if self._COPY_TOKENS:
            tokens = list(tokens)
        self.tokens = tokens

        if name:
            self.name = name
        elif ']' in tokens[:20]:
            end = tokens[:20].index(']')
            self.name = " ".join(text_type(tok) for tok in tokens[1:end])
        else:
            self.name = " ".join(text_type(tok) for tok in tokens[:8]) + "..."

    #////////////////////////////////////////////////////////////
    # Support item & slice access
    #////////////////////////////////////////////////////////////

    def __getitem__(self, i):
        if isinstance(i, slice):
            return self.tokens[i.start:i.stop]
        else:
            return self.tokens[i]

    def __len__(self):
        return len(self.tokens)

    #////////////////////////////////////////////////////////////
    # Interactive console methods
    #////////////////////////////////////////////////////////////

    def concordance(self, word, width=79, lines=25):
        """
        Print a concordance for ``word`` with the specified context window.
        Word matching is not case-sensitive.

        :seealso: ``ConcordanceIndex``
        """
        if '_concordance_index' not in self.__dict__:
            print("Building index...")
            self._concordance_index = ConcordanceIndex(self.tokens,
                                                       key=lambda s: s.lower())
        self._concordance_index.print_concordance(word, width, lines)

    def collocations(self, num=20, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.

        :seealso: find_collocations
        :param num: The maximum number of collocations to print.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        """
        if not ('_collocations' in self.__dict__ and self._num == num
                and self._window_size == window_size):
            self._num = num
            self._window_size = window_size
            print("Building collocations list")
            from nltk.corpus import stopwords
            ignored_words = stopwords.words('english')
            finder = BigramCollocationFinder.from_words(self.tokens, window_size)
            finder.apply_freq_filter(2)
            finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
            bigram_measures = BigramAssocMeasures()
            self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
        colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations]
        print(tokenwrap(colloc_strings, separator="; "))

    def count(self, word):
        """
        Count the number of times this word appears in the text.
        """
        return self.tokens.count(word)

    def index(self, word):
        """
        Find the index of the first occurrence of the word in the text.
        """
        return self.tokens.index(word)

    def readability(self, method):
        # code from nltk_contrib.readability
        raise NotImplementedError

    def generate(self, length=100):
        """
        Print random text, generated using a trigram language model.

        :param length: The length of text to generate (default=100)
        :type length: int
        :seealso: NgramModel
        """
        if '_trigram_model' not in self.__dict__:
            print("Building ngram index...")
            estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
            self._trigram_model = NgramModel(3, self, estimator=estimator)
        text = self._trigram_model.generate(length)
        print(tokenwrap(text))

    def similar(self, word, num=20):
        """
        Distributional similarity: find other words which appear in the
        same contexts as the specified word; list most similar words first.

        :param word: The word used to seed the similarity search
        :type word: str
        :param num: The number of words to generate (default=20)
        :type num: int
        :seealso: ContextIndex.similar_words()
        """
        if '_word_context_index' not in self.__dict__:
            print('Building word-context index...')
            self._word_context_index = ContextIndex(self.tokens,
                                                    filter=lambda x: x.isalpha(),
                                                    key=lambda s: s.lower())

        # words = self._word_context_index.similar_words(word, num)

        word = word.lower()
        wci = self._word_context_index._word_to_contexts
        if word in wci.conditions():
            contexts = set(wci[word])
            fd = FreqDist(w for w in wci.conditions() for c in wci[w]
                          if c in contexts and not w == word)
            words = islice(fd.keys(), num)
            print(tokenwrap(words))
        else:
            print("No matches")

    def common_contexts(self, words, num=20):
        """
        Find contexts where the specified words appear; list
        most frequent common contexts first.

        :param word: The word used to seed the similarity search
        :type word: str
        :param num: The number of words to generate (default=20)
        :type num: int
        :seealso: ContextIndex.common_contexts()
        """
        if '_word_context_index' not in self.__dict__:
            print('Building word-context index...')
            self._word_context_index = ContextIndex(self.tokens,
                                                    key=lambda s: s.lower())

        try:
            fd = self._word_context_index.common_contexts(words, True)
            if not fd:
                print("No common contexts were found")
            else:
                ranked_contexts = islice(fd.keys(), num)
                print(tokenwrap(w1 + "_" + w2 for w1, w2 in ranked_contexts))
        except ValueError as e:
            print(e)

    def dispersion_plot(self, words):
        """
        Produce a plot showing the distribution of the words through the text.
        Requires pylab to be installed.

        :param words: The words to be plotted
        :type word: str
        :seealso: nltk.draw.dispersion_plot()
        """
        from nltk.draw import dispersion_plot
        dispersion_plot(self, words)

    def plot(self, *args):
        """
        See documentation for FreqDist.plot()
        :seealso: nltk.prob.FreqDist.plot()
        """
        self.vocab().plot(*args)

    def vocab(self):
        """
        :seealso: nltk.prob.FreqDist
        """
        if "_vocab" not in self.__dict__:
            print("Building vocabulary index...")
            self._vocab = FreqDist(self)
        return self._vocab

    def findall(self, regexp):
        """
        Find instances of the regular expression in the text.
        The text is a list of tokens, and a regexp pattern to match
        a single token must be surrounded by angle brackets.  E.g.

        >>> print('hack'); from nltk.book import text1, text5, text9
        hack...
        >>> text5.findall("<.*><.*><bro>")
        you rule bro; telling you bro; u twizted bro
        >>> text1.findall("<a>(<.*>)<man>")
        monied; nervous; dangerous; white; white; white; pious; queer; good;
        mature; white; Cape; great; wise; wise; butterless; white; fiendish;
        pale; furious; better; certain; complete; dismasted; younger; brave;
        brave; brave; brave
        >>> text9.findall("<th.*>{3,}")
        thread through those; the thought that; that the thing; the thing
        that; that that thing; through these than through; them that the;
        through the thick; them that they; thought that the

        :param regexp: A regular expression
        :type regexp: str
        """
        if "_token_searcher" not in self.__dict__:
            self._token_searcher = TokenSearcher(self)

        hits = self._token_searcher.findall(regexp)
        hits = [' '.join(h) for h in hits]
        print(tokenwrap(hits, "; "))

    #////////////////////////////////////////////////////////////
    # Helper Methods
    #////////////////////////////////////////////////////////////

    _CONTEXT_RE = re.compile('\w+|[\.\!\?]')

    def _context(self, tokens, i):
        """
        One left & one right token, both case-normalized.  Skip over
        non-sentence-final punctuation.  Used by the ``ContextIndex``
        that is created for ``similar()`` and ``common_contexts()``.
        """
        # Left context
        j = i - 1
        while j >= 0 and not self._CONTEXT_RE.match(tokens[j]):
            j -= 1
        left = (tokens[j] if j != 0 else '*START*')

        # Right context
        j = i + 1
        while j < len(tokens) and not self._CONTEXT_RE.match(tokens[j]):
            j += 1
        right = (tokens[j] if j != len(tokens) else '*END*')

        return (left, right)

    #////////////////////////////////////////////////////////////
    # String Display
    #////////////////////////////////////////////////////////////

    def __str__(self):
        return '<Text: %s>' % self.name

    def __repr__(self):
        return '<Text: %s>' % self.name
# Tokens contains the words for Genesis and Reuters Trade
#tokens = list(genesis.words('english-kjv.txt'))
#tokens.extend(list(reuters.words(categories = 'trade')))

# estimator for smoothing the N-gram model
estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

sent = "abraham lincoln be bear feb 12 1809"
tokens = sent.split()
# split the sentence into overlapping character 3-grams
splitNgrams = list(ingrams(list(sent), 3))
tokens = ["".join(x) for x in splitNgrams]

# N-gram language model with 3-grams
# Without an estimator, it assumes Good-Turing.
model = NgramModel(3, tokens, estimator)
print "Model: " + str(model)

sent2 = "abe lincoln was born in 1809"
splitNgrams2 = list(ingrams(list(sent2), 3))
tokens2 = ["".join(x) for x in splitNgrams2]

print "Word: " + tokens2[-1]
context = " ".join(tokens2[:-1])
print "Context: " + context
# prob() expects the preceding tokens as a sequence, not the raw sentence string
print model.prob(tokens2[-1], tokens2[-3:-1])
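# A hedged sketch (not part of the original) showing how one might score a whole
# string under the character-3-gram model above; it assumes the old NgramModel
# API, where logprob(word, context) returns -log2 of the probability, and reuses
# `ingrams` and `model` from the snippet. The helper name is made up here.
def string_neg_logprob(model, s, n=3):
    grams = ["".join(g) for g in ingrams(list(s), n)]
    total = 0.0
    # score each character n-gram given the two preceding ones (the trigram
    # model's full context); the first two grams lack a full context and are skipped
    for i in range(2, len(grams)):
        total += model.logprob(grams[i], grams[i - 2:i])
    return total

# print string_neg_logprob(model, "abe lincoln was born in 1809")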
def process_plaintext(dir_path):
    reader = CategorizedPlaintextCorpusReader(dir_path, r'.*\.txt',
                                              cat_pattern=r'.+_.+_(.*)\.txt')
    facilitator_files = reader.fileids(categories='facilitator')
    participant_files = reader.fileids(categories='participant')
    print facilitator_files, participant_files
    #print reader.categories()
    #print len(reader.words())
    #print len(reader.sents())

    fac_words = [word for word in reader.words(facilitator_files)]
    par_words = [word for word in reader.words(participant_files)]
    fac_words = edit_tokens(fac_words)
    par_words = edit_tokens(par_words)

    speakers = ([(word, 'facilitator') for word in reader.words(facilitator_files)] +
                [(word, 'participant') for word in reader.words(participant_files)])

    features = get_features(speakers)
    size = int(len(features) * 0.3)
    nb_train = features[size:]
    nb_test = features[:size]
    classifier = nltk.NaiveBayesClassifier.train(nb_train)
    print "Classifier labels:", classifier.labels()
    print classifier.show_most_informative_features()
    print "Classify test:", nltk.classify.accuracy(classifier, nb_test)
    #print classifier.classify(get_features(["Yolo", "bag", "sp"], False))

    #random.shuffle(speakers)
    three_quarters = int(len(speakers) * 0.75)
    train = speakers[:three_quarters]
    test = speakers[three_quarters:]

    est = lambda fdist, bins: nltk.probability.LaplaceProbDist(fdist)
    un_lm = NgramModel(1, train, estimator=est)
    bi_lm = NgramModel(2, train, estimator=est)
    tr_lm = NgramModel(3, train, estimator=est)
    qu_lm = NgramModel(4, train, estimator=est)
    pe_lm = NgramModel(5, train, estimator=est)
    print un_lm
    print bi_lm
    print tr_lm
    print qu_lm
    print pe_lm

    print "1 gram Perplexity:", un_lm.perplexity(test)
    print "2 gram Perplexity:", bi_lm.perplexity(test)
    print "3 gram Perplexity:", tr_lm.perplexity(test)
    print "4 gram Perplexity:", qu_lm.perplexity(test)
    print "5 gram Perplexity:", pe_lm.perplexity(test)

    print bi_lm.generate(10, ["uh", "sp"])

    fd_fac = nltk.FreqDist(fac_words)
    vocab_fac = fd_fac.keys()
    fd_par = nltk.FreqDist(par_words)
    vocab_par = fd_par.keys()
    print "Fac Vocab: ", len(vocab_fac)
    print "Fac Tokens: ", len(fac_words)
    print vocab_fac[:20]
    print "Par Vocab: ", len(vocab_par)
    print "Par Tokens: ", len(par_words)
    print vocab_par[:20]
    fd_par.plot(50)
class MachineTranslation:

    PUNCTUATION = [',', '.', '(', ')', '?']
    ENG_ADJECTIVE = ['JJ', 'JJR', 'JJS']
    ENG_NOUN = ['NN', 'NNS', 'NNP', 'NNPS']
    ENG_VERB = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    ESP_ADJECTIVE = ['a', 'q', 'o', '0', 'c', 's', 'f', 'p', 'n']
    ESP_NOUN = ['n']
    ESP_VERB = ['vm', 'vs']
    ESP_VERB_PAST = ['vmii', 'vmis', 'vsii', 'vsis']
    NUMBER_PAT = "\d+"
    OPEN_QUESTION_MARK = '\xc2\xbf'

    def __init__(self):
        cess_sents = cess.tagged_sents()
        self.uni_tag = ut(cess_sents)
        self.model = NgramModel(3, brown.words())
        self.translation = []
        self.dictionary = collections.defaultdict(lambda: 0)
        dictionaryFile = open("../corpus/Dictionary.txt", 'r')
        for translation in dictionaryFile:
            spanish, english = translation.split(" - ")
            spanish = spanish.decode('utf-8')
            self.dictionary[spanish] = collections.defaultdict(lambda: [])
            english = english.rstrip(';\n').split('; ')
            for pos in english:
                pos = pos.split(': ')
                self.dictionary[spanish][pos[0]] = pos[1].split(', ')
        self.sentences = []
        sentencesFile = open("../corpus/TestSet.txt", 'r')
        for sentence in sentencesFile:
            self.sentences.append(sentence.rstrip('\n'))

    def translate(self):
        for sentence in self.sentences:
            sentenceTranslation = []
            questionSwapped = sentence
            if sentence.startswith(self.OPEN_QUESTION_MARK):
                questionSwapped = self.questionSwap(sentence)
            negationSwapped = self.negationSwap(questionSwapped)
            tokens = nltk.word_tokenize(negationSwapped)
            pos = self.uni_tag.tag(tokens)
            for word in pos:
                candidate = word[0].decode('utf-8').lower()
                # print candidate
                if candidate in self.PUNCTUATION or re.search(self.NUMBER_PAT, candidate):
                    wordTranslation = candidate
                elif (word[1] and any(word[1].startswith(adj) for adj in self.ESP_ADJECTIVE)
                      and 'adjective' in self.dictionary[candidate]):
                    wordTranslation = self.dictionary[candidate]['adjective'][0]
                elif (word[1] and any(word[1].startswith(noun) for noun in self.ESP_NOUN)
                      and 'noun' in self.dictionary[candidate]):
                    wordTranslation = self.dictionary[candidate]['noun'][0]
                    if word[1][1] == 'p':  # proper noun
                        wordTranslation = wordTranslation.capitalize()
                elif (word[1] and any(word[1].startswith(verb) for verb in self.ESP_VERB)
                      and 'verb' in self.dictionary[candidate]):
                    #wordTranslation = self.verbConjugation(candidate, word)
                    wordTranslation = self.pluralADJ(candidate)
                else:
                    wordTranslation = self.pluralADJ(candidate)
                sentenceTranslation.append(wordTranslation)
            directTranslation = " ".join(map(str, sentenceTranslation))
            adjNounSwapped = self.adjNounSwap(directTranslation)
            lm = self.ngram(adjNounSwapped)
            nounSwapped = self.nounSwap(lm)
            pronounAdded = self.addPronoun(nounSwapped)
            possessives = self.possessive(pronounAdded)
            removedDeterminers = self.removeDeterminers(possessives)
            capAndNum = self.capitalizationAndNumbers(removedDeterminers)
            removeExtraSpace = re.sub(r' \'s', '\'s', capAndNum)
            removeExtraSpace = re.sub(r' ,', ',', removeExtraSpace)
            if removeExtraSpace[-2:] == " .":
                removeExtraSpace = removeExtraSpace[:-2] + "."
            elif removeExtraSpace[-2:] == " ?":
                removeExtraSpace = removeExtraSpace[:-2] + "?"
            self.translation.append(removeExtraSpace)

    # if question is a yes or no question, swap the order of first two words
    def questionSwap(self, sentence):
        sentence = sentence.lstrip(self.OPEN_QUESTION_MARK)
        #tokens = nltk.word_tokenize(sentence)
        #pos = self.uni_tag.tag(tokens)
        #return " ".join(map(str, tokens))
        return sentence

    # reverse the order of negation words and their objects
    def negationSwap(self, sentence):
        tokens = nltk.word_tokenize(sentence)
        pos = self.uni_tag.tag(tokens)
        firstWord = pos[0]
        for i, word in enumerate(pos[1:]):
            if (firstWord[0].lower() == "no" and word[1] is not None
                    and (word[1].startswith('vs') or word[1].startswith('vm'))):
                tokens[i] = tokens[i + 1]
                tokens[i + 1] = "not"
            firstWord = word
        firstWord = pos[0]
        secondWord = pos[1]
        for i, word in enumerate(pos[2:]):
            if (firstWord[0].lower() == "no" and secondWord[1] is not None
                    and secondWord[1].startswith('pp')):
                if word[1] is not None and (word[1].startswith('vs') or word[1].startswith('vm')):
                    temp = tokens[i]
                    tokens[i] = tokens[i + 1]
                    tokens[i + 1] = "do " + temp
            firstWord = secondWord
            secondWord = word
        return " ".join(map(str, tokens))

    # switch position of possessive words to use apostrophe notation
    def possessive(self, sentence):
        tokens = nltk.word_tokenize(sentence)
        pos = nltk.pos_tag(tokens)
        removeOf = []
        firstWord = pos[0]
        secondWord = pos[1]
        for i, word in enumerate(pos[2:]):
            if firstWord[1] in self.ENG_NOUN and secondWord[0] == 'of' and word[1] in ['NNP', 'NNPS']:
                temp = tokens[i]
                tokens[i] = tokens[i + 2] + "'s"
                tokens[i + 2] = temp
                removeOf.append(i + 1)
            firstWord = secondWord
            secondWord = word
        if len(removeOf) != 0:
            for i in reversed(removeOf):
                tokens.pop(i)
        return " ".join(map(str, tokens))

    # fixes the "number of telephone" to "telephone number" example
    def nounSwap(self, sentence):
        tokens = nltk.word_tokenize(sentence)
        pos = nltk.pos_tag(tokens)
        removeOf = []
        firstWord = pos[0]
        secondWord = pos[1]
        for i, word in enumerate(pos[2:]):
            if firstWord[1] in ['NN', 'NNS'] and secondWord[0] == 'of' and word[1] in ['NN', 'NNS']:
                temp = tokens[i]
                tokens[i] = tokens[i + 2]
                tokens[i + 2] = temp
                removeOf.append(i + 1)
            firstWord = secondWord
            secondWord = word
        if len(removeOf) != 0:
            for i in reversed(removeOf):
                tokens.pop(i)
        return " ".join(map(str, tokens))

    def ngram(self, sentence):
        words = ['your', 'its', 'his', 'her', 'their']
        highestProb = 0
        highestSentence = sentence
        for word in words:
            candidateSentence = re.sub('your', word, sentence)
            prob = self.model.prob(word, [candidateSentence])
            if prob > highestProb:
                highestProb = prob
                highestSentence = candidateSentence
        return highestSentence

    # reverses order of adjacent adjectives and nouns
    def adjNounSwap(self, sentence):
        tokens = nltk.word_tokenize(sentence)
        pos = nltk.pos_tag(tokens)
        firstWord = pos[0]
        for i, word in enumerate(pos[1:]):
            if firstWord[1] in self.ENG_NOUN and word[1] in self.ENG_ADJECTIVE:
                temp = tokens[i]
                tokens[i] = tokens[i + 1]
                tokens[i + 1] = temp
            firstWord = word
        return " ".join(map(str, tokens))

    def addPronoun(self, sentence):
        tokens = nltk.word_tokenize(sentence)
        pos = nltk.pos_tag(tokens)
        firstWord = pos[0]
        for i, word in enumerate(pos[1:]):
            if (firstWord[1] not in self.ENG_NOUN and firstWord[0] != 'have'
                    and firstWord[1] not in ['DT', 'TO', 'WP', 'RB', 'PRP', 'VBZ', '.', ',']
                    and (word[1] in self.ENG_VERB or word[0] == 'have')):
                if firstWord[1] == 'VBP' or (wordnet.synsets(word[0]) and not word[0].endswith('s')):
                    tokens[i + 1] = "they " + tokens[i + 1]
                else:
                    tokens[i + 1] = "it " + tokens[i + 1]
            firstWord = word
        if pos[0][1] in self.ENG_VERB or pos[0][0] == 'have':
            if pos[0][1] == 'VBP' or (wordnet.synsets(word[0]) and not word[0].endswith('s')):
                tokens[0] = "They " + tokens[0]
            else:
                tokens[0] = "It " + tokens[0]
        return " ".join(map(str, tokens))

    def pluralADJ(self, token):
        translation = self.dictionary[token]['default'][0]
        pos = self.uni_tag.tag(nltk.word_tokenize(token))
        if pos[0][1] is not None and pos[0][1].startswith('a') and 'p' in pos[0][1]:
            if translation.endswith('s'):
                if wordnet.synsets(translation[:-1]):
                    translation = translation[:-1]
        return translation

    def removeDeterminers(self, sentence):
        tokens = nltk.word_tokenize(sentence)
        pos = nltk.pos_tag(tokens)
        removeOf = []
        firstWord = pos[0]
        for i, word in enumerate(pos[1:]):
            if firstWord[1] in ['DT'] and word[1] in ['NNP', 'NNPS', 'NNS']:
                removeOf.append(i)
            firstWord = word
        if len(removeOf) != 0:
            for i in reversed(removeOf):
                tokens.pop(i)
        return " ".join(map(str, tokens))

    def capitalizationAndNumbers(self, sentence):
        tokens = nltk.word_tokenize(sentence)
        tokens[0] = tokens[0].capitalize()
        pos = nltk.pos_tag(tokens)
        days = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday',
                'saturday', 'sunday']
        months = ['january', 'february', 'march', 'april', 'may', 'june',
                  'july', 'august', 'september', 'october', 'november', 'december']
        for i, word in enumerate(pos):
            if word[1] in ['NNP', 'NNPS']:
                tokens[i] = tokens[i].capitalize()
            if word[1] in ['CD']:
                tokens[i] = re.sub(r'\.', ',', tokens[i])
            if word[0] in days or word[0] in months:
                tokens[i] = tokens[i].capitalize()
        newTokens = []
        for i, token in enumerate(tokens):
            if token.lower() == 'a' and i + 1 < len(tokens):
                if any(tokens[i + 1].startswith(vp) for vp in ['a', 'e', 'i', 'o', 'u']):
                    if i == 0:
                        newTokens.append('An')
                    else:
                        newTokens.append('an')
                else:
                    newTokens.append(token)
            else:
                newTokens.append(token)
        return " ".join(map(str, newTokens))

    def verbConjugation(self, candidate, word):
        wordTranslation = en.verb.present(self.dictionary[candidate]['verb'][0],
                                          person=word[1][4])
        if word[1][2] == 'p':
            wordTranslation = en.verb.present_participle(self.dictionary[candidate]['verb'][0])
        if word[1][3] == 's':
            wordTranslation = en.verb.past(self.dictionary[candidate]['verb'][0],
                                           person=word[1][4])
            if word[1][2] == 'p':
                wordTranslation = en.verb.past_participle(self.dictionary[candidate]['verb'][0])
        return wordTranslation
import nltk print("... build") brown = nltk.corpus.brown corpus = [word.lower() for word in brown.words()] # Train on 95% f the corpus and test on the rest spl = 95*len(corpus)/100 train = corpus[:spl] test = corpus[spl:] # Remove rare words from the corpus fdist = nltk.FreqDist(w for w in train) vocabulary = set(map(lambda x: x[0], filter(lambda x: x[1] >= 5, fdist.iteritems()))) train = map(lambda x: x if x in vocabulary else "*unknown*", train) test = map(lambda x: x if x in vocabulary else "*unknown*", test) print("... train") from nltk.model import NgramModel from nltk.probability import LidstoneProbDist estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2) lm = NgramModel(5, train, estimator=estimator) print("len(corpus) = %s, len(vocabulary) = %s, len(train) = %s, len(test) = %s" % (len(corpus), len(vocabulary), len(train), len(test))) print("perplexity(test) =", lm.perplexity(test))
spans = [span for span in training_spans]
training_offsets = [span[0] for span in spans]
train = []
for s in spans:
    train.append(training_raw[s[0]:s[1]])

testing_spans = WhitespaceTokenizer().span_tokenize(testing_raw)
spans = [span for span in testing_spans]
testing_offsets = [span[0] for span in spans]
test = []
for s in spans:
    test.append(testing_raw[s[0]:s[1]])

estimator = lambda fdist, bins: LidstoneProbDist(fdist, args.estimator_probability)
lm = NgramModel(args.num_grams, train, estimator=estimator)

t0 = 0
t1 = 1
current_best = ''
while t1 < len(test):
    perplexity = lm.perplexity(test[t0:t1])
    if perplexity > args.cutoff_max_perplexity:
        if (len(current_best) > 1):
            print current_best + '.'
        current_best = ''
        t0 = t1 + 1
        t1 = t0 + 1
    else:
        t1 += 1
        if t1 - t0 > args.min_sentence_length and perplexity < args.output_max_perplexity:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import LineTokenizer
from nltk.model import NgramModel
from nltk.probability import LidstoneProbDist
import pickle

corpus_root = './data'
fileids = 'data_title'
example = ["Python", "is"]

corpus = PlaintextCorpusReader(corpus_root, fileids,
                               sent_tokenizer=LineTokenizer(),
                               encoding='utf-8')

est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
lm = NgramModel(3, corpus.words(), estimator=est)

# generate n words seeded with the example context and join them into a sentence
sent = lambda n, example: ' '.join(lm.generate(n, example))

print "Let's make a sentence!!!"
print "give a seed : Python is ..."
print "sentence :"
print sent(5, example)
#!/usr/bin/env python
import nltk
from nltk import bigrams
from nltk import trigrams
from nltk.probability import LidstoneProbDist
from nltk.model import NgramModel

with open('./austen/persuasion.txt', 'r') as training_file:
    raw = training_file.read()
tokens = nltk.word_tokenize(raw)

with open('./austen/sense_and_sensibility.txt', 'r') as test_file:
    test = test_file.read()
test_list = nltk.word_tokenize(test)

estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

model = NgramModel(3, tokens, True, False, estimator)
tri = model.entropy(test_list)
print "tri-gram: " + str(tri)

model = NgramModel(2, tokens, True, False, estimator)
bi = model.entropy(test_list)
print "bi-gram: " + str(bi)
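# A compact variant of the comparison above (same assumptions about the old
# NgramModel API; purely illustrative): loop over the orders instead of
# repeating the training code.
for n in (2, 3):
    lm_n = NgramModel(n, tokens, True, False, estimator)
    print str(n) + "-gram: " + str(lm_n.entropy(test_list))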
    # Read file
    f = io.open('/veu4/usuaris30/speech00/corpus/train/spanishlit_ninc_v' + version +
                '_nlm/' + categ + '.txt', encoding='utf8')
    g = f.read().lower()

    # Obtain tokenized words
    train = nltk.word_tokenize(g)
    print "e"

    # Remove rare words from the corpus
    # fdist = nltk.FreqDist(w for w in train)
    # vocabulary = set(map(lambda x: x[0], filter(lambda x: x[1] >= 5, fdist.iteritems())))
    # train1 = map(lambda x: x if x in vocabulary else "*unknown*", train)

    # Obtain the Language Model using WittenBellProbDist to smooth unseen events
    estimator = lambda fdist, bins: WittenBellProbDist(fdist, 10)
    lm[categ] = NgramModel(N, train, estimator=estimator)
    print "> Obtain language model of", categ, "... Done!"

print "> Obtain all language models... Done!"

# Load dictionary with: {category:tests}
n_categ = []
test_corpus = dict()
for categ in all_categ:
    files = os.listdir('/veu4/usuaris30/speech00/corpus/testc/spanishlit_ninc_v' +
                       version + '_nlm/' + categ)
    n_categ.append(len(files))
    tests = []
    for fi in files:
        f = io.open('/veu4/usuaris30/speech00/corpus/testc/spanishlit_ninc_v' +
# Presumed imports for this snippet (not in the original excerpt): `ut` is taken
# to be nltk.UnigramTagger, `cess` the Spanish CESS-ESP corpus, and `en` the
# NodeBox Linguistics English module used by verbConjugation().
import re
import collections
import nltk
from nltk import UnigramTagger as ut
from nltk.corpus import cess_esp as cess, brown, wordnet
from nltk.model import NgramModel
import en


class MachineTranslation:
    PUNCTUATION = [',', '.', '(', ')', '?']
    ENG_ADJECTIVE = ['JJ', 'JJR', 'JJS']
    ENG_NOUN = ['NN', 'NNS', 'NNP', 'NNPS']
    ENG_VERB = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    ESP_ADJECTIVE = ['a', 'q', 'o', '0', 'c', 's', 'f', 'p', 'n']
    ESP_NOUN = ['n']
    ESP_VERB = ['vm', 'vs']
    ESP_VERB_PAST = ['vmii', 'vmis', 'vsii', 'vsis']
    NUMBER_PAT = "\d+"
    OPEN_QUESTION_MARK = '\xc2\xbf'

    def __init__(self):
        # Spanish unigram tagger trained on CESS-ESP and an English trigram LM on Brown
        cess_sents = cess.tagged_sents()
        self.uni_tag = ut(cess_sents)
        self.model = NgramModel(3, brown.words())
        self.translation = []
        # Bilingual dictionary: Spanish word -> {part of speech -> [English candidates]}.
        # The translation code assumes every content word of the test set appears here.
        self.dictionary = collections.defaultdict(lambda: 0)
        dictionaryFile = open("../corpus/Dictionary.txt", 'r')
        for translation in dictionaryFile:
            spanish, english = translation.split(" - ")
            spanish = spanish.decode('utf-8')
            self.dictionary[spanish] = collections.defaultdict(lambda: [])
            english = english.rstrip(';\n').split('; ')
            for pos in english:
                pos = pos.split(': ')
                self.dictionary[spanish][pos[0]] = pos[1].split(', ')
        self.sentences = []
        sentencesFile = open("../corpus/TestSet.txt", 'r')
        for sentence in sentencesFile:
            self.sentences.append(sentence.rstrip('\n'))

    def translate(self):
        for sentence in self.sentences:
            sentenceTranslation = []
            questionSwapped = sentence
            if sentence.startswith(self.OPEN_QUESTION_MARK):
                questionSwapped = self.questionSwap(sentence)
            negationSwapped = self.negationSwap(questionSwapped)
            tokens = nltk.word_tokenize(negationSwapped)
            pos = self.uni_tag.tag(tokens)
            # Word-by-word direct translation guided by the Spanish POS tag
            for word in pos:
                candidate = word[0].decode('utf-8').lower()
                # print candidate
                if candidate in self.PUNCTUATION or re.search(self.NUMBER_PAT, candidate):
                    wordTranslation = candidate
                elif (word[1] and any(word[1].startswith(adj) for adj in self.ESP_ADJECTIVE)
                        and 'adjective' in self.dictionary[candidate]):
                    wordTranslation = self.dictionary[candidate]['adjective'][0]
                elif (word[1] and any(word[1].startswith(noun) for noun in self.ESP_NOUN)
                        and 'noun' in self.dictionary[candidate]):
                    wordTranslation = self.dictionary[candidate]['noun'][0]
                    if word[1][1] == 'p':  # proper noun
                        wordTranslation = wordTranslation.capitalize()
                elif (word[1] and any(word[1].startswith(verb) for verb in self.ESP_VERB)
                        and 'verb' in self.dictionary[candidate]):
                    #wordTranslation = self.verbConjugation(candidate, word)
                    wordTranslation = self.pluralADJ(candidate)
                else:
                    wordTranslation = self.pluralADJ(candidate)
                sentenceTranslation.append(wordTranslation)
            directTranslation = " ".join(map(str, sentenceTranslation))
            # Post-editing passes over the direct translation
            adjNounSwapped = self.adjNounSwap(directTranslation)
            lm = self.ngram(adjNounSwapped)
            nounSwapped = self.nounSwap(lm)
            pronounAdded = self.addPronoun(nounSwapped)
            possessives = self.possessive(pronounAdded)
            removedDeterminers = self.removeDeterminers(possessives)
            capAndNum = self.capitalizationAndNumbers(removedDeterminers)
            removeExtraSpace = re.sub(r' \'s', '\'s', capAndNum)
            removeExtraSpace = re.sub(r' ,', ',', removeExtraSpace)
            if removeExtraSpace[-2:] == " .":
                removeExtraSpace = removeExtraSpace[:-2] + "."
            elif removeExtraSpace[-2:] == " ?":
                removeExtraSpace = removeExtraSpace[:-2] + "?"
            self.translation.append(removeExtraSpace)

    # if question is a yes or no question, swap the order of first two words
    def questionSwap(self, sentence):
        sentence = sentence.lstrip(self.OPEN_QUESTION_MARK)
        #tokens = nltk.word_tokenize(sentence)
        #pos = self.uni_tag.tag(tokens)
        #return " ".join(map(str, tokens))
        return sentence

    # reverse the order of negation words and their objects
    def negationSwap(self, sentence):
        tokens = nltk.word_tokenize(sentence)
        pos = self.uni_tag.tag(tokens)
        firstWord = pos[0]
        for i, word in enumerate(pos[1:]):
            if firstWord[0].lower() == "no" and word[1] is not None and (
                    word[1].startswith('vs') or word[1].startswith('vm')):
                tokens[i] = tokens[i + 1]
                tokens[i + 1] = "not"
            firstWord = word
        firstWord = pos[0]
        secondWord = pos[1]
        for i, word in enumerate(pos[2:]):
            if firstWord[0].lower() == "no" and secondWord[1] is not None and secondWord[1].startswith('pp'):
                if word[1] is not None and (word[1].startswith('vs') or word[1].startswith('vm')):
                    temp = tokens[i]
                    tokens[i] = tokens[i + 1]
                    tokens[i + 1] = "do " + temp
            firstWord = secondWord
            secondWord = word
        return " ".join(map(str, tokens))

    # switch position of possessive words to use apostrophe notation
    def possessive(self, sentence):
        tokens = nltk.word_tokenize(sentence)
        pos = nltk.pos_tag(tokens)
        removeOf = []
        firstWord = pos[0]
        secondWord = pos[1]
        for i, word in enumerate(pos[2:]):
            if firstWord[1] in self.ENG_NOUN and secondWord[0] == 'of' and word[1] in ['NNP', 'NNPS']:
                temp = tokens[i]
                tokens[i] = tokens[i + 2] + "'s"
                tokens[i + 2] = temp
                removeOf.append(i + 1)
            firstWord = secondWord
            secondWord = word
        if len(removeOf) != 0:
            for i in reversed(removeOf):
                tokens.pop(i)
        return " ".join(map(str, tokens))

    # fixes the "number of telephone" to "telephone number" example
    def nounSwap(self, sentence):
        tokens = nltk.word_tokenize(sentence)
        pos = nltk.pos_tag(tokens)
        removeOf = []
        firstWord = pos[0]
        secondWord = pos[1]
        for i, word in enumerate(pos[2:]):
            if firstWord[1] in ['NN', 'NNS'] and secondWord[0] == 'of' and word[1] in ['NN', 'NNS']:
                temp = tokens[i]
                tokens[i] = tokens[i + 2]
                tokens[i + 2] = temp
                removeOf.append(i + 1)
            firstWord = secondWord
            secondWord = word
        if len(removeOf) != 0:
            for i in reversed(removeOf):
                tokens.pop(i)
        return " ".join(map(str, tokens))

    # pick the possessive determiner that the Brown trigram model scores highest
    # (the whole candidate sentence string is passed as the context)
    def ngram(self, sentence):
        words = ['your', 'its', 'his', 'her', 'their']
        highestProb = 0
        highestSentence = sentence
        for word in words:
            candidateSentence = re.sub('your', word, sentence)
            prob = self.model.prob(word, [candidateSentence])
            if prob > highestProb:
                highestProb = prob
                highestSentence = candidateSentence
        return highestSentence

    # reverses order of adjacent adjectives and nouns
    def adjNounSwap(self, sentence):
        tokens = nltk.word_tokenize(sentence)
        pos = nltk.pos_tag(tokens)
        firstWord = pos[0]
        for i, word in enumerate(pos[1:]):
            if firstWord[1] in self.ENG_NOUN and word[1] in self.ENG_ADJECTIVE:
                temp = tokens[i]
                tokens[i] = tokens[i + 1]
                tokens[i + 1] = temp
            firstWord = word
        return " ".join(map(str, tokens))

    # insert a dummy subject pronoun ("it"/"they") before verbs that lack one
    def addPronoun(self, sentence):
        tokens = nltk.word_tokenize(sentence)
        pos = nltk.pos_tag(tokens)
        firstWord = pos[0]
        for i, word in enumerate(pos[1:]):
            if firstWord[1] not in self.ENG_NOUN and firstWord[0] != 'have' and firstWord[1] not in [
                    'DT', 'TO', 'WP', 'RB', 'PRP', 'VBZ', '.', ','
            ] and (word[1] in self.ENG_VERB or word[0] == 'have'):
                if firstWord[1] == 'VBP' or (wordnet.synsets(word[0]) and not word[0].endswith('s')):
                    tokens[i + 1] = "they " + tokens[i + 1]
                else:
                    tokens[i + 1] = "it " + tokens[i + 1]
            firstWord = word
        if pos[0][1] in self.ENG_VERB or pos[0][0] == 'have':
            if pos[0][1] == 'VBP' or (wordnet.synsets(word[0]) and not word[0].endswith('s')):
                tokens[0] = "They " + tokens[0]
            else:
                tokens[0] = "It " + tokens[0]
        return " ".join(map(str, tokens))

    # default translation; singularise a plural adjective when the singular form exists
    def pluralADJ(self, token):
        translation = self.dictionary[token]['default'][0]
        pos = self.uni_tag.tag(nltk.word_tokenize(token))
        if pos[0][1] is not None and pos[0][1].startswith('a') and 'p' in pos[0][1]:
            if translation.endswith('s'):
                if wordnet.synsets(translation[:-1]):
                    translation = translation[:-1]
        return translation

    def removeDeterminers(self, sentence):
        tokens = nltk.word_tokenize(sentence)
        pos = nltk.pos_tag(tokens)
        removeOf = []
        firstWord = pos[0]
        for i, word in enumerate(pos[1:]):
            if firstWord[1] in ['DT'] and word[1] in ['NNP', 'NNPS', 'NNS']:
                removeOf.append(i)
            firstWord = word
        if len(removeOf) != 0:
            for i in reversed(removeOf):
                tokens.pop(i)
        return " ".join(map(str, tokens))

    # capitalise proper nouns, days and months; fix number punctuation and a/an
    def capitalizationAndNumbers(self, sentence):
        tokens = nltk.word_tokenize(sentence)
        tokens[0] = tokens[0].capitalize()
        pos = nltk.pos_tag(tokens)
        days = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
        months = ['january', 'february', 'march', 'april', 'may', 'june',
                  'july', 'august', 'september', 'october', 'november', 'december']
        for i, word in enumerate(pos):
            if word[1] in ['NNP', 'NNPS']:
                tokens[i] = tokens[i].capitalize()
            if word[1] in ['CD']:
                tokens[i] = re.sub(r'\.', ',', tokens[i])
            if word[0] in days or word[0] in months:
                tokens[i] = tokens[i].capitalize()
        newTokens = []
        for i, token in enumerate(tokens):
            if token.lower() == 'a' and i + 1 < len(tokens):
                if any(tokens[i + 1].startswith(vp) for vp in ['a', 'e', 'i', 'o', 'u']):
                    if i == 0:
                        newTokens.append('An')
                    else:
                        newTokens.append('an')
                else:
                    newTokens.append(token)
            else:
                newTokens.append(token)
        return " ".join(map(str, newTokens))

    # conjugate the dictionary verb using the fields of the Spanish POS tag
    def verbConjugation(self, candidate, word):
        wordTranslation = en.verb.present(self.dictionary[candidate]['verb'][0], person=word[1][4])
        if word[1][2] == 'p':
            wordTranslation = en.verb.present_participle(self.dictionary[candidate]['verb'][0])
        if word[1][3] == 's':
            wordTranslation = en.verb.past(self.dictionary[candidate]['verb'][0], person=word[1][4])
            if word[1][2] == 'p':
                wordTranslation = en.verb.past_participle(self.dictionary[candidate]['verb'][0])
        return wordTranslation
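# Added usage sketch (assumed, not part of the original): a minimal driver for
# the pipeline above; it relies only on the corpus paths already hard-coded in
# __init__ ("../corpus/Dictionary.txt" and "../corpus/TestSet.txt").
if __name__ == "__main__":
    mt = MachineTranslation()      # loads the tagger, Brown trigram LM, dictionary and test set
    mt.translate()                 # fills mt.translation, one English line per input sentence
    for line in mt.translation:
        print line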
import sys

from nltk.corpus import brown
from nltk.probability import LidstoneProbDist
from nltk.model import NgramModel
from nltk.tokenize import word_tokenize, wordpunct_tokenize  # Tokenizer

if __name__ == "__main__":
    # add language
    tTwit = list(brown.words())
    # tTwit.extend(list(cess_cat.words()))

    # estimator for smoothing the N-gram model
    estimator1 = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    tokens1 = list(brown.words())
    # N-gram language model with 3-grams
    model = NgramModel(3, tokens1, estimator=estimator1)

    twit = sys.argv[1]
    # posVars = sys.argv[2]
    # pos = sys.argv[3]
    posVars = list()
    tmpVars = list()
    for i in range(2, len(sys.argv)):
        posVars.append(sys.argv[i])
        # print sys.argv[i]

    tTwit = word_tokenize(twit)
    # tokens2 = word_tokenize(posVars)
    # print 'twit ' + ' '.join(tTwit)
    print "posVars " + " ".join(posVars)
# Import the corpus and functions used from nltk library
from nltk.corpus import brown
from nltk.corpus import genesis
from nltk.probability import LidstoneProbDist
from nltk.model import NgramModel

# Tokens contains the words for Genesis and Reuters Trade
#tokens = list(genesis.words('english-kjv.txt'))
#tokens.extend(list(reuters.words(categories = 'trade')))

# estimator for smoothing the N-gram model
estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

# N-gram language model with 3-grams
#model = NgramModel(3, tokens, estimator)
model = NgramModel(3, brown.words(categories='news'), estimator)
#model = NgramModel(3, tokens)

# Apply the language model to generate 50 words in sequence
text_words = model.generate(50)

# Concatenate all words generated in a string separating them by a space.
text = ' '.join([word for word in text_words])

# print the text
print text

print model.prob('repayments', ['international', 'debt'])
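# Added sketch (not in the original): the same trigram model can also rank
# alternative continuations of a two-word context; the candidate words below are
# illustrative and assumed to occur in the Brown "news" vocabulary.
context = ['international', 'debt']
for candidate in ['repayments', 'payments', 'crisis']:
    print candidate, model.prob(candidate, context)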
# Fragment: the enclosing loop over sentences of the useful-reviews corpus and
# the definitions of char_list, NVAL and the file constants begin above this excerpt.
    for item in sent:
        item = item.lower()
        for entry in list(item):
            char_list.append(entry)
        char_list.append(' ')
myCorpus = char_list

# Remove rare words from the corpus
fdist = nltk.FreqDist(w for w in myCorpus)
vocabulary = set(map(lambda x: x[0], filter(lambda x: x[1] >= 5, fdist.iteritems())))
myCorpus = map(lambda x: x if x in vocabulary else "*unknown*", myCorpus)

estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
lm_useful = NgramModel(NVAL, myCorpus, estimator=estimator)
print "Useful reviews model complete"

myCorpusReader = nltk.corpus.reader.PlaintextCorpusReader(TRAIN_DATA_PATH, NOTUSEFUL_REVIEWS_FILE)
myCorpus = [word.lower() for word in myCorpusReader.words()]
for sent in myCorpusReader.sents():
    for item in sent:
        item = item.lower()
        for entry in list(item):
            char_list.append(entry)
        char_list.append(' ')
myCorpus = char_list

# Remove rare words from the corpus
fdist = nltk.FreqDist(w for w in myCorpus)
def demo():
    from nltk.corpus import treebank
    #from nltk.probability import LidstoneProbDist
    #from nltk.probability import WittenBellProbDist
    from nltk.probability import SimpleGoodTuringProbDist
    from nltk.model import NgramModel

    estimator = lambda fdist, bins: SimpleGoodTuringProbDist(fdist, len(fdist) + 1)
    #estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    #estimator = lambda fdist, bins: WittenBellProbDist(fdist, 0.2)

    tag_corpus = []
    for (word, tag) in treebank.tagged_words():
        tag_corpus.append(tag)

    lm = NgramModel(2, tag_corpus, estimator)
    print lm
    lm1 = NgramModel(1, tag_corpus, estimator)
    print lm1
    print tag_corpus[:20]

    sent = "NN"
    print lm1.entropy(sent)
    sent = "DT "
    print lm1.entropy(sent)
    sent = "VBZ"
    print lm1.entropy(sent)
    sent = "JJ"
    print lm1.entropy(sent)
    sent = "RB"
    print lm1.entropy(sent)
    sent = "DT NN"
    print lm.entropy(sent)
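# Added note and sketch (not part of the original demo): NgramModel.entropy
# iterates over whatever sequence it is given, so the bare strings above are
# scored character by character ('D', 'T', ' ', ...).  A variant that scores tag
# *sequences* would pass lists of tags instead, e.g.:
def demo_tag_sequences():
    from nltk.corpus import treebank
    from nltk.probability import SimpleGoodTuringProbDist
    from nltk.model import NgramModel

    estimator = lambda fdist, bins: SimpleGoodTuringProbDist(fdist, len(fdist) + 1)
    tag_corpus = [tag for (word, tag) in treebank.tagged_words()]

    lm = NgramModel(2, tag_corpus, estimator)
    lm1 = NgramModel(1, tag_corpus, estimator)

    for tags in [['NN'], ['DT'], ['DT', 'NN'], ['DT', 'JJ', 'NN']]:
        print tags, lm1.entropy(tags), lm.entropy(tags)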
from nltk.corpus import abc
from nltk.probability import LidstoneProbDist
from nltk.model import NgramModel


def trainModel():
    totalwords = abc.words()  #+ genesis.words() + gutenberg.words() + webtext.words()
    # Lidstone-smoothed unigram and bigram models over the ABC corpus
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    BigramModel = NgramModel(2, totalwords, estimator=estimator)
    UnigramModel = NgramModel(1, totalwords, estimator=estimator)
    return (UnigramModel, BigramModel)
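# Added usage sketch (assumed, not part of the original): query the bigram model
# returned by trainModel() for a word in context; the word/context pair is
# illustrative only.
if __name__ == "__main__":
    unigram_model, bigram_model = trainModel()
    word, context = "government", ["the"]
    print "P(%s | %s) = %g" % (word, context[0], bigram_model.prob(word, context))
    print "logprob = %g" % bigram_model.logprob(word, context)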
from nltk.util import ngrams
from nltk.corpus import reuters
from nltk.corpus import genesis
from nltk.probability import LaplaceProbDist
from nltk.model import NgramModel
import nltk

sentence = 'She covered a Bob Dylan song for Amnesty International.'

## http://www.inf.ed.ac.uk/teaching/courses/icl/nltk/probability.pdf
## http://www.nltk.org/book/ch02.html

n = 2
bigrams = ngrams(sentence.split(), n)
print bigrams

## Append starting points and ending points
#for grams in sixgrams:
#    print grams

estimator = lambda fdist, bins: LaplaceProbDist(fdist, len(sentence.split()) + 1)
model = NgramModel(2, sentence.split(), estimator=estimator)

print model.generate(1, ("her", "take"))
print
print model.entropy(["she", "covered"])
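# Added sketch (not in the original): querying the bigram model for transitions
# that actually occur in the training sentence, so no backoff is involved.
print model.prob('a', ['covered'])       # P(a | covered)
print model.prob('song', ['Dylan'])      # P(song | Dylan)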
'''
Created on 7 Nov 2009

@author: joh
'''
from nltk.model import NgramModel
from nltk.probability import LidstoneProbDist

text = 'hi how are you do you like fudge you like cookies'

# NgramModel expects an estimator *function* of (fdist, bins), not the
# probability-distribution class itself, so wrap LidstoneProbDist in a lambda.
estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
model = NgramModel(3, text.split(), estimator)

print model.prob('you', ('how', 'are'))
print model.prob('you', ('how', 'do'))
# Fragment: training_raw, testing_raw and the `args` options object are defined
# above this excerpt.
training_spans = WhitespaceTokenizer().span_tokenize(training_raw)
spans = [span for span in training_spans]
training_offsets = [span[0] for span in spans]
train = []
for s in spans:
    train.append(training_raw[s[0]:s[1]])

testing_spans = WhitespaceTokenizer().span_tokenize(testing_raw)
spans = [span for span in testing_spans]
testing_offsets = [span[0] for span in spans]
test = []
for s in spans:
    test.append(testing_raw[s[0]:s[1]])

estimator = lambda fdist, bins: LidstoneProbDist(fdist, args.estimator_probability)
lm = NgramModel(args.num_grams, train, estimator=estimator)

# Grow a window over the test tokens and cut it whenever its perplexity under
# the training model exceeds the configured threshold.
t0 = 0
t1 = 1
current_best = ''
while t1 < len(test):
    perplexity = lm.perplexity(test[t0:t1])
    if perplexity > args.cutoff_max_perplexity:
        if len(current_best) > 1:
            print current_best + '.'
            current_best = ''
        t0 = t1 + 1
        t1 = t0 + 1
    else:
        t1 += 1
        if t1 - t0 > args.min_sentence_length and perplexity < args.output_max_perplexity:
# Fragment: `parsed` (command-line options), `inp`, `words`, `n`, `estimator` and
# `output` are defined above this excerpt.
if parsed.word_type == "stem":
    stemmer = Stemmer.Stemmer('russian')
    words += stemmer.stemWords([inp])
elif parsed.word_type == "surface_all":
    words += nltk.word_tokenize(inp)
elif parsed.word_type == "surface_no_pm" or parsed.word_type[:7] == "suffix_":
    inp = inp.translate(None, string.punctuation)
    words += nltk.word_tokenize(inp)
else:
    words += nltk.word_tokenize(inp)

if parsed.word_type[:7] == "suffix_":
    l = int(parsed.word_type.split("_")[1])
    words = [x[-l:] for x in words]

if parsed.unknown_word_freq:
    unknown_words = []
    # print "Removing unknown words"
    fq = FreqDist(words)
    for w, count in fq.iteritems():
        if count < parsed.unknown_word_freq:
            unknown_words.append(w)   # collect the rare word itself
    words[:] = [x if x not in unknown_words else "<UNK>" for x in words]

lm = NgramModel(n, words, estimator=estimator)

outf = open(output, "wb")
dill.dump(lm, outf, protocol=2)
outf.close()
#!/usr/bin/env python
import nltk
from nltk import bigrams
from nltk import trigrams
from nltk.probability import LidstoneProbDist
from nltk.model import NgramModel

with open('./austen/persuasion.txt', 'r') as training_file:
    raw = training_file.read()
tokens = nltk.word_tokenize(raw)

with open('./austen/sense_and_sensibility.txt', 'r') as test_file:
    test = test_file.read()
test_list = nltk.word_tokenize(test)

estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

model = NgramModel(3, tokens, True, False, estimator)
tri = model.entropy(test_list)
print "tri-gram: " + str(tri)

model = NgramModel(2, tokens, True, False, estimator)
bi = model.entropy(test_list)
print "bi-gram: " + str(bi)
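# Added sketch: the two entropies above are per-word cross-entropies in bits, so
# the corresponding perplexities are simply 2**entropy (which is what
# NgramModel.perplexity computes); converting puts both models on the more
# familiar perplexity scale.  Assumes `bi` and `tri` from the script above.
import math
print "tri-gram perplexity: " + str(math.pow(2, tri))
print "bi-gram perplexity: " + str(math.pow(2, bi))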
forbidden_words = forbidden_words_doc.readline()

# list of prepositions and other function words
prep_doc = codecs.open(scriptdir + u'prepos_list.txt', u'r', encoding='utf-8')
prepositions = []
for line in prep_doc:
    if len(line) > 0:
        prepositions.append(strip_string(line))

# represent the input text as a list of tokens
all_tokens = tokenize(text)

# build the language model
ngrams = 10
estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.1)
model = NgramModel(ngrams, all_tokens, estimator=estimator)
print 'Language model built.'

# randomly generate the first word
random_word = generate_first_word(all_tokens)
meisterwerk = [random_word]

# generate 4 lines; syllable counts per line: 9/8/9/8
first_line = generate_line(meisterwerk, 9)
for word in first_line:
    meisterwerk.append(word)
print '1st line generated.'

second_line = generate_line(meisterwerk, 8)
for word in second_line:
    meisterwerk.append(word)
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.probability import LidstoneProbDist, WittenBellProbDist
from nltk.model import NgramModel
from nltk.tokenize import sent_tokenize, word_tokenize

corpusdir = 'corpora/'  # Directory of corpus.
SickCorpus = PlaintextCorpusReader(corpusdir, 'sick_tweets.txt')
HealthyCorpus = PlaintextCorpusReader(corpusdir, 'healthy_tweets.txt')

estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

sick_model_1 = NgramModel(1, SickCorpus.sents(), True, False, estimator)
sick_model_2 = NgramModel(2, SickCorpus.sents(), True, False, estimator)
healthy_model_1 = NgramModel(1, HealthyCorpus.sents(), True, False, estimator)
healthy_model_2 = NgramModel(2, HealthyCorpus.sents(), True, False, estimator)

tweet = "Remember when we were all diagnosed with Bieber fever ? Lol"

print "sick_model_1 is: " + str(sick_model_1.perplexity(word_tokenize(tweet)))
print "sick_model_2 is: " + str(sick_model_2.perplexity(word_tokenize(tweet)))
print "healthy_model_1 is: " + str(healthy_model_1.perplexity(word_tokenize(tweet)))
print "healthy_model_2 is: " + str(healthy_model_2.perplexity(word_tokenize(tweet)))
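# Added sketch (assumed, not part of the original): the two bigram models can be
# used as a naive classifier by labelling a tweet with whichever model assigns
# it the lower perplexity.
def classify_tweet(text):
    tokens = word_tokenize(text)
    if sick_model_2.perplexity(tokens) < healthy_model_2.perplexity(tokens):
        return "sick"
    return "healthy"

print classify_tweet(tweet)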
tamil2_alpha = []
tamil2_alpha_all = []

for line in tamil1f.readlines()[1:]:
    tamil1_alpha.append(["St"] + line.split(",")[-1].replace("\n", "").replace("[", "").replace("]", "").replace("\r", "").split("|") + ["En"])
    tamil1_alpha_all += ["St"] + line.split(",")[-1].replace("\n", "").replace("[", "").replace("]", "").replace("\r", "").split("|") + ["En"]

for line in tamil2f.readlines()[1:]:
    tamil2_alpha.append(["St"] + line.split(",")[-1].replace("\n", "").replace("[", "").replace("]", "").replace("\r", "").split("|") + ["En"])
    tamil2_alpha_all += ["St"] + line.split(",")[-1].replace("\n", "").replace("[", "").replace("]", "").replace("\r", "").split("|") + ["En"]

s_bg1 = nltk.bigrams(tamil1_alpha_all)
s_bg2 = nltk.bigrams(tamil2_alpha_all)
fdist1 = nltk.FreqDist(s_bg1)
fdist2 = nltk.FreqDist(s_bg2)

estimator1 = lambda fdist, bins: LaplaceProbDist(fdist, len(tamil1_alpha_all) + 1)
estimator2 = lambda fdist, bins: LaplaceProbDist(fdist, len(tamil2_alpha_all) + 1)

model1 = NgramModel(3, tamil1_alpha_all, estimator=estimator1)
model2 = NgramModel(3, tamil2_alpha_all, estimator=estimator2)

print model1.entropy(tamil1_alpha[0])
print model1.perplexity(tamil1_alpha[0])