def buildTransitionMatrix(self, tagged_corpus: list, train_size):
    """Estimate tag-to-tag transition probabilities from a training slice.

    The first ``train_size`` fraction of ``tagged_corpus`` is sliced off and
    shuffled.  Every item is unpacked as a ``(tag1, tag2)`` pair; pair counts
    are accumulated per ``tag1`` and then replaced by relative frequencies
    rounded to six decimals.

    Returns:
        The resulting ``ConditionalFreqDist``, which is also stored on
        ``self.TRANSITION_MATRIX``.
    """
    cutoff = int(train_size * len(tagged_corpus))
    training_pairs = tagged_corpus[:cutoff]
    random.shuffle(training_pairs)

    # Raw (float) counts of each successor tag per predecessor tag.
    matrix = ConditionalFreqDist()
    for previous, following in training_pairs:
        if previous not in matrix:
            matrix[previous] = FreqDist()
        row = matrix[previous]
        if following not in row:
            row[following] = 0.0
        row[following] += 1

    # Turn every row of counts into a row of rounded probabilities.
    for previous in matrix.keys():
        row = matrix[previous]
        row_total = sum(row.values(), 0.0)
        for following in row.keys():
            probability = float("{0:.6f}".format(row[following] / row_total))
            row[following] = round(probability, 6)

    self.TRANSITION_MATRIX = matrix
    return matrix
def train_model():
    """Create n-gram models from the Project Gutenberg texts in ``CORPORA``.

    Every file is read, its line breaks are turned into spaces, and the
    lower-cased text is split into sentences.  Each sentence is wrapped in
    ``START`` / ``END`` markers before word tokenisation.

    Returns:
        A ``(bi_cpdist, cpdist)`` tuple of Laplace-smoothed conditional
        probability distributions.  The first is conditioned on the first
        token of every n-gram and ranges over its leading two tokens; the
        second is conditioned on the first ``N_MINUS1`` tokens and ranges
        over the whole n-gram.
    """
    pieces = []
    for corpus in CORPORA:
        with open(corpus, 'r') as file_:
            # A line break separates words, so it must become a space;
            # deleting it would glue the last word of a line to the first
            # word of the next one.
            pieces.append(file_.read().replace('\n', ' '))
    # Keep the files apart as well, for the same reason.
    text = ' '.join(pieces)

    sents = sent_tokenize(text.lower())
    tokens = []
    # Surround each sentence with START and END marker tokens.
    for sent in sents:
        tokens += word_tokenize('START ' + sent + ' END')

    ngrams_ = tuple(ngrams(tokens, N_VAL))

    # Bigram frequency and probability distributions.
    bi_cfdist = ConditionalFreqDist((ngram[0], ngram[:2]) for ngram in ngrams_)
    bi_cpdist = ConditionalProbDist(bi_cfdist, LaplaceProbDist)

    # Full n-gram frequency and probability distributions.
    cfdist = ConditionalFreqDist(
        (ngram[:N_MINUS1], ngram) for ngram in ngrams_)
    cpdist = ConditionalProbDist(cfdist, LaplaceProbDist)

    return bi_cpdist, cpdist
def constructTransitionMatrix(self, sourceFilesList: list):
    """Build the trigram transition matrix from the given source files.

    Every non-empty line of every file is upper-cased; lines that do not
    already start with ``<S>`` are wrapped as ``<S> ... <E>``.  The
    whitespace-separated tokens of all files are collected into one
    sequence, from which the following are derived:

    * ``self.initialProbabilities`` - counts of the token following ``<S>``
    * ``self.tags`` - the distinct tokens
    * ``self.bigramDist`` - bigram counts
    * ``self.TRANSITION_MATRIX`` - for every token, the relative frequency
      (rounded to six decimals) of each preceding bigram

    Args:
        sourceFilesList: Paths of windows-1256 encoded text files.

    Returns:
        The transition matrix (also stored on ``self.TRANSITION_MATRIX``).
    """
    tokens = []
    for fileName in sourceFilesList:
        # Accumulate across files; previously the per-file buffer was reset
        # on each iteration so only one file ever reached the matrix.
        with open(fileName, 'r', encoding="windows-1256") as file:
            for line in file:
                # Strip only the line terminator so that a final line
                # without a trailing newline keeps its last character.
                line = line.rstrip('\n').upper()
                if not line:
                    continue
                if not line.startswith("<S>"):
                    line = '<S> ' + line + ' <E>'
                tokens.extend(el for el in re.split(r"\s+", line) if el != '')

    self.initialProbabilities = FreqDist(
        following
        for previous, following in zip(tokens, tokens[1:])
        if previous == '<S>'
    )
    self.tags = list(set(tokens))
    self.bigramDist = FreqDist(bigrams(tokens))

    cfd = ConditionalFreqDist(
        (third, (first, second)) for first, second, third in trigrams(tokens))
    for word in cfd.conditions():
        dist = cfd[word]
        # Capture the total BEFORE overwriting counts: every assignment
        # changes dist.N(), so calling freq() inside the loop would divide
        # later entries by a partially normalised total.
        total = dist.N()
        for bigram in list(dist):
            dist[bigram] = round(dist[bigram] / total, 6)

    self.TRANSITION_MATRIX = cfd
    return cfd
def __init__(self, fileid):
    """Load a UDHR text and build character n-gram frequency tables.

    The first 1000 characters of the raw text form the training data
    (stripped, lower-cased, runs of newlines collapsed to one space); the
    next 100 characters are kept unmodified as the dev data.

    Args:
        fileid: UDHR corpus file identifier passed to ``udhr.raw``.

    Exits the process with status 1 (after writing a message to stderr)
    when the corpus file cannot be read.
    """
    try:
        corpus = udhr.raw(fileid)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # SystemExit are not mistaken for a missing corpus file.
        print("UDHR language file " + fileid + " does not exist",
              file=sys.stderr)
        sys.exit(1)

    # Training data: lower-cased, newlines converted to spaces.
    self.train = re.sub(r'[\n]+', ' ', corpus[0:1000].strip().lower())
    # Dev data.
    self.dev = corpus[1000:1100]

    # The training text is modelled at the level of single characters.
    tokens = list(self.train)
    self.unigram = tokens
    self.bigram = list(nltk.bigrams(tokens))
    self.trigram = list(nltk.trigrams(tokens))

    # Frequency distributions for each n-gram order.
    self.unigramFreq = FreqDist(self.unigram)
    self.bigramFreq = ConditionalFreqDist(self.bigram)
    self.trigramFreq = ConditionalFreqDist(
        ((w0, w1), w2) for w0, w1, w2 in self.trigram)
def build_top_words(self):
    """Store the 1000 highest-scoring words in ``self.top_words``.

    ``self.documents`` is read as ``(review, label)`` pairs with labels
    ``'pos'`` and ``'neg'``.  Every word is scored by the sum of its
    chi-square association with the positive and with the negative class.
    """
    positive_tokens = [token
                       for review, label in self.documents if label == 'pos'
                       for token in review]
    negative_tokens = [token
                       for review, label in self.documents if label == 'neg'
                       for token in review]

    overall = FreqDist(positive_tokens + negative_tokens)
    positive_counts = ConditionalFreqDist(('pos', w) for w in positive_tokens)
    negative_counts = ConditionalFreqDist(('neg', w) for w in negative_tokens)

    n_positive = len(positive_tokens)
    n_negative = len(negative_tokens)
    n_total = n_positive + n_negative

    scores = {}
    for word, freq in overall.items():
        positive_part = BigramAssocMeasures.chi_sq(
            positive_counts['pos'][word], (freq, n_positive), n_total)
        negative_part = BigramAssocMeasures.chi_sq(
            negative_counts['neg'][word], (freq, n_negative), n_total)
        scores[word] = positive_part + negative_part

    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    self.top_words = {word for word, _ in ranked[:1000]}
def findBestWords(wordsInCategories, scoreFunction=BigramAssocMeasures.chi_sq, max_words=1000):
    """Return the ``max_words`` words with the highest summed category score.

    Args:
        wordsInCategories: Iterable of ``(category, words)`` pairs.
        scoreFunction: Called as ``scoreFunction(n_ii, (n_ix, n_xi), n_xx)``
            for every word/category combination.
        max_words: Number of top-scoring words to keep.

    Returns:
        A set of the selected words.
    """
    overall_fd = FreqDist()
    per_label_fd = ConditionalFreqDist()
    for category, words in wordsInCategories:
        overall_fd.update(words)
        per_label_fd[category].update(words)

    label_totals = {}
    for label in per_label_fd.conditions():
        label_totals[label] = per_label_fd[label].N()
    grand_total = sum(label_totals.values())

    word_scores = {}
    for word, freq in overall_fd.items():
        word_scores[word] = sum(
            scoreFunction(per_label_fd[label][word],
                          (freq, label_total),
                          grand_total)
            for label, label_total in label_totals.items())

    ranked = sorted(word_scores.items(), key=lambda pair: pair[1], reverse=True)
    return {word for word, _ in ranked[:max_words]}
def visualize_monthly_news_stats2(csvfolder=metacorpus.statspath, csvname=metacorpus.prunedmetafilename, imgoutpath=metacorpus.imgfolder, rescatmap=metacorpus.resourcecategorymap2):
    """Export daily/monthly news counts to CSV and plot the monthly counts.

    Reads the metadata CSV ``csvfolder/csvname`` and uses its ``date``,
    ``resource`` and ``category`` columns.  Four count tables are written
    into ``csvfolder`` and bar charts are produced through
    ``visualize_monthly_cfd`` under ``imgoutpath``.

    ``rescatmap`` is iterated as
    ``{canonical category: {resource name: [original category, ...]}}``.
    """
    colldf = IOtools.readcsv(csvfolder+os.sep+csvname)
    numoftexts, _ = colldf.values.shape

    # daily news counts for resources
    cfddailyresourcecount = ConditionalFreqDist((colldf.loc[i,"date"], colldf.loc[i,"resource"].strip()) for i in range(numoftexts))
    CFDhelpers.cfd2csv(cfddailyresourcecount, csvfolder+os.sep+"cfddailyresourcecount2.csv", colnames=['date','resource','count'])
    #cfdresourcesdaycount = ConditionalFreqDist((resource, day) for day in cfddailyresourcecount.conditions() for resource in list(cfddailyresourcecount[day]))

    # daily news counts for categories; the sample is "<resource>_<category>"
    # NOTE(review): the daily table joins with "_" while the monthly table
    # below joins with "-" -- confirm this difference is intended.
    cfddailycategorycount = ConditionalFreqDist((colldf.loc[i,"date"], "_".join(map(lambda x : str(x).strip(), [colldf.loc[i, "resource"], colldf.loc[i, "category"]]))) for i in range(numoftexts))
    CFDhelpers.cfd2csv(cfddailycategorycount, csvfolder+os.sep+"cfddailycategorycount2.csv", ["date", "category", 'count'])
    #cfdcatsdaycount = ConditionalFreqDist((category, date) for date in cfddailycategorycount.conditions() for category in list(cfddailycategorycount[date]))

    # visualize monthly --- assuming the dates are of the form yyyy-mm-dd
    # (that is how they were recorded), so dropping the last three
    # characters leaves "yyyy-mm"
    cfdmonthlyresourcecount = ConditionalFreqDist((colldf.loc[i,"date"][:-3], colldf.loc[i,"resource"].strip()) for i in range(numoftexts))
    CFDhelpers.cfd2csv(cfdmonthlyresourcecount, csvfolder+os.sep+"cfdmonthlyresourcecount.csv", colnames=['month','resource','count'])
    #cfdresourcesmonthcount = ConditionalFreqDist((resource, month) for month in cfdmonthlyresourcecount.conditions() for resource in list(cfdmonthlyresourcecount[month]))
    imgpath = IOtools.ensure_dir(os.path.join(imgoutpath, "resourcebasednewscount"))
    visualize_monthly_cfd(cfd=cfdmonthlyresourcecount, figuretitle="Monthly news count for each resource", ylabel="news published", imgoutpath=imgpath)

    # monthly news counts for categories; the sample is "<resource>-<category>"
    cfdmonthlycategorycount = ConditionalFreqDist((colldf.loc[i,"date"][:-3], "-".join(map(lambda x : str(x).strip(), [colldf.loc[i, "resource"], colldf.loc[i, "category"]]))) for i in range(numoftexts))
    CFDhelpers.cfd2csv(cfdmonthlycategorycount, csvfolder+os.sep+"cfdmonthlycategorycount.csv", ["month", "category", 'count'])
    #cfdcatsmonthcount = ConditionalFreqDist((category, month) for month in cfdmonthlycategorycount.conditions() for category in list(cfdmonthlycategorycount[month]))
    imgpath = IOtools.ensure_dir(os.path.join(imgoutpath, "categorybasednewscount"))

    # One figure set per canonical category: regroup the monthly
    # "<resource>-<category>" counts by resource.
    for canoniccatname, rescatnamedct in rescatmap.iteritems():
        monthresourcepairs = []
        for resourcename, origcats in rescatnamedct.iteritems():
            for origcatname in origcats:
                #resourcename = rescat.split("-")[0]
                rescat = "-".join([resourcename, origcatname])
                for month in cfdmonthlycategorycount.conditions():
                    numofoccurrences = cfdmonthlycategorycount[month][rescat]
                    #print resourcename," had ",numofoccurrences," times texts in :",rescat," during ",month
                    # Re-expand the count into that many (month, resource)
                    # pairs so a new ConditionalFreqDist can be built.
                    for i in range(numofoccurrences):
                        monthresourcepairs.append((month, resourcename))
        cfdmonthlyresourcecount_percat = ConditionalFreqDist(monthresourcepairs)
        # NOTE(review): resourcename and rescat are leftovers of the loops
        # above; they are unbound if the first rescatnamedct (or its
        # category lists) is empty -- confirm the map is never empty.
        print canoniccatname,resourcename," * ",rescat," : ",len(cfdmonthlyresourcecount_percat.conditions())," ",cfdmonthlyresourcecount_percat.N()
        figuretitle = "Monthly news count of each resource over category "+canoniccatname.upper()
        visualize_monthly_cfd(cfdmonthlyresourcecount_percat, figuretitle, ylabel="news published", imgoutpath=imgpath)
def __init__(self, corpus):
    """Set up the BigramWordCandidateProvider.

    Counts, for every word of ``corpus``, the words that directly follow it
    and keeps the counts in ``self._cfd``.

    Args:
        corpus: An iterable of word strings.
    """
    self._cfd = ConditionalFreqDist(bigrams(corpus))
def calculate_vector_spaces(self,k=16):
    """Build a term-by-document count matrix, decompose it and store the result.

    Documents are read from ``self.mongo[CORPUS_CLN]``; the words returned by
    ``self.interestingWords`` for each document text are the terms.  The
    matrix is handed to ``self._do_svd`` together with ``k`` and the scaled
    term/document coordinates plus the raw factors are saved in
    ``self.mongo['data']``.

    @param k: number of dimensions kept per term/document row
    """
    # Condition = word, sample = the full document text it occurs in.
    cfd = ConditionalFreqDist( (word, doc['document']) for doc in self.mongo[CORPUS_CLN].find() for word in self.interestingWords(doc['document']))
    cfd.tabulate()

    # matrix dimensions
    terms = [c for c in cfd.conditions()] # conditions = words
    docs = sorted(set(v for c in cfd.conditions() for v in cfd[c]))
    self.log("terms: %s"%str(terms))
    self.log("docs: %s"%str(docs))

    # Row i holds the counts of term i in every document (columns follow `docs`).
    term_by_doc_mat = np.zeros(shape=(len(terms),len(docs)))
    self.log("Term-by-ref-document matrix shape is: %d X %d"%(len(terms),len(docs)))
    for i, term in enumerate(terms):
        li = np.array([cfd[term][doc] for doc in docs])
        term_by_doc_mat[i] = li
    self.log("Matrix\n%s"%str(term_by_doc_mat))

    # perform singular value decomposition
    u,sigma,vh = self._do_svd(term_by_doc_mat,k)
    del term_by_doc_mat # don't need the matrix anymore

    # map terms to svd space: row i of u scaled component-wise by sigma
    terms_space = np.zeros(shape=(len(terms),k))
    for i in xrange(len(terms)):
        vals = [u[i][j] * sigma[j] for j in range(k)] # x-coord = row i, column 1
        terms_space[i] = np.array(vals)

    # map docs to svd space
    # NOTE(review): vh is indexed as vh[document][component]; this presumes
    # _do_svd returns it with one row per document -- TODO confirm.
    docs_space = np.zeros(shape=(len(docs),k))
    for i in xrange(len(docs)):
        vals = [ vh[i][j] * sigma[j] for j in range(k)]
        docs_space[i] = np.array(vals)

    # store matrix data: reuse the existing record when there is one
    row = self.mongo['data'].find_one()
    if not row:
        row = {'terms': terms, 'documents':docs, 'terms_subspace':terms_space.tolist(), 'docs_subspace':docs_space.tolist(), 'u':u.tolist(), 'sigma':sigma.tolist(), 'vh':vh.tolist(), 'date':datetime.utcnow()}
    else:
        row['terms'] = terms
        row['documents'] = docs
        row['terms_subspace'] = terms_space.tolist()
        row['docs_subspace'] = docs_space.tolist()
        row['u'] = u.tolist()
        row['sigma'] = sigma.tolist()
        row['vh'] = vh.tolist()
        row['date'] = datetime.utcnow()
    self.mongo['data'].save(row)
    self.log("Saved matrix data")
def __init__(self, n, training_data):
    """Train an order-``n`` model on ``training_data``.

    The ``(context, event)`` pairs produced by ``_make_ngram_tuples`` are
    counted in ``self._cfd``; ``self._estimators`` maps every observed
    context to its event frequency distribution.
    """
    self._n = n
    observed = _make_ngram_tuples(training_data, self._n)
    self._cfd = ConditionalFreqDist(
        (context, event) for context, event in observed)
    self._estimators = {
        context: self._cfd[context] for context in self._cfd.conditions()
    }
def conditional_freq(self):
    """Flatten the bigram counts into ``(condition, word, count)`` triples.

    The counts come from a ``ConditionalFreqDist`` built over
    ``self.bigram_list``.
    """
    counts = ConditionalFreqDist(self.bigram_list)
    return [
        (condition, word, freq)
        for condition, followers in counts.items()
        for word, freq in followers.items()
    ]
def tabulateWordsInAllGeners(self, theWords):
    """
    Print a table of how often the given words occur in every Brown
    corpus genre.
    @params theWords: the word/list of words to find info about
    """
    genres = brown.categories()

    def genre_word_pairs():
        for genre in genres:
            for word in brown.words(categories=genre):
                yield genre, word

    counts = ConditionalFreqDist(genre_word_pairs())
    counts.tabulate(samples=theWords, conditions=brown.categories())
def _train(self, tagged_corpus, cutoff=0, verbose=False):
    """
    Initialize this C{ContextTagger}'s L{_context_to_tag} table
    based on the given training data.  In particular, for each
    context C{I{c}} in the training data, set
    C{_context_to_tag[I{c}]} to the most frequent tag for that
    context.  However, exclude any contexts that are already
    tagged perfectly by the backoff tagger(s).

    The old value of C{self._context_to_tag} (if any) is discarded.

    @param tagged_corpus: A tagged corpus.  Each item should be
        a C{list} of C{(word, tag)} tuples.
    @param cutoff: If the most likely tag for a context occurs
        fewer than C{cutoff} times, then exclude it from the
        context-to-tag table for the new tagger.
    @param verbose: If true, print size/backoff/pruning statistics
        after training.
    """

    token_count = hit_count = 0

    # A context is considered 'useful' if it's not already tagged
    # perfectly by the backoff tagger.
    useful_contexts = set()

    # Count how many times each tag occurs in each context.
    fd = ConditionalFreqDist()
    for sentence in tagged_corpus:
        tokens, tags = zip(*sentence)
        for index, (token, tag) in enumerate(sentence):
            # Record the event.
            token_count += 1
            # Only the tags before `index` are passed as history.
            context = self.context(tokens, index, tags[:index])
            if context is None: continue
            fd[context].inc(tag)
            # If the backoff got it wrong, this context is useful:
            if (self.backoff is None or
                tag != self.backoff.tag_one(tokens, index, tags[:index])):
                useful_contexts.add(context)

    # Build the context_to_tag table -- for each context, figure
    # out what the most likely tag is.  Only include contexts that
    # we've seen at least `cutoff` times.
    # NOTE(review): the comparison below is strict, so a context whose best
    # tag occurs exactly `cutoff` times is excluded as well.
    for context in useful_contexts:
        best_tag = fd[context].max()
        hits = fd[context][best_tag]
        if hits > cutoff:
            self._context_to_tag[context] = best_tag
            hit_count += hits

    # Display some stats, if requested.
    # NOTE(review): both percentages divide by counts that are zero for an
    # empty corpus, so verbose=True would raise ZeroDivisionError there.
    if verbose:
        size = len(self._context_to_tag)
        backoff = 100 - (hit_count * 100.0)/ token_count
        pruning = 100 - (size * 100.0) / len(fd.conditions())
        print "[Trained Unigram tagger:",
        print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
            size, backoff, pruning)
def freq_dist_words(self):
    """Print the n-gram frequency distributions of both review categories.

    For ``'rev_neg.pos'`` and ``'rev_pos.pos'`` the items produced by
    ``c.ngrams(c.reader.words(category))`` are counted, and the two
    resulting distributions are printed.
    """
    # Only ConditionalFreqDist is needed here.  The former import of
    # nltk.model.NgramModel was unused and makes the function fail with
    # ImportError on NLTK releases that no longer ship that module.
    from nltk import ConditionalFreqDist

    categories = ['rev_neg.pos', 'rev_pos.pos']
    cfd = ConditionalFreqDist(
        (category, word)
        for category in categories
        for word in c.ngrams(c.reader.words(category)))

    # Single-argument print calls produce the same output on Python 2 and 3.
    print('neg : %s' % (cfd['rev_neg.pos'],))
    print('pos : %s' % (cfd['rev_pos.pos'],))
def train_supervised(self, labelled_sequences, **kwargs): """ Supervised training maximising the joint probability of the symbol and state sequences. This is done via collecting frequencies of transitions between states, symbol observations while within each state and which states start a sentence. These frequency distributions are then normalised into probability estimates, which can be smoothed if desired. @return: the trained model @rtype: HiddenMarkovModelTagger @param labelled_sequences: the training data, a set of labelled sequences of observations @type labelled_sequences: list @param kwargs: may include an 'estimator' parameter, a function taking a C{FreqDist} and a number of bins and returning a C{ProbDistI}; otherwise a MLE estimate is used """ # default to the MLE estimate estimator = kwargs.get('estimator') if estimator == None: estimator = lambda fdist, bins: MLEProbDist(fdist) # count occurences of starting states, transitions out of each state # and output symbols observed in each state starting = FreqDist() transitions = ConditionalFreqDist() outputs = ConditionalFreqDist() for sequence in labelled_sequences: lasts = None for token in sequence: state = token[_TAG] symbol = token[_TEXT] if lasts == None: starting.inc(state) else: transitions[lasts].inc(state) outputs[state].inc(symbol) lasts = state # update the state and symbol lists if state not in self._states: self._states.append(state) if symbol not in self._symbols: self._symbols.append(symbol) # create probability distributions (with smoothing) N = len(self._states) pi = estimator(starting, N) A = ConditionalProbDist(transitions, estimator, False, N) B = ConditionalProbDist(outputs, estimator, False, len(self._symbols)) return HiddenMarkovModelTagger(self._symbols, self._states, A, B, pi)
def __init__(self, file):
    """Build n-gram frequency tables from the start of a UDHR text.

    The first 1000 items of ``udhr.raw(file)`` are kept as the training set
    and split into single elements; unigram, bigram and trigram sequences
    and their frequency distributions are derived from them.
    """
    self.training_set = udhr.raw(file)[0:1000]
    symbols = list(self.training_set)

    self.unigram = symbols
    self.bigram = list(nltk.bigrams(symbols))
    self.trigram = list(nltk.trigrams(symbols))

    self.unigram_frequency = FreqDist(self.unigram)
    self.bigram_frequency = ConditionalFreqDist(self.bigram)
    # Trigrams are conditioned on their first two elements.
    self.trigam_frequency = ConditionalFreqDist(
        [((first, second), third) for first, second, third in self.trigram])
def tabulateWordsInPeriods(self, theWords):
    """
    Print a table of how often the target words occur per year in the
    Inaugural corpus.  A corpus word counts for a target when its
    lower-cased form starts or ends with that target.
    @params theWords: the word/list of words to find info about
    """
    year_target_pairs = []
    for textid in inaugural.fileids():
        year = textid[:4]  # the first four characters of the file id
        for word in inaugural.words(textid):
            lowered = word.lower()
            for target in theWords:
                if lowered.startswith(target) or lowered.endswith(target):
                    year_target_pairs.append((year, target))

    ConditionalFreqDist(year_target_pairs).tabulate()
def subword_char_ngram(text_fileid_map, n):
    """Count the character n-grams of every word, per text.

    Args:
        text_fileid_map: Mapping from a text id to the text itself.
        n: Size of the character n-grams.

    Returns:
        A ``ConditionalFreqDist`` whose conditions are the text ids and
        whose samples are the n-grams of the whitespace-separated words.
        The total count and the number of conditions are printed as a
        side effect.
    """
    corpus_ngramitems = []
    # .items() (rather than the Python-2-only .iteritems()) behaves the
    # same on both Python lines.
    for tid, text in text_fileid_map.items():
        for w in text.split():
            corpus_ngramitems.extend(
                (tid, ngramitem) for ngramitem in ngrams(w, n))

    cfd = ConditionalFreqDist(corpus_ngramitems)
    print("%s   %s" % (cfd.N(), len(cfd.conditions())))
    # The CSV export that used to follow the return statement could never
    # run and referenced an undefined ``csvpath``; it has been dropped.
    return cfd
def __init__(self, corpura):
    """Build n-gram frequency tables from the start of a UDHR text.

    ``corpura`` is handed to ``udhr.raw``; the first 1000 items of the
    result become the training set, from which unigram, bigram and trigram
    sequences and their frequency distributions are derived.
    """
    raw_text = udhr.raw(corpura)
    self.TrainingSet = raw_text[0:1000]
    symbols = list(self.TrainingSet)

    self.Uni = symbols
    self.Bi = list(nltk.bigrams(symbols))
    self.Tri = list(nltk.trigrams(symbols))

    self.UniFreq = FreqDist(self.Uni)
    self.BiFreq = ConditionalFreqDist(self.Bi)
    # Trigrams are conditioned on their first two elements.
    self.TriFreq = ConditionalFreqDist(
        [((first, second), third) for first, second, third in self.Tri])
def postags(self, pos=None, sort=False, top=0, universal_tagset=False, ret_cond=False):
    '''Build frequency dictionaries or frequency-sorted lists of parts of speech.

    Lower-cased lemma counts are collected per tag from the ``tags`` of
    every sentence in ``self._sents``.

    pos -- tag to restrict the result to; ``NOUN``, ``VERB``, ``ADJ`` and
        ``ADV`` are expanded to their fine-grained tags unless
        ``universal_tagset`` is true
    sort -- forwarded to ``_sort`` when ``top`` is not set
    top -- when non-zero the result is passed through ``_top(result, top)``
    universal_tagset -- look ``pos`` up as-is instead of expanding it
    ret_cond -- also return the list of tags seen, as ``(result, tags)``
    '''

    def merge(tags):
        # Sum the lemma counts of several fine-grained tags.
        result = FreqDist()
        for tag in tags:
            result += cfd[tag]
        return result

    maps = {
        'NOUN': {'NN', 'NNS', 'NNP', 'NNPS'},
        'VERB': {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'},
        'ADJ': {'JJ', 'JJR', 'JJS'},
        # The comma between 'RBR' and 'RBS' was missing, which silently
        # concatenated them into the non-existent tag 'RBRRBS'.
        'ADV': {'RB', 'RBR', 'RBS'},
    }

    cfd = ConditionalFreqDist()
    for sent in self._sents:
        #tokens = sent.untagging()
        tokens = sent.tags
        for tok, tag, lemma in tokens:
            cfd[tag][lemma.lower()] += 1

    # Taken before merge() runs, because indexing cfd with an unseen tag
    # would add it as a new (empty) condition.
    cond = cfd.conditions()
    result = cfd

    if pos:
        if not universal_tagset and pos in maps:
            result = merge(maps[pos])
        else:
            result = cfd[pos]

    if top:
        result = _top(result, top)
    else:
        result = _sort(result, sort)

    if ret_cond:
        result = result, cond

    return result
def __init__(self, sentences):
    """Count, for every left-padded trigram, its last item given the first two."""
    # FIXME should use smoothing here. I tried SimpleGoodTuringProbDist but
    # it returns zero probability for event with freq=1. Possibly due to
    # too small test corpus

    def context_word_pairs():
        for sentence in sentences:
            for ngram in ngrams(sentence, 3, pad_left=True):
                yield ngram[:-1], ngram[-1]

    self.cfd = ConditionalFreqDist(context_word_pairs())
def __init__(self, n, training_data):
    """Build a model of order ``n`` from ``training_data``.

    Counts every ``(context, event)`` pair yielded by
    ``_make_ngram_tuples`` and indexes the per-context frequency
    distributions in ``self._estimators``.
    """
    self._n = n
    pairs = _make_ngram_tuples(training_data, self._n)
    self._cfd = ConditionalFreqDist(
        (context, event) for context, event in pairs)

    self._estimators = {}
    for context in self._cfd.conditions():
        self._estimators[context] = self._cfd[context]
class NgramModel(object):
    """N-gram model that scores events by their relative frequency per context."""

    def __init__(self, n, training_data):
        """Train an order-``n`` model on ``training_data``."""
        self._n = n
        observed = _make_ngram_tuples(training_data, self._n)
        self._cfd = ConditionalFreqDist(
            (context, event) for context, event in observed)
        # One frequency distribution per observed context.
        self._estimators = {
            context: self._cfd[context] for context in self._cfd.conditions()
        }

    def prob(self, event, context):
        """Return the relative frequency of ``event`` after ``context``.

        Contexts that were never observed yield ``0.0``.
        """
        key = tuple(context)
        try:
            estimator = self._estimators[key]
        except KeyError:
            return 0.0
        return estimator.freq(event)

    def seqprob(self, seq):
        """Return the product of the probabilities of all n-grams in ``seq``."""
        probability = 1.0
        for context, event in _make_ngram_tuples(seq, self._n):
            probability *= self.prob(event, context)
        return probability

    def allngrams(self):
        """Return ``(event, context, probability)`` for every observed n-gram.

        The list is sorted by context.
        """
        rows = []
        for context, dist in self._estimators.items():
            for event in dist:
                rows.append((event, context, self.prob(event, context)))
        rows.sort(key=itemgetter(1))
        return rows
def get_bigrams(self, text):
    """Return the follower frequency distribution of every condition.

    ``text`` is paired up with ``bigrams``; the result holds one frequency
    distribution per distinct first element, in the order the
    ``ConditionalFreqDist`` iterates its conditions.
    """
    cfd = ConditionalFreqDist(bigrams(text))
    return [cfd[condition] for condition in cfd]
class NgramModel(object):
    """A simple N-gram model built on conditional frequency counts."""

    def __init__(self, n, training_data):
        """Create a model of order ``n`` from ``training_data``."""
        self._n = n
        self._cfd = ConditionalFreqDist(
            (context, event)
            for context, event in _make_ngram_tuples(training_data, self._n))

        # Index the frequency distribution of each observed context.
        self._estimators = {}
        for context in self._cfd.conditions():
            self._estimators[context] = self._cfd[context]

    def prob(self, event, context):
        """Probability of ``event`` in ``context``; 0.0 for unseen contexts."""
        lookup = tuple(context)
        try:
            distribution = self._estimators[lookup]
        except KeyError:
            return 0.0
        return distribution.freq(event)

    def seqprob(self, seq):
        """Probability of the whole sequence ``seq``.

        Computed as the product of the probability of each of its n-grams.
        """
        result = 1.0
        for context, event in _make_ngram_tuples(seq, self._n):
            result *= self.prob(event, context)
        return result

    def allngrams(self):
        """List every observed n-gram as ``(event, context, probability)``.

        Entries are ordered by context.
        """
        entries = [
            (event, context, self.prob(event, context))
            for context, dist in self._estimators.items()
            for event in dist
        ]
        entries.sort(key=itemgetter(1))
        return entries
def train_model_get_cosine_matrix(statements):
    """Return a word-by-word matrix of clipped ``npmi_scorer`` values.

    Every statement is split on whitespace.  For each pair of vocabulary
    words the scorer receives the two word counts, the number of times the
    second word occurs in statements containing the first, the vocabulary
    size, the constant ``2`` and the total token count.  Infinite and
    negative scores are replaced by ``0``.

    Args:
        statements: Iterable of strings.

    Returns:
        A ``pandas.DataFrame`` indexed and labelled by the sorted
        vocabulary.
    """
    statements = [statement.split() for statement in statements]
    frequencies = FreqDist(w for word in statements for w in word)

    # Invariants hoisted out of the V x V scoring loop below; they used to
    # be recomputed (sorting the vocabulary, summing all counts) per cell.
    vocabulary = sorted(frequencies.keys())
    vocabulary_size = len(vocabulary)
    corpus_size = sum(frequencies[key] for key in vocabulary)

    # The membership test only depends on (key, statement), so it is made
    # once per statement instead of once per word of the statement.
    conditionalFrequencies = ConditionalFreqDist(
        (key, word)
        for key in vocabulary
        for statement in statements
        if key in statement
        for word in statement)

    pmi = [[npmi_scorer(frequencies[worda], frequencies[wordb],
                        conditionalFrequencies[worda][wordb],
                        vocabulary_size, 2, corpus_size)
            for wordb in vocabulary]
           for worda in vocabulary]

    pmi = np.array(pmi)
    # Infinite scores are first marked negative, then all negatives are
    # clipped to zero.
    pmi[np.isinf(pmi)] = -1
    pmi[np.where(pmi < 0)] = 0

    pmi = pd.DataFrame(pmi)
    pmi.columns = vocabulary
    pmi.index = vocabulary
    return pmi
def test_increment(self):
    """A ConditionalFreqDist can still be mutated through nested indexing."""
    cfd = ConditionalFreqDist()

    # Use the word length as the condition.
    for word in tokenize.word_tokenize("cow cat mouse cat tiger"):
        cfd[len(word)][word] += 1
    self.assertEqual(cfd.conditions(), [3, 5])

    # Incrementing a previously unseen key must add a new condition ...
    cfd[2]['hi'] += 1
    self.assertEqual(set(cfd.conditions()), {3, 5, 2})
    # ... and take that key's frequency from 0 (unseen) to 1.
    self.assertEqual(cfd[2]['hi'], 1)
def init_prob_unit():
    """Initialise t(e|f) uniformly over the co-occurring word pairs.

    Every word of a ``de_inp`` line is paired with every word of the
    ``en_inp`` line at the same index; each source word then gets the same
    probability for all target words it was paired with.

    Returns:
        The conditional distribution keyed by source word.
    """
    print("Initializing Uniform Prob distribution")
    N = len(de_inp)
    if N != len(en_inp):
        print("number of lines in src and target don't match!")

    ten_de = CondFDist()
    for num, de_line in enumerate(de_inp):
        for de_word in de_line.split():
            for en_word in en_inp[num].split():
                ten_de[de_word].inc(en_word)

    # Replace the co-occurrence counts by a uniform probability.
    for de_word in ten_de.conditions():
        translations = ten_de[de_word]
        for key in translations.keys():
            translations[key] = 1.0 / len(translations)

    return ten_de
def train(self):
    """
    Train the simple baseline: count the pairs returned by
    ``get_tag_word_tuples()`` for every sequence in ``self.corpus.train``,
    disregarding any context, and keep the counts in ``self.word_pos_cfd``.
    """
    def training_pairs():
        for seq_list in self.corpus.train:
            for pair in seq_list.get_tag_word_tuples():
                yield pair

    self.word_pos_cfd = ConditionalFreqDist(training_pairs())
def find_language(string):
    """Print the most probable language of ``string``.

    The alphabetic, space-separated words of ``string`` are looked up in
    every UDHR text whose file id contains ``'Latin1'``.  The language with
    the most matching words is reported together with the percentage of
    words that matched.  Nothing is returned.
    """
    text = [word for word in string.split(" ") if word.isalpha()]
    l = len(text)
    if l == 0:
        # Without any word there is nothing to match (and the percentage
        # below would divide by zero).
        print("The text contains no alphabetic words; its language cannot be determined.")
        return

    avail_langs = [file for file in udhr.fileids() if 'Latin1' in file]
    if not avail_langs:
        print("No Latin1 UDHR texts are available; the language cannot be determined.")
        return

    pairs = []
    for lang in avail_langs:
        # Build the vocabulary once per language; testing membership
        # against udhr.words(lang) directly rescans the corpus per word.
        vocabulary = set(udhr.words(lang))
        pairs.extend((lang, word) for word in text if word in vocabulary)
    cfd = ConditionalFreqDist(pairs)

    # Ascending by number of matched words, so the best guess is last.
    ls = sorted([(lang, cfd[lang]) for lang in avail_langs],
                key=lambda tple: tple[1].N())
    print("The most probable language of the text is {0} with {1:3.3f}% probability.".format(ls[-1][0].replace('-Latin1',''), 100*ls[-1][1].N()/l))
def suf_tag_freq(self):
    """Count tags per word suffix.

    For every word in ``self.wt_freq`` that is not in ``self.c_words`` and
    every tag it was seen with, the word/tag count is added to each of the
    word's suffixes.  The empty suffix ``''`` is incremented once per
    word/tag combination.

    Returns:
        A ``ConditionalFreqDist`` mapping suffix -> tag counts.
    """
    cfd = ConditionalFreqDist()
    for w in set(self.wt_freq.keys()) - set(self.c_words):
        for t in self.wt_freq[w].keys():
            # min(), not max(): with max() the limit self.max_suf_len never
            # applied, and for short words slices longer than the word
            # re-counted the whole word once per extra length.
            # NOTE(review): the upper bound is exclusive, so the longest
            # suffix has min(max_suf_len, len(w)) - 1 characters -- confirm
            # that is the intended range.
            for suf_len in xrange(1, min(self.max_suf_len, len(w))):
                suf = w[-suf_len:]
                cfd[suf].inc(t, self.wt_freq[w][t])
            cfd[''].inc(t)
    return cfd
def language_model(collection):
    """Return an MLE bigram model of the words in ``collection``.

    The collection is tokenised with ``tokenize_collection``; the result is
    a ``ConditionalProbDist`` giving, for every word, the maximum-likelihood
    distribution of the word that follows it.
    """
    from nltk import ConditionalProbDist
    from nltk import ConditionalFreqDist
    from nltk import bigrams
    from nltk import MLEProbDist

    word_pairs = bigrams(tokenize_collection(collection))
    return ConditionalProbDist(ConditionalFreqDist(word_pairs), MLEProbDist)
def train(self):
    """
    Construct the conditional frequencies and probabilities
    (emission, transition and word/tag) from ``self.tagged_sents``.
    """
    # The tag sequence is extracted before replaceUnique() runs.
    tags = [tag for (_word, tag) in self.tagged_sents]
    self.replaceUnique()

    # Emission: how often a tag is realised as a given word,
    # e.g. the probability that a VB is 'race'.
    tag_word_pairs = [pair[::-1] for pair in self.tagged_sents]
    self.emission_frequencies = ConditionalFreqDist(tag_word_pairs)
    self.tagset_size = len(self.emission_frequencies.conditions())
    self.emission_probabilities = ConditionalProbDist(
        self.emission_frequencies, MLEProbDist)

    # Transition: how often a tag is followed by another tag.
    self.transition_frequencies = ConditionalFreqDist(bigrams(tags))
    self.transition_probabilities = ConditionalProbDist(
        self.transition_frequencies, MLEProbDist)

    # Word -> tag counts.
    self.word_tag_frequencies = ConditionalFreqDist(self.tagged_sents)
def get_lookup_tagger_accuracy(test_set, lookup_tagger_basis, corpus):
    """Evaluate a lookup tagger built from the 200 most frequent basis items.

    Args:
        test_set: Data handed to ``UnigramTagger.evaluate``.
        lookup_tagger_basis: Iterable of sentences whose items are counted.
        corpus: Corpus whose ``tagged_words()`` supply the word/tag counts.

    Returns:
        The result of ``evaluate`` on ``test_set``.
    """
    basis_freq = FreqDist(
        [item for sent in lookup_tagger_basis for item in sent])
    tags_by_word = ConditionalFreqDist(corpus.tagged_words())

    likely_tags = {}
    for item, _count in basis_freq.most_common(200):
        # presumably each item is a (word, tag) pair, so item[0] is the
        # word -- verify against the caller
        likely_tags[item[0]] = tags_by_word[item[0]].max()

    return UnigramTagger(model=likely_tags).evaluate(test_set)
def __get_conditional_freq_dist(self):
    """Return a ConditionalFreqDist of each n-gram's last item given the rest.

    Iterates over ``self.__ngram`` by index through a ``trange`` progress
    bar; the condition is the tuple of all items but the last.
    """
    progress = trange(
        len(self.__ngram),
        desc=f'Creating Conditional frequency distributions for {len(self.__ngram[0])}-gram'
    )
    condition_pairs = [
        (tuple(self.__ngram[i][:-1]), self.__ngram[i][-1]) for i in progress
    ]
    return ConditionalFreqDist(condition_pairs)
def nltk_test_3():
    """Print 20 words of random text generated from Austen's *Persuasion*.

    Counts, for every word of the text, the words that follow it, then
    starts at 'therefore' and repeatedly jumps to a randomly chosen
    follower of the current word.  The total bigram count and the number
    of distinct conditions are printed first.
    """
    from nltk import bigrams

    # For each token, count the current word given the previous word.
    # (The table used to be keyed by word length, so looking up a word in
    # it always produced an empty distribution.)
    cfd = ConditionalFreqDist(
        bigrams(gutenberg.words('austen-persuasion.txt')))
    print(cfd.N())
    # Only the number of conditions is shown; with words as conditions the
    # full list would be the whole vocabulary.
    print(len(cfd.conditions()))

    # Start predicting at the given word, say 'therefore'.
    word = 'therefore'
    generated = []
    for _ in range(20):
        generated.append(word)
        # Find all words that can possibly follow the current word and
        # choose one at random; stop early at a word without followers.
        followers = list(cfd[word])
        if not followers:
            break
        word = choice(followers)
    print(' '.join(generated))
def display():
    """Plot lookup-tagger performance against model size for Brown 'news'.

    ``performance`` is evaluated for the 1, 2, 4, ... 2**14 most frequent
    words and the results are shown with pylab.
    """
    import pylab

    news_word_freq = FreqDist(brown.words(categories='news'))
    words_by_freq = news_word_freq.most_common(2**15)
    cfd = ConditionalFreqDist(brown.tagged_words(categories='news'))

    sizes = 2 ** pylab.arange(15)
    perfs = []
    for size in sizes:
        perfs.append(performance(cfd, words_by_freq[:size]))

    pylab.plot(sizes, perfs, '-bo')
    pylab.title('Lookup Tagger Performance with Varying Model Size')
    pylab.xlabel('Model Size')
    pylab.ylabel('Performance')
    pylab.show()
def conditional_dist():
    """Count the filtered words of every Gutenberg text.

    Returns:
        A ``ConditionalFreqDist`` whose conditions are the Gutenberg file
        ids and whose samples are the words kept by ``freq_dist_filter``.
    """
    cfdist = ConditionalFreqDist()
    for fileid in corpus.gutenberg.fileids():
        kept_words = freq_dist_filter(corpus.gutenberg.words(fileid))
        for word in kept_words:
            # Unseen samples start from a count of zero.
            cfdist[fileid][word] += 1
    return cfdist
def visualize_monthly_cfd(cfd, figuretitle, ylabel, imgoutpath):
    """Draw grouped bar charts of per-month counts, five months per figure.

    ``cfd`` is read as month -> entity -> count.  One bar series is drawn
    per entity; the months are sorted and split into chunks of five, each
    chunk being handed to ``cfdplotter.multiplebargraphs``.  Returns early
    when there is nothing to plot.
    """
    # Inverting the distribution is only used to obtain the entity names.
    cfd_reverse = ConditionalFreqDist((entity, month) for month in cfd.conditions() for entity in list(cfd[month]))

    months = cfd.conditions()
    months.sort()

    barlabels = cfd_reverse.conditions()

    #print months

    print barlabels

    # One row per entity, one column per (sorted) month.
    yitemsmatrix = []

    for entity in barlabels:
        row = []
        for month in months:
            row.append(cfd[month][entity])
        yitemsmatrix.append(row)

    if len(barlabels) == 0 or len(yitemsmatrix) == 0:
        return

    yitemsmatrix = np.array(yitemsmatrix)
    #yitemsmatrix = yitemsmatrix.T
    print yitemsmatrix.shape

    colors = plotter.get_n_colors(len(barlabels))
    # NOTE(review): months is sliced and measured below, which relies on
    # map() returning a list (Python 2 behaviour).
    months = map(lambda x : str(x), months)

    # partition the figure in case x axis gets too large by the number of months
    numofxitems = 5
    # NOTE(review): relies on integer division; when the number of months is
    # an exact multiple of numofxitems the last chunk is empty.
    numoffigures = (len(months) / numofxitems ) + 1
    for i in range(numoffigures):
        matrix = yitemsmatrix[:, (i*numofxitems) : ((i+1)*numofxitems)]
        print matrix
        xlabels = months[(i*numofxitems) : ((i+1)*numofxitems)]
        # save fig. pass img path with i
        figurename = figuretitle + " "+ str(i)
        cfdplotter.multiplebargraphs(barlabels, matrix.tolist(), colors, figurename, xlabels, ylabel, imgpath=imgoutpath)
def __init__(self):
    """Start with empty edit-probability tables.

    ``del_probs`` and ``ins_probs`` are MLE probability distributions over
    an empty frequency distribution; ``sub_probs`` is a conditional
    probability distribution over an empty conditional frequency
    distribution.
    """
    no_deletions = FreqDist()
    no_insertions = FreqDist()
    no_substitutions = ConditionalFreqDist()

    # How likely each character is to be deleted.
    self.del_probs = MLEProbDist(no_deletions)
    # How likely each character is to be inserted.
    self.ins_probs = MLEProbDist(no_insertions)
    # How likely a given character is to be replaced by another character.
    self.sub_probs = ConditionalProbDist(no_substitutions, MLEProbDist)
def learn(self, A):
    """Collect class and feature statistics from the training pairs ``A``.

    ``A`` is iterated as ``(x, y)`` pairs where ``y`` is the class and
    ``get_words(x)`` yields the features of ``x``; each distinct feature is
    counted once per item.  The method fills ``self.cls_fd``,
    ``self.feature_fd``, ``self.voc``, ``self.cls_feature_prob``,
    ``self.cls_and_feature_prob`` and ``self.feature_prob``.
    """
    total_y = float(len(A))
    self.cls_fd = cls_fd = FreqDist()
    self.feature_fd = feature_fd = FreqDist()
    pairs = []
    for x, y in A:
        cls_fd.inc(y)
        # set(): a feature counts at most once per training item
        for feature in set(get_words(x)):
            pairs.append((y, feature))
            feature_fd.inc(feature)
    # class -> feature -> number of items of that class with the feature
    cfd = ConditionalFreqDist(pairs)

    if DEBUG:
        print cfd
        print cfd.conditions()

        #cfd.tabulate(samples=['gbs', 'build', 'spec', 'repo', 'config'])
        cfd.tabulate()
        for author in cfd.conditions():
            print 'AUTHOR:', author
            for word, count in cfd[author].items():
                print '%5d %20s' % (count, word)

    self.voc = voc = feature_fd.keys()

    self.cls_feature_prob = cls_feature_prob = {}
    self.cls_and_feature_prob = cls_and_feature_prob = {}
    for cls, total in cls_fd.items():
        fd = cfd[cls]
        # NOTE(review): this stores an empty dict under the bare class key
        # next to the (cls, word) keys below; `wc` is never used afterwards.
        cls_feature_prob[cls] = wc = {}
        for word in voc:
            if word in fd:
                cls_feature_prob[(cls, word)] = float(fd[word]) / total
                cls_and_feature_prob[(cls, word)] = float(fd[word]) / total_y
            else:
                # Features unseen for this class are treated as seen once.
                cls_feature_prob[(cls, word)] = 1. / total
                cls_and_feature_prob[(cls, word)] = 1. / total_y

    # Share of the training items that contain each feature.
    self.feature_prob = feature_prob = {}
    for word, count in feature_fd.items():
        feature_prob[word] = count / total_y
#!/usr/bin/env python
from nltk.corpus import brown
from nltk import FreqDist, ConditionalFreqDist

# fd counts every tag; cfd counts the tags seen for every token.
fd = FreqDist()
cfd = ConditionalFreqDist()

# Walk over each (token, tag) pair of each tagged sentence in the corpus and
# update both count(tag) and count(tag given token).
for sentence in brown.tagged_sents():
    for (token, tag) in sentence:
        fd[tag] += 1
        cfd[token][tag] += 1

# The most frequent tag is ...
print(fd.max())

# (number of distinct tags, token) tuples, highest tag count first.
wordbins = sorted(
    ((cfd[token].B(), token) for token in cfd.conditions()),
    reverse=True)

# The token with the maximum number of tags is ...
print(wordbins[0])

# masculine pronouns
from nltk.corpus import brown
from nltk import ConditionalFreqDist as CondFreqDist

categories = brown.categories()

# Two candidate word lists; the second assignment is the one in effect.
words = ["likely", "perhaps", "probably", "maybe"]
words = ["female", "male", "gentleman", "lady", "boy", "girl"]

# Count every word of the Brown corpus per category, then show the
# category-by-word table for the selected words only.
cfd = CondFreqDist(
    (cat, word)
    for cat in categories
    for word in brown.words(categories=cat)
)
cfd.tabulate(conditions=categories, samples=words)
#!/usr/bin/python
#coding=utf-8
from nltk import ConditionalFreqDist
from nltk.corpus import brown

words = brown.tagged_words(tagset='universal')

# Which word has the largest number of distinct part-of-speech tags?
maximumTagNumber = 0
result = ''
cfd = ConditionalFreqDist((word.lower(), tag) for (word, tag) in words)
for word in cfd.conditions():
    tag_count = len(cfd[word])
    if tag_count < maximumTagNumber:
        continue
    # "<word> (<tags, most frequent first>)"
    entry = word + ' (' + ', '.join(tag for (tag, _) in cfd[word].most_common()) + ')'
    if tag_count > maximumTagNumber:
        # A new maximum replaces everything collected so far.
        maximumTagNumber = tag_count
        result = entry
    else:
        # A tie with the current maximum is appended on its own line.
        result += '\n' + entry
print(result)
def modal_analysis(keyword_list, modals_list):
    """Tabulate ``keyword_list`` conditions against ``modals_list`` samples.

    Builds a ConditionalFreqDist from the two arguments, then calls its
    ``tabulate`` with ``keyword_list`` as the conditions and ``modals_list``
    as the samples, returning whatever ``tabulate`` returns.

    NOTE(review): ConditionalFreqDist is called with two positional
    arguments here, while NLTK documents a single iterable of
    (condition, sample) pairs -- confirm this call works with the NLTK
    version in use and what pairing was intended.
    """
    cfd = ConditionalFreqDist(keyword_list, modals_list)
    return cfd.tabulate(conditions=keyword_list, samples=modals_list)
from nltk.corpus import brown
from nltk import ConditionalFreqDist as CondFreqDist


def _genre_word_pairs(targets=("romance", "news")):
    """Yield (genre, lowercased word) for Brown genres starting with a target."""
    for genre in brown.categories():
        for target in targets:
            if not genre.lower().startswith(target):
                continue
            for word in brown.words(categories=target):
                yield (genre, word.lower())


cfd = CondFreqDist(_genre_word_pairs())

# Samples to tabulate: the weekdays plus two extra words.
days = [
    "monday",
    "tuesday",
    "wednesday",
    "thursday",
    "friday",
    "saturday",
    "sunday",
    "love",
    "political",
]
cfd.tabulate(samples=days)
def _split_cells(column):
    """Flatten a column of ';'-separated cells, dropping empty items."""
    return [item for cell in column for item in cell.split(';') if item]


themes = _split_cells(df[df.columns[2]])
print(len(themes))  # 45648; it turns out most have only 2 themes
print({len(word) for word in themes})  # {1, 2, 3, 4, 5, 6}; the longest theme has length 6
print([word for word in themes if len(word) == 6])  # ['iphone', 'iphone', 'iphone', 'iphone']; surprisingly these are English...

# Sentiment keywords
sen_words = _split_cells(df[df.columns[3]])
print(len(sen_words))  # 45648
print({len(word) for word in sen_words})  # {1, 2, 3, 4, 5, 6}; the longest entry has length 6
print([word for word in sen_words if len(word) == 6])  # ['没有物美价廉', '不会心平气和', '不是别出心裁', '不是结实耐用, ...]...; negated phrases like these are awkward to work with...

# Sentiment values (positive / negative)
anls = _split_cells(df[df.columns[4]])
print(len(anls))  # 45648

## Combine sen_words and anls: pair each sentiment word with its value to find
## words that were given more than one value
print(sen_words[:10])  # ['实惠', '快', '也好', '太长', '太贵', '不方便', '差', '无语', '满意', '好']
print(anls[:10])  # ['1', '1', '1', '-1', '-1', '-1', '-1', '-1', '1', '1']
con = ConditionalFreqDist(zip(sen_words, anls))
print(con)  # <ConditionalFreqDist with 3032 conditions>; identical keys were merged
print([condition for condition in con.conditions() if len(con[condition].keys()) > 1])  # ['不容易', '不高']; only two words carry more than one sentiment value (-1, 0, 1)

## Save theme, sentiment_word and anls
# NOTE(review): the files are opened without an explicit encoding, so the
# platform default is used for this non-ASCII text -- confirm that is intended.
with open('./tmp_dataset/BDCI2017-taiyi/theme.txt', 'w') as f:
    f.write('\n'.join(themes))
with open('./tmp_dataset/BDCI2017-taiyi/word.txt', 'w') as f:
    f.write('\n'.join(sen_words))
with open('./tmp_dataset/BDCI2017-taiyi/word_score.txt', 'w') as f:
    scored_lines = [word + ' ' + score for word, score in zip(sen_words, anls)]
    f.write('\n'.join(scored_lines))

##################################################################
## Part 2: preprocessing; split the DataFrame into four lists kept separately
# df = xlsx.parse("Sheet1")
# (NaN was replaced by NULL above, hence the re-import; it later turned out
# to be unnecessary -- dropping the NULL entries at use time is enough)
contents = [str(value) for value in df[df.columns[1]].values]
print(contents[:10])
themes = [str(value) for value in df[df.columns[2]].values]
print(themes[:10])
words = [str(value) for value in df[df.columns[3]].values]
print(words[:10])
anls = [str(value) for value in df[df.columns[4]].values]
print(anls[:10])
print('len of contents:', len(contents))  # len of contents: 20000
print('len of words:', len(words))  # len of words: 20000
#!/usr/bin/python
# coding: utf-8
# 2013/03/20

from nltk import ConditionalFreqDist

# Quick reference for ConditionalFreqDist. The names pairs, sample, samples,
# conditions, cfdist1 and cfdist2 are placeholders that this snippet does not
# define; supply them before running any of the lines below.

cfdist = ConditionalFreqDist(pairs)   # build the distribution from (condition, event) pairs
cfdist.conditions()                   # list of the conditions (the original note says alphabetically
                                      # sorted -- NOTE(review): verify for the NLTK version in use)
cfdist['条件']                        # frequency distribution for the given condition
cfdist['条件'][sample]                # count of `sample` under that condition

# The method is spelled tabulate(); the former spelling `tablate` raised
# AttributeError. samples/conditions are passed by keyword because tabulate()
# and plot() look them up as keyword arguments.
cfdist.tabulate()
cfdist.tabulate(samples=samples, conditions=conditions)
cfdist.plot()
cfdist.plot(samples=samples, conditions=conditions)
cfdist1 < cfdist2
from load_data import get_df, select_columns
from itertools import combinations
from nltk import ConditionalFreqDist

# Load the data as a dataframe and keep only the rows whose DataSource is APX.
df = get_df(shortname='clean_apx')
mask_data_source = df['DataSource'] == 'APX'
df_select = df[mask_data_source]

# Restrict to the selected columns, with every value cast to string.
df_select = df_select[select_columns].astype(str)

# Analyse a smaller subset of the columns, taken two at a time.
report_columns = select_columns[1:5]
combo_count = 2
groupby_columns = list(combinations(report_columns, combo_count))

# Only the first column pair is used: each row becomes one
# (condition, sample) tuple.
groupby_column = list(groupby_columns[0])
arr = df_select[groupby_column].values
pairs = [tuple(row) for row in arr]

# Count the second column's values conditioned on the first column's value.
cfd = ConditionalFreqDist(pairs)
conditions = cfd.conditions()

# Stop in the debugger so the result can be explored interactively.
import pdb
pdb.set_trace()
#!/usr/bin/python3
# coding: utf-8
import nltk
from nltk import ConditionalFreqDist
from nltk.corpus import brown
from nltk.corpus import names
from nltk.corpus import inaugural
from nltk.corpus import toolbox
from nltk.corpus import udhr

##################################################################
## ConditionalFreqDist basics: text sentiment analysis
word = ['实惠', '快', '也好', '快', '也好']
anls = ['1', '1', '1', '-1', '1']
tmp_Con = ConditionalFreqDist(zip(word, anls))
print(tmp_Con)  # <ConditionalFreqDist with 3 conditions>; identical keys were merged
# tabulate() prints the table itself; wrapping it in print() only added a
# stray "None" line after the table.
tmp_Con.tabulate()
print(tmp_Con.conditions())  # ['实惠', '快', '也好']
print(tmp_Con['快'].most_common())  # [('1', 1), ('-1', 1)]
print(tmp_Con['快'].keys())  # dict_keys(['1', '-1'])
print(len(tmp_Con['快'].keys()))  # 2; shows how many distinct labels a word has
print(len(tmp_Con['也好'].keys()))  # 1; repeated labels are collapsed, as in a set()
print([condition for condition in tmp_Con.conditions() if len(tmp_Con[condition].keys()) > 1])  # ['快']
tmp_Con.plot()

# With the zip arguments swapped, the sentiment labels become the conditions.
tmp_Con_1 = ConditionalFreqDist(zip(anls, word))
print(tmp_Con_1.conditions())  # ['1', '-1']

##################################################################
## Brown corpus: word counts grouped by category
print(brown.categories())  # ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
cfd = ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre)  # categories=genre must not be dropped here
)
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']  # picked from brown.categories()
modals = ['can', 'could', 'may', 'might', 'must', 'will']  # a handful of arbitrarily chosen words
def find_word_matrices(self, newsidlist, processcontent=True, prepend="content"): dateroots = [] datePOStag = [] titleexclamation = [("newsid", "title_exclamation")] textPOStag = [] textroots = [] textrootsWpostag = [] textliterals = [] print prepend, " processing:" for newsid in newsidlist: print "newsid ",newsid filepath = extractnewsmetadata.newsid_to_filepath(newsid) content, title, date = extractnewsmetadata.get_news_article2(filepath) text = "" if processcontent: text = content else: text = title if "!" in title: titleexclamation.append((newsid, 1)) else: titleexclamation.append((newsid, 0)) words = texter.getwords(text) lemmata = SAKsParser.lemmatize_lexicon(words) for (literal, literalPOS, root, rootPOS) in lemmata: root = texter.cleanword(root) if (len(root) > 0) or (not root.isspace()): #print root, textPOStag.append((newsid, literalPOS)) textroots.append((newsid, root)) textrootsWpostag.append((newsid, root+" Wpostag "+rootPOS)) textliterals.append((newsid, literal+" Wpostag "+literalPOS)) dateroots.append((date, root)) datePOStag.append((date, literalPOS)) cfd_dateroots = ConditionalFreqDist(dateroots) cfd_datepostag = ConditionalFreqDist(datePOStag) cfd_textpostag = ConditionalFreqDist(textPOStag) cfd_textroots = ConditionalFreqDist(textroots) cfd_textrootWpostag = ConditionalFreqDist(textrootsWpostag) cfd_textliterals = ConditionalFreqDist(textliterals) print "some id's", cfd_textroots.conditions() cfd_roottext = ConditionalFreqDist((word, docid) for docid in cfd_textroots.conditions() for word in list(cfd_textroots[docid])) # cfd to csv conditems as cols duzelt: csvpath = os.path.join(self.matrixpath, prepend+"-dateroot.csv") CFDhelpers.cfd_to_matrix(cfd_dateroots, csvpath) csvpath = os.path.join(self.matrixpath, prepend+"-datepostag.csv") CFDhelpers.cfd_to_matrix(cfd_datepostag, csvpath) csvpath = os.path.join(self.matrixpath, prepend+"-postagCOUNT.csv") CFDhelpers.cfd_to_matrix(cfd_textpostag, csvpath) termcountcsvpath = 
os.path.join(self.matrixpath, prepend+"termCOUNT.csv") CFDhelpers.cfd_to_matrix(cfd_textroots, termcountcsvpath) tfidfcsvpath = os.path.join(self.matrixpath, prepend+"termTFIDF.csv") texter.compute_tfidf_ondisc(termcountcsvpath, tfidfcsvpath) csvpath = os.path.join(self.matrixpath, prepend+"-rootcountindex.csv") CFDhelpers.cfd_to_matrix(cfd_roottext, csvpath) csvpath = os.path.join(self.matrixpath, prepend+"rootWpostagCOUNT.csv") CFDhelpers.cfd_to_matrix(cfd_textrootWpostag, csvpath) csvpath = os.path.join(self.matrixpath, prepend+"literalWpostagCOUNT.csv") CFDhelpers.cfd_to_matrix(cfd_textliterals, csvpath) # diger csv'lerden devam 6 Subat 05:42 uyuyuyuyuyuyu # kalklaklkalklklaklaklkal 15:32 if not processcontent: print "keep exclamation !" IOtools.tocsv_lst(titleexclamation, os.path.join(self.matrixpath, prepend+"-exclamation.csv"))
# Collect (word, tag) pairs for the current token.
# NOTE(review): `token` is not bound in this excerpt; this `if` presumably sits
# inside a per-token loop whose header is outside the view -- confirm the
# nesting (including where `long_tag` is assigned) against the full file.
if token.has_key('tag'):
    if token['tag'] == None:
        # No tag on this token: use a placeholder short tag.
        short_tag = '--'
    else:
        # Short tag = first two characters plus the last character of the tag.
        short_tag = token['tag'][:2]+token['tag'][-1:]
    long_tag = token['tag']
    tag_types.add(long_tag)
    if token['lemma']:
        lemma_pos = token['lemma']+'.'+get_wordnet_pos(token['pos'])
        lemma_pairs.append((token['lemma'], short_tag))
        lemma_long_pairs.append((token['lemma'], long_tag))
    tagged_pairs.append((token['textlc'], short_tag))

# Print vocabularies for each tag type
for tag_type in tag_types:
    # Only the lemma pairs carrying this long tag are counted.
    vocabulary_cfd = ConditionalFreqDist([(lemma, long_tag) for (lemma, long_tag) in lemma_long_pairs if long_tag == tag_type])
    print vocabulary_cfd.tabulate()

#events_cfd = ConditionalFreqDist(tagged_pairs)
# Conditional frequency distribution for (lemma, tag) pairs
events_cfd = ConditionalFreqDist(lemma_pairs)
# A lemma is "unambiguous" when fewer than two distinct short tags were seen
# for it, and "ambiguous" when more than one was seen.
unambiguous_words = [word for word in events_cfd.conditions() if len(events_cfd[word].items()) < 2]
ambiguous_words = [word for word in events_cfd.conditions() if len(events_cfd[word].items()) > 1]
print "Unambiguous Words"
print events_cfd.tabulate(conditions=unambiguous_words)
print "Ambiguous Words"
print events_cfd.tabulate(conditions=ambiguous_words)
from nltk.corpus import names
from nltk import ConditionalFreqDist as CondFreqDist

# (gender file id, first letter) for every name in the corpus.
initial_pairs = [
    (gender, name[0])
    for gender in names.fileids()
    for name in names.words(gender)
]

# g2n: first letters conditioned on gender; n2g: the same pairs reversed.
g2n = CondFreqDist(initial_pairs)
n2g = CondFreqDist((letter, gender) for (gender, letter) in initial_pairs)

g2n.plot()
n2g.plot()
#!/usr/bin/python
#coding:utf-8
import nltk
from nltk.corpus import reuters
from nlp.discounting import discount
from nltk import ConditionalFreqDist as CFreqDist
from scipy.sparse import lil_matrix,csr_matrix

# Tokenize the first Reuters document into sentences of words.
fids = [reuters.fileids()[0]]
docs = []
for fid in fids:
    raw_text = reuters.raw(fileids=[fid])
    docs.append([nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(raw_text)])

# Map each distinct word to its index in the sorted vocabulary.
word_list = sorted({word for doc in docs for sent in doc for word in sent})
word_dict = {word: i for i, word in enumerate(word_list)}
idx_docs = [[[word_dict[word] for word in sent] for sent in doc] for doc in docs]

# Trigrams over word indices, counted as (condition, word) pairs.
trigram_docs = [[discount.ngrams2(sent, 3) for sent in doc] for doc in idx_docs]
tri_fd = CFreqDist(gram for doc in trigram_docs for sent in doc for gram in sent)

l = len(word_list)
side = l + 1

# Maximum-likelihood estimate: count(cond, word) / count(cond).
# Each condition cond is mapped to the row cond[0]*(l+1) + cond[1].
# NOTE(review): this matrix is rebound by the count matrix built right after
# it -- confirm whether both are meant to be kept.
A = lil_matrix((side ** 2, side))
for cond in tri_fd.conditions():
    dist = tri_fd[cond]
    n = float(dist.N())
    row = cond[0] * side + cond[1]
    for word, val in dist.items():
        A[row, word] = val / n

# Raw counts in the same layout.
A = lil_matrix((side ** 2, side), dtype=int)
for cond in tri_fd.conditions():
    row = cond[0] * side + cond[1]
    for word, val in tri_fd[cond].items():
        A[row, word] = val
def inspect(self, missed):
    """
    Report the tagging mistakes of a test run and page through the frequent ones.

    Prints a gold-tag by hmm-tag table of the mistakes, lists every mistake
    type that occurred more than Tagger.mistake_threshold times (most
    frequent first), then shows each individual mistake of those types with
    both sentence contexts, pausing for user input after each one.

    :param missed: list of tuples of missed tags like:
        (hmm_tagged_word, gold_tagged_word, hmm_context, gold_context)
    """

    def render(tagged_sent):
        # "word/tag word/tag ..." for one tagged sentence
        return ' '.join([(w[0] + "/" + w[1]) for w in tagged_sent])

    # condition = gold tag (ms[1][1]), sample = tag the hmm assigned (ms[0][1])
    cfd = ConditionalFreqDist((ms[1][1], ms[0][1]) for ms in missed)

    # print a table showing mistake frequency
    cfd.tabulate()
    msg("\n")

    # group the (hmm_tag, gold_tag) mistake types by how often each occurred
    mistakes = {}
    for g_tag in cfd.conditions():
        for hmm_tag in cfd[g_tag].keys():
            count = cfd[g_tag][hmm_tag]
            mistakes.setdefault(count, []).append((hmm_tag, g_tag))

    # frequencies above the threshold, worst first
    mistake_counts = sorted(
        (count for count in mistakes if count > Tagger.mistake_threshold),
        reverse=True)

    # list the high-frequency mistake types and remember them
    mistakes_to_halt = []
    for count in mistake_counts:
        for mistake_tuple in mistakes[count]:
            mistakes_to_halt.append(mistake_tuple)
            msg("%d\t%s\twas really\t%s\n" % (count, mistake_tuple[0],
                                             mistake_tuple[1]))
    msg("\n")

    # separators used when outputting missed word contexts
    sep_big = "---------------------------------------------------\n"
    sep_small = "\n-----------------------------------------\n"

    # walk through the individual mistakes; for those of a high-frequency
    # type, show the mistake with the gold-standard and hmm-tagged sentences
    response = None
    for missed_set in missed:
        if response in ['q', 'Q']:
            break

        (hmm_tagged_word, gold_tagged_word, hmm_tagged_sent,
         gold_tagged_sent) = missed_set

        # skip mistakes that are not of a type we halt for
        if (hmm_tagged_word[1], gold_tagged_word[1]) not in mistakes_to_halt:
            continue

        msg("%sTagged '%s' with %s when it should have been %s.%s" %
            (sep_big, hmm_tagged_word[0], hmm_tagged_word[1],
             gold_tagged_word[1], sep_small))
        msg("Gold: " + render(gold_tagged_sent))
        msg(sep_small)
        msg("Mine: " + render(hmm_tagged_sent))

        # get user input to decide whether to keep going
        response = raw_input("\n\nEnter to continue, Q to quit: ")
from nltk.corpus import gutenberg from nltk import ConditionalFreqDist from random import choice #create the distribution object cfd = ConditionalFreqDist() ## for each token count the current word given the previous word prev_word = None for word in gutenberg.words('austen-persuasion.txt'): cfd[prev_word][word] += 1 prev_word = word ## start predicting at given word, say "therefore" word = "therefore" i = 1 ## find all words that can follow the given word and choose one at random while i<20: print word, lwords = cfd.get(word).keys() follower = choice(lwords) word = follower i += 1
Output data: Graph. """ import nltk wnl = nltk.WordNetLemmatizer() from nltk.corpus import PlaintextCorpusReader from nltk import ConditionalFreqDist corpus = PlaintextCorpusReader('C:/Data/Candidate_tweets/Processing_tweets/By_week_tweets/Cleaned_by_week/', '.*') corpus.fileids()[0:3] print len(corpus.words()) cfd = ConditionalFreqDist( (target, fileid) for fileid in corpus.fileids() for w in corpus.words(fileid) for target in ['obama', 'romney', 'opponent'] if w==target) cfd.plot() cfd = nltk.ConditionalFreqDist( (target, fileid) for fileid in corpus.fileids() for w in corpus.words(fileid) for target in ['democrat', 'republican', 'independent'] if w==target) cfd.plot()
def pos_percentages(words, tag='NN'):
    """Return the fraction of ``words`` whose part-of-speech tag matches ``tag``.

    ``words`` is tagged with the module-level ``tagger``. A token counts when
    ``re.match(tag, token_tag)`` succeeds, so ``tag`` is a regular expression
    matched at the start of the token's tag (the default 'NN' therefore also
    matches tags such as 'NNS').

    Returns 0.0 for an empty ``words`` rather than dividing by zero. Tokens
    whose tag is None are counted as non-matching instead of raising
    TypeError inside ``re.match``.
    """
    total = len(words)
    if total == 0:
        return 0.0

    # Compile once; the same pattern is tried against every token's tag.
    matches = re.compile(tag).match
    hits = sum(
        1
        for _, token_tag in tagger.tag(words)
        if token_tag is not None and matches(token_tag)
    )
    return float(hits) / total