def mostprobableparse(self, sent, sample=None):
    """warning: this problem is NP-complete. using an unsorted chart parser
    avoids unnecessary sorting (since we need all derivations anyway).

    @param sent: a sequence of terminals
    @param sample: None or int; if int then sample that many parses"""
    p = FreqDist()
    for a in self.parser.nbest_parse(sent, sample):
        p.inc(removeids(a).freeze(), a.prob())
    if p.max():
        return ProbabilisticTree(p.max().node, p.max(), prob=p[p.max()])
    else:
        raise ValueError("no parse")
def plot_freq(productions):
    # Plot the frequency-of-frequencies curve: for each count x
    # (1 .. highest production count), how many productions occur exactly x times.
    prod_fd = FreqDist(productions)
    prod_to_dist = [prod_fd[key] for key in prod_fd]
    dist_fd = FreqDist(prod_to_dist)
    X_vec = list(range(prod_fd[prod_fd.max()]))[1:]
    Y_vec = [dist_fd[x] for x in X_vec]
    py.plot(X_vec, Y_vec)
def max_dist(emoList):
    # For each emotion, record the relative frequency of its most common sample,
    # then return the sample whose peak frequency is highest overall.
    x = {}
    for e in emoList:
        fd = FreqDist(emoList[e])
        m = fd.max()
        x[m] = fd.freq(m)
    return max(x, key=lambda k: x[k])
def select(self, key):
    '''
    select(key) mimics the db.fetch method from crusher.py
    The name select comes from the SQL syntax for processing select queries
    '''
    selection = []
    checksumList = []
    # Get data for the supplied key
    for i in range(1, VERSIONS):
        keyToSelect = keyForDb(key[0], i, key[1])
        try:
            selection.append(self.db.fetch(keyToSelect))
        except KeyError:
            selection.append("DOES_NOT_EXIST")
    # Get checksum for the supplied key
    for i in range(1, VERSIONS):
        if key[1][0] == "o":
            keyForChecksum = keyForDb(key[0], i, "om")
        if key[1][0] == "c":
            keyForChecksum = keyForDb(key[0], i, "cm")
        if key[1][0] == "t":
            keyForChecksum = keyForDb(key[0], i, "tm")
        try:
            checksumList.append(self.db.fetch(keyForChecksum))
        except KeyError:
            checksumList.append("CHECKSUM_DOES_NOT_EXIST")
        except UnboundLocalError:
            pass
    # Voting using NLTK's FreqDist module.
    freqSelection = FreqDist(selection)
    mostCommonFromSelection = freqSelection.max()
    freqChecksum = FreqDist(checksumList)
    mostCommonFromChecksum = freqChecksum.max()
    # Compare checksum
    if self.__CompareChecksumWithSelection__(mostCommonFromSelection, mostCommonFromChecksum) == True:
        return mostCommonFromSelection
    else:
        # Raise a checksum error if it does not match.
        # TO_DO: try more selections and checksum comparisons
        return mostCommonFromSelection
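
# A minimal sketch of the majority-voting idea used in select() above:
# FreqDist.max() picks the value returned by most replicas. The replica
# values below are invented for illustration.
from nltk import FreqDist

replica_values = ["42", "42", "DOES_NOT_EXIST"]  # one failed read out of three
votes = FreqDist(replica_values)
print(votes.max())   # '42' -- the value the majority of replicas agree on
print(votes["42"])   # 2   -- how many replicas agreed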
def choose_tag(self, tokens, index, history):
    word = tokens[index]
    fd = FreqDist()
    for synset in wordnet.synsets(word):
        fd[synset.pos()] += 1
    if fd:
        return self.wordnet_tag_map.get(fd.max())
    else:
        return None
def handle(self, *args, **options):
    fdist = FreqDist()
    print "Analyzing raw data"
    limit = 10
    if args:
        raw_datas = RawData.objects.filter(pk__in=args)
    else:
        raw_datas = RawData.objects.all()[:limit]
    tagged_data = []
    for raw_data in raw_datas:
        words = nltk.word_tokenize(raw_data.data)
        tagged_data.extend(nltk.pos_tag(words))
        for word in words:
            word = word.strip()
            if word:
                fdist.inc(word)
    print "Analyzed %s items" % len(raw_datas)
    print
    print "Top word: %s" % fdist.max()
    print
    print "Top 10 words"
    for word in fdist.keys()[:10]:
        times = fdist[word]
        print " -- %s occurred %s times" % (word, times)
    print
    print "Bottom 10 words"
    for word in fdist.keys()[-10:]:
        times = fdist[word]
        print " -- %s occurred %s times" % (word, times)
    print
    print "Words occurring between 50-100 times"
    words = [word for word in fdist.keys() if fdist[word] >= 50 and fdist[word] <= 100]
    print ", ".join(words)
    cfdist = ConditionalFreqDist()
    for (word, tag) in tagged_data:
        cfdist[tag].inc(word)
    print "Most popular noun: %s" % cfdist["NN"].max()
    print
    print "Top 50 nouns"
    for word in cfdist["NN"].keys()[:50]:
        times = cfdist["NN"][word]
        print " -- %s occurred %s times" % (word, times)
    print
def task2a(data):
    tags = []
    for key in data.keys():
        for sentence in data[key]:
            for _, tag in sentence:
                tags.append(tag)
    fd = FreqDist(tags)
    most_frequent_tag = fd.max()
    print("Most frequent tag: {}".format(most_frequent_tag))
    default_tagger = DefaultTagger(most_frequent_tag)
    test_tagger(default_tagger, data)
    return most_frequent_tag
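
# The most-frequent-tag baseline built in task2a can be exercised on a toy
# corpus; the tagged sentences below are invented for illustration.
from nltk import FreqDist
from nltk.tag import DefaultTagger

tagged = [[("the", "DT"), ("cat", "NN")], [("dogs", "NN"), ("bark", "VB")]]
tags = [tag for sent in tagged for _, tag in sent]
baseline = DefaultTagger(FreqDist(tags).max())   # 'NN' is most frequent here
print(baseline.tag(["some", "new", "words"]))
# [('some', 'NN'), ('new', 'NN'), ('words', 'NN')]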
def mm(path):
    rules = {}
    current_sents = get_sents(path)
    cpt = 0
    while len(current_sents) != 0:
        grams = extract_ngrams(current_sents)  # get the n-grams
        fd = FreqDist(grams)
        r = fd.max()  # r holds the most frequent gram
        rules["NT" + str(cpt)] = r
        current_sents = rplc(current_sents, r, "NT" + str(cpt))  # replace the gram with its rule name
        cpt += 1
    return rules
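
# The core step of mm() -- find the most frequent n-gram and mint a rule for
# it -- shown in isolation. extract_bigrams is a hypothetical stand-in for the
# extract_ngrams helper above, and the sentences are invented.
from nltk import FreqDist
from nltk.util import ngrams

def extract_bigrams(sents):
    return [g for s in sents for g in ngrams(s.split(), 2)]

sents = ["at nn vb", "at nn nn", "at nn vb"]
fd = FreqDist(extract_bigrams(sents))
print(fd.max())   # ('at', 'nn') -- the gram that would become rule NT0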
def run(path):
    rules = {}
    current_sents = func.get_tagged_sents(path)
    current_sents = func.compress_tags(current_sents)
    print("there's " + str(len(current_sents)) + " sentence(s)")
    text = ""
    for s in current_sents:
        if " " in s:
            text = text + s + "\n"
    cpt = 0
    while True:
        current_sents = text.split("\n")
        grams = extract_ngrams(current_sents)  # get the n-grams
        fd = FreqDist(grams)
        max_freq_gram = fd.max()  # the most frequent gram
        if fd[max_freq_gram] == 1:
            break
        r = ""  # the string of the gram to replace
        for g in max_freq_gram:
            r = r + g + " "  # e.g. ("at", "nn") => r = "at nn "
        r = r.strip()
        print(f"{cpt} => {r}")
        rules["NT" + str(cpt)] = r
        # replace the gram with its rule name
        text = text.replace(" " + r + " ", " NT" + str(cpt) + " ")
        text = text.replace(" " + r + "\n", " NT" + str(cpt) + "\n")
        text = text.replace("\n" + r + " ", "\nNT" + str(cpt) + " ")
        text = text.replace("\n" + r + "\n", "\nNT" + str(cpt) + "\n")
        cpt += 1
    # all remaining grams occur only once
    # up to here the code is correct; the rest is not working yet
    sentences = text.split("\n")
    sentences = sort_len(sentences)
    for i in range(0, len(sentences)):
        r = sentences[i]
        b = False  # the rule is not a sub-rule
        if text.count(r + " ") + text.count(r + "\n") > 1:
            b = True  # it is a sub-rule
        if len(sentences[i].split()) > 1:
            rules["NT" + str(cpt)] = r
            # replace the gram with its rule name
            text = text.replace(" " + r + " ", " NT" + str(cpt) + " ")
            text = text.replace(" " + r + "\n", " NT" + str(cpt) + "\n")
            text = text.replace("\n" + r + " ", "\nNT" + str(cpt) + " ")
            text = text.replace("\n" + r + "\n", "\nNT" + str(cpt) + "\n")
            if b == False:
                rule_base.append("NT" + str(cpt))
            cpt = cpt + 1
            sentences = text.split("\n")
            sentences = sort_len(sentences)
        else:
            if b == False:
                rule_base.append(r)
    return rules
def main():
    """ a basic REPL for testing """
    corpus = """(S (NP John) (VP (V likes) (NP Mary)))
(S (NP Peter) (VP (V hates) (NP Susan)))
(S (NP Harry) (VP (V eats) (NP pizza)))
(S (NP Hermione) (VP (V eats)))""".splitlines()
    corpus = """(S (NP (DT The) (NN cat)) (VP (VBP saw) (NP (DT the) (JJ hungry) (NN dog))))
(S (NP (DT The) (JJ little) (NN mouse)) (VP (VBP ate) (NP (DT the) (NN cat))))""".splitlines()
    #corpus = """(S (NP mary) (VP walks) (AP quickly))""".splitlines()
    #(S (NP Harry) (VP (V likes) (NP Susan) (ADVP (RB very) (RB much))))
    corpus = [Tree(a) for a in corpus]
    #d = GoodmanDOP(corpus, rootsymbol='S')
    from bitpar import BitParChartParser
    d = GoodmanDOP(corpus, rootsymbol='TOP', wrap='TOP', parser=BitParChartParser)
    #d = GoodmanDOP(corpus, rootsymbol='TOP', wrap='TOP')
    #print d.grammar
    print "corpus"
    for a in corpus:
        print a
    w = "foo!"
    while w:
        print "sentence:",
        w = raw_input().split()
        try:
            p = FreqDist()
            for n, a in enumerate(d.parser.nbest_parse(w)):
                if n > 1000:
                    break
                print a
                p.inc(ImmutableTree.convert(removeids(a)), a.prob())
            #for b, a in sorted((b, a) for (a, b) in p.items()):
            #    print a, b
            print
            print 'best', p.max(), p[p.max()]
            #print d.parse(w)
        except Exception:  # as e:
            print "error",  #e
def extract_doc_feats(refactorized_documents):
    from nltk import FreqDist
    from collections import defaultdict
    import itertools
    import math
    import pdb
    import numpy

    doc_num = len(refactorized_documents)
    occurences = defaultdict(lambda: 0)
    for doc in refactorized_documents:
        for x in set(doc):
            occurences[x] += 1
    ref_docs_flat = list(itertools.chain.from_iterable(refactorized_documents))
    glob_freqs = FreqDist(ref_docs_flat)
    tokens = glob_freqs.samples()
    glob_features = [{}] * doc_num
    for i in range(0, doc_num):
        doc_features = [0] * len(tokens)
        doc_freqs = FreqDist(refactorized_documents[i])
        doc_len = len(refactorized_documents[i])
        for (tok, num) in doc_freqs.items():
            max_doc_freq = doc_freqs.freq(doc_freqs.max()) * float(doc_len)
            # augmented
            #tf = 0.5 + (0.5 * float(num)) / float(max_doc_freq)
            tf = 1 + math.log(num, 10)
            idf = math.log(float(doc_num) / float(occurences[tok]), 10)
            tfidf = tf * idf
            indx = tokens.index(tok)
            doc_features[indx] = tfidf
        f_tmp = numpy.asarray(doc_features)
        f_tmp = f_tmp / (numpy.linalg.norm(f_tmp) + numpy.finfo(float).eps)
        glob_features[i] = f_tmp.tolist()
    glob_features = numpy.asarray(glob_features) * glob_freqs.N()
    print "Glob Freqs:", glob_freqs.N()
    return (glob_features, tokens)
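
# The weighting in extract_doc_feats is log-scaled tf-idf: tf = 1 + log10(count)
# and idf = log10(doc_num / document_frequency). A worked instance with concrete
# (invented) numbers:
import math

doc_num = 4    # total number of documents
count = 10     # occurrences of the term in this document
doc_freq = 2   # number of documents containing the term

tf = 1 + math.log(count, 10)             # 1 + log10(10) = 2.0
idf = math.log(doc_num / doc_freq, 10)   # log10(4 / 2) ~= 0.301
print(tf * idf)                          # ~= 0.602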
def textChanged_inputTextEdit(self):
    inputText = self.inputTextEdit.toPlainText().strip()
    # Remove all punctuation (Arabic included)
    inputText = ''.join(c for c in inputText if not ud.category(c).startswith('P'))
    inputTokens = functions.tokenization(inputText)
    freqDist = FreqDist(inputTokens)
    self.numWordEdit.setText(str(freqDist.N()))
    self.mostFreqWordEdit.setText(freqDist.max())
    numSentences = len(functions.tok_stem(self.inputTextEdit.toPlainText(), False))
    self.numSentenceEdit.setText(str(numSentences))
    hasText = bool(self.inputTextEdit.toPlainText().strip())
    self.inStatsGroup.setEnabled(hasText)
    self.searchWordGroup.setEnabled(hasText)
    self.startPosTagButton.setEnabled(hasText)
word_len = [len(w) for w in text1]
print word_len

# Example                       Description
# fdist = FreqDist(samples)     create a frequency distribution containing the given samples
# fdist[sample] += 1            increment the count for this sample
# fdist['monstrous']            count of the number of times a given sample occurred
# fdist.freq('monstrous')       frequency of a given sample
# fdist.N()                     total number of samples
# fdist.most_common(n)          the n most common samples and their frequencies
# for sample in fdist:          iterate over the samples
# fdist.max()                   sample with the greatest count
# fdist.tabulate()              tabulate the frequency distribution
# fdist.plot()                  graphical plot of the frequency distribution
# fdist.plot(cumulative=True)   cumulative plot of the frequency distribution
# fdist1 |= fdist2              update fdist1 with counts from fdist2
# fdist1 < fdist2               test if samples in fdist1 occur less frequently than in fdist2

fdlist = FreqDist(len(w) for w in text1)
print dict(fdlist)
print fdlist.most_common(3)
print fdlist.max()
print fdlist[2]
fdlist.tabulate()  # tabulate prints directly and returns None, so no print needed
fdlist.plot()
fdlist.plot(cumulative=True)
sentence_dic_nights[file] = len(corpus_nightsII.sents(file))
# we make sure that the order of the data stays the same
sentence_dic_nights = collections.OrderedDict(sentence_dic_nights)

# Which night has the most sentences?
for file, n_sents in sentence_dic_nights.items():
    if n_sents == max(sentence_dic_nights.values()):
        print(file, n_sents)
# the Eight Hundred and Forty-fifth.txt => 399

# In the following block of code, we determine the most frequent word length in each night
dict_word_length = {}
for file in corpus_nightsII.fileids():
    text = corpus_nightsII.words(file)
    x = [len(words) for words in text]
    fdist = FreqDist(x)
    dict_word_length[file] = fdist.max()
print(dict_word_length)

# We now calculate the readability of each file, using the Automated Readability Index (ARI).
stat_list = []
x = word_dic_nights.keys()
for name in x:
    n_char = char_dict_night[name]
    n_words = word_dic_nights[name]
    n_sents = sentence_dic_nights[name]
    stat_list.append((name, n_char, n_words, n_sents))
print(stat_list)

def ARI(n_char, n_words, n_sents):
    x = n_char/n_words
tweet_tokenizer = TweetTokenizer()
with open('C:/users/onlyone/desktop/prefeito/prefeito.txt', mode='r', encoding='UTF-8') as dados_tratados:
    mining1 = str(dados_tratados.readlines())
    mining2 = re.sub(caracters, u'', mining1)
    mining3 = tweet_tokenizer.tokenize(str(mining2))
    mining4 = FreqDist(mining3)
    dados_tratados.close()

# Print frequent patterns
print(mining4)
print(mining4.max())
print(mining4.most_common())

# Plot 1: frequency distribution
mining4.plot(60, cumulative=False, title="Gráfico de Padrões Frequêntes - Prefeito de Salvador")

# Word cloud plot
from PIL import Image
from nltk import FreqDist
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt
def multi_sentence(context_sentences, ambiguous_word):
    fdist = FreqDist()
    for sentence in context_sentences:
        fdist.inc(lesk(sentence, ambiguous_word))
    return fdist.max()
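
# A usage sketch of the same sense-voting idea with NLTK 3 idioms: nltk.wsd.lesk
# stands in for the lesk() the function above relies on (which may differ), and
# the context sentences are invented. Requires the wordnet corpus
# (nltk.download('wordnet')).
from nltk import FreqDist
from nltk.wsd import lesk

sentences = [
    "I went to the bank to deposit money".split(),
    "The bank approved my loan yesterday".split(),
]
fdist = FreqDist(lesk(sent, "bank") for sent in sentences)
print(fdist.max())   # the synset chosen most often across the contexts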
def word_frequencies(data):
    # despite its name, this returns only the most frequent sample
    f = FreqDist(data)
    return f.max()
title = nostop_title_dsc + nostop_title_kd
nltk.Text(title).collocations()
fdist_title = FreqDist(title)
fdist_title.most_common(50)
fdist_title.plot(50, cumulative=True)
fdist_title.plot(50)
total_words = len(set(title))
print("The number of distinct words in the titles of dsc is: " + str(total_words))
avg_words = fdist_title.N() / total_words
print("On average, each word in the titles of dsc appears " + str(int(avg_words)) + " times")

text = nostop_text_dsc + nostop_text_kd
nltk.Text(text).collocations()
fdist_text = FreqDist(text)
fdist_text.most_common(50)
fdist_text.max()
fdist_text.plot(50, cumulative=True)
fdist_text.plot(50)
total_textwords = len(set(text))
print("The number of distinct words in the text is: " + str(total_textwords))
avg_text = fdist_text.N() / total_textwords
print("On average, each word in the text appears " + str(int(avg_text)) + " times")

# bigrams and trigrams
word_pair_text = list(bigrams(text))
word_triple_text = list(trigrams(text))
bigrams_text = FreqDist(word_pair_text)
trigrams_text = FreqDist(word_triple_text)
bigrams_text.most_common(50)
bigrams_text.plot(50)
bigrams_text.plot(50, cumulative=True)
### What is the most frequent tag?
### Which word has the most distinct tags?

fd = FreqDist()
cfd = ConditionalFreqDist()

# for each tagged sentence in the corpus, get the (token, tag) pair and update
# both count(tag) and count(tag given token)
for sentence in brown.tagged_sents():
    for (token, tag) in sentence:
        fd[tag] += 1
        cfd[token][tag] += 1

# Find the most frequent tag
fd.max()

# Initialize a list to hold (numtags, word) tuples
wordbins = []
# Append each tuple (number of unique tags for token, token) to the list
for token in cfd.conditions():
    wordbins.append((cfd[token].B(), token))
# sort tuples by number of unique tags (highest first)
wordbins.sort(reverse=True)
print wordbins[0]  # token with max. no. of tags is ...

### What is the ratio of masculine to feminine pronouns?
male = ['he', 'his', 'him', 'himself']  # masculine pronouns
# words_set = set(book_of_genesis)

# ignore capitalization and duplicates
# words_set = set(word.lower() for word in book_of_genesis)

# ignore capitalization, duplicates and non-alphabetic items (numbers and punctuation characters)
words_set = set(word.lower() for word in book_of_genesis if word.isalpha())

# number of words
len(words_set)

# get words longer than 10 characters
minimum_characters = 10
long_words = [word for word in words_set if len(word) > minimum_characters]
sorted(long_words)  # sorted alphabetically (capital letters first)

# get words longer than 7 characters that occur more than 7 times
minimum_characters = 7
minimum_frequency = 7
fdist = FreqDist(book_of_genesis)
frequent_long_words = [word for word in words_set
                       if len(word) > minimum_characters and fdist[word] > minimum_frequency]
sorted(frequent_long_words)  # sorted alphabetically (capital letters first)

# frequency of words based on their length
words_length = [len(word) for word in book_of_genesis]
fdist = FreqDist(words_length)
fdist.most_common()
fdist.max()    # most frequent word length
fdist.freq(3)  # frequency of words whose length is 3
    return [w for w in word if w not in stopwords.words('english') and w != '']

# lemma
def lemma(text):
    lmtzr = WordNetLemmatizer()
    return [lmtzr.lemmatize(w) for w in text]

nostop_title = lemma(remove_stopwords(text_title))

# check the collocations of the text
nostop_title = nltk.Text(nostop_title)
nostop_title.collocations()

fdist_title = FreqDist(nostop_title)   # Frequency distribution of the text
fdist_title.most_common(50)            # most common 50
fdist_title['science']                 # return count of a given word
fdist_title.max()                      # sample with the greatest count
fdist_title.plot(50, cumulative=True)  # plot
fdist_title.plot(50)
fdist_title.tabulate(50)               # tabulate

total_words = len(set(nostop_title))
print("The number of distinct words in the title of dsc is: " + str(total_words))
avg_words = fdist_title.N() / total_words
print("On average, each word in the title of dsc appears " + str(int(avg_words)) + " times")

# bigrams, trigrams
from nltk import bigrams
from nltk import trigrams

word_pair = list(bigrams(nostop_title))
word_triple = list(trigrams(nostop_title))
bigrams_title = FreqDist(word_pair)
trigrams_title = FreqDist(word_triple)
class Document(object):
    def __init__(self, doc_id):
        # rename metadata to something more general?
        self.metadata = {
            "doc_title": None,
            "author_lastname": None,
            "author_first_middle": None,
            "year_written": None,
            "year_published": None,
            "pub_title": None,
            "pub_type": None,
            "Type-Token Ratio": None,
            "Hapax Dislegomena": None,
            "Honore's R": None,
            "Yule's K": None,
            "tokenized_doc": []}
        self.doc_id = doc_id
        self.fdist = None
        self.frequencies = []
        self.metadata_getter()
        self.tokenized_doc_getter()
        self.thrk_getter()
        self.frequency_dist_getter()
        # method?
        #self.timestamp()

    def timestamp(self):
        ts = time.time()
        return datetime.datetime.fromtimestamp(ts).strftime('%Y%m%d_%H%M%S_')

    def metadata_getter(self):
        # move to object?
        cursor = db.cursor()
        c = cursor.execute('SELECT author_lastname, author_first_middle, doc_title, original_publication_title, original_publication_type, year_written, year_published FROM metadata WHERE doc_id = (?)', (self.doc_id,))
        for row in c:
            self.metadata["author_lastname"] = row[0]
            self.metadata["author_first_middle"] = row[1]
            self.metadata["doc_title"] = row[2]
            self.metadata["pub_title"] = row[3]
            self.metadata["pub_type"] = row[4]
            self.metadata["year_written"] = row[5]
            self.metadata["year_published"] = row[6]
        #print "Metadata Found for Doc ", (self.doc_id)

    def tokenized_doc_getter(self):
        # assumes we're connected to the db
        doc_name = 'document_' + str(self.doc_id)
        cursor = db.execute('SELECT * FROM {}'.format(doc_name))
        text = []
        for i in cursor:
            text.append(str(i[0]))
        self.metadata["tokenized_doc"] = text
        #print "Tokenized Document ", (self.doc_id)

    def type_token_ratio(self):
        self.metadata["Type-Token Ratio"] = float(self.V / self.N)

    def hap_dis_ratio(self):
        self.metadata["Hapax Dislegomena"] = float(self.hapaxes[2] / self.V)

    # assignments can go in methods
    def honore_r(self):
        if self.hapaxes[1] != 0:
            self.metadata["Honore's R"] = float((100 * math.log(self.N, 10)) / (1 - (self.hapaxes[1] / self.V)))
        else:
            self.metadata["Honore's R"] = 'NA'

    def yule_k(self):
        summation = []
        for i in self.hapaxes:
            summation.append(float(i**2 * self.hapaxes[i]))
        # with the summation, find K
        self.metadata["Yule's K"] = float((10**4 * (sum(summation) - self.N)) / (self.N**2))

    def frequency_dist(self):
        self.fdist = FreqDist(self.metadata["tokenized_doc"])

    def frequency_dist_getter(self):
        if self.fdist == None:
            self.frequency_dist()
        self.frequencies = self.fdist.items()

    def hapaxes_summation(self):
        self.frequency_dist()
        # we find the value of the greatest number of times any word appears
        max = self.fdist[self.fdist.max()]
        # hapaxes method (only gets called if you hit else in thrk_getter)
        hapaxes = {}
        for n in range(1, max + 1):
            hapaxes[n] = 0
        for i in self.fdist:
            hapaxes[self.fdist[i]] += 1
        self.hapaxes = hapaxes

    def thrk_getter(self):
        cursor = db.cursor()
        c = cursor.execute('SELECT doc_id, t, h, r, k FROM thrk WHERE doc_id = (?)', (self.doc_id,))
        count = 0
        for i in c:
            count += 1
        if count > 0:
            c = cursor.execute('SELECT doc_id, t, h, r, k FROM thrk WHERE doc_id = (?)', (self.doc_id,))
            for i in c:
                self.metadata["Type-Token Ratio"] = i[1]
                self.metadata["Hapax Dislegomena"] = i[2]
                self.metadata["Honore's R"] = i[3]
                self.metadata["Yule's K"] = i[4]
        else:
            self.hapaxes_summation()
            # make these instance variables
            self.N = float(self.fdist.N())
            self.V = float(len(self.fdist))
            # Just call these
            self.type_token_ratio()
            self.hap_dis_ratio()
            self.honore_r()
            self.yule_k()
            cursor.execute('INSERT INTO thrk (doc_id, t, h, r, k) VALUES (?, ?, ?, ?, ?)',
                           (self.doc_id, self.metadata["Type-Token Ratio"], self.metadata["Hapax Dislegomena"],
                            self.metadata["Honore's R"], self.metadata["Yule's K"]))
            db.commit()
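
# yule_k() above implements K = 10^4 * (sum_i i^2 * V_i - N) / N^2, where V_i
# is the number of word types occurring exactly i times. A worked check on a
# tiny invented distribution:
#
# tokens: a a a b b c  ->  N = 6
# V_1 = 1 (c), V_2 = 1 (b), V_3 = 1 (a)
# sum(i^2 * V_i) = 1*1 + 4*1 + 9*1 = 14
print(10**4 * (14 - 6) / 6**2)   # K ~= 2222.2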
from nltk import FreqDist

from common.books import text1

fdist = FreqDist(len(w) for w in text1())
print(fdist)
# print(fdist.keys())
# print(fdist.items())
print(fdist.most_common())
print(fdist.max())
print(fdist[3])
print(fdist.freq(3))
def clicked_startPosTagButton(self):
    self.inputText = self.inputTextEdit.toPlainText()
    file = open(self.mainWindow.modelResults + 'Input.txt', 'w', encoding='utf-8')
    file.write(self.inputText)
    file.close()
    tokStems = functions.tok_stem(self.inputTextEdit.toPlainText())
    normTokStems = functions.normalization(tokStems, self.mainWindow.modelSources)
    numberStems = len(normTokStems)
    numberUNK = 0
    text = ''
    counter = 0
    while counter < len(tokStems):
        text += tokStems[counter] + ' '
        if normTokStems[counter] == 'مجه':
            numberUNK += 1
        counter += 1
    file = open(self.mainWindow.modelResults + 'Affix.txt', 'w', encoding='utf-8')
    file.write(text)
    file.close()
    stemsTags = functions.viterbi(normTokStems, self.mainWindow.modelSources)
    text = ''
    tagsText = ''
    counter = 0
    while counter < len(stemsTags):
        tag = stemsTags[counter]
        token = tokStems[counter]
        tagsText += tag + ' '
        text += token + '/' + '<span style="background-color: yellow; font: bold 11px;">' + tag + '</span>' + ' '
        counter += 1
    file = open(self.mainWindow.modelResults + 'Tag.txt', 'w', encoding='utf-8')
    file.write(tagsText)
    file.close()
    self.parentWindow.posTagTab.taggedTextEdit.setHtml(text)
    file = open(self.mainWindow.modelResults + 'Out.txt', 'w', encoding='utf-8')
    file.write(self.parentWindow.posTagTab.taggedTextEdit.toPlainText())
    file.close()
    self.parentWindow.posTagTab.numStemEdit.setText(str(numberStems))
    self.parentWindow.posTagTab.numUnkTagsEdit.setText(str(numberUNK))
    tagsList = tagsText.split()
    freqDist = FreqDist(tagsList)
    self.parentWindow.posTagTab.mostFreqTagEdit.setText(freqDist.max())
    self.mainWindow.statusBarLabel.setText(
        'Input text has been PoS-Tagged. Check text files in "/model/results/"')
    self.parentWindow.tabWidget.setTabEnabled(1, True)
[len(w) for w in text1]

# Collocations are frequent bigrams formed from words that are not so common as unigrams.
# This function returns nothing, just prints the collocations to screen
text1.collocations()

# Computing the frequency distribution of word lengths. Returns a dict-like FreqDist.
fdistWordLength = FreqDist([len(w) for w in text1])
fdistWordLength.keys()    # The different word lengths
fdistWordLength.values()  # The frequency of each word length
fdistWordLength.items()   # Shows both keys and values at the same time

fdist1['the']
fdist1.freq('the')  # Frequency of the word 'the'
fdist1.max()

#### MOVIE REVIEWS ####
import nltk
from nltk.corpus import movie_reviews

movie_reviews.categories()
movie_reviews.fileids('pos')
movie_reviews.fileids('neg')
movie_reviews.words('neg/cv729_10475.txt')
len(movie_reviews.words('neg/cv729_10475.txt'))

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
# stem of word
def stem(word):
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    stem, suffix = re.findall(regexp, word)[0]
    return stem

def lexical_diversity(text):
    return len(text) / len(set(text))

nostop_title = lemma(remove_stopwords(text_title))
nltk.Text(nostop_title).collocations()

# Frequency distribution of the text
fdist_title = FreqDist(nostop_title)
fdist_title.most_common(50)
fdist_title.max()
fdist_title.plot(50, cumulative=True)  # plot
fdist_title.plot(50)

total_words = len(set(nostop_title))
print("The number of distinct words in the title of KD is: " + str(total_words))
avg_words = fdist_title.N() / total_words
print("On average, each word in the title of KD appears " + str(int(avg_words)) + " times")

# process for text
f = open('kdtext.txt', encoding="latin-1")
raw_text = f.read()

# type
type(raw_text)
tokens = word_tokenize(raw_text)
type(tokens)
#!/usr/bin/python
# coding: utf-8
# 2013/03/20
from nltk import FreqDist

fdist = FreqDist(samples)    # build a frequency distribution of the data given in samples
fdist.inc(sample)            # increment the count of the given sample by one
fdist['データ']              # number of occurrences of the given sample
fdist.freq('データ')         # frequency of the given sample
fdist.N()                    # total number of samples
fdist.keys()                 # samples sorted by frequency
for sample in fdist:         # iterate over the samples in frequency order
    pass
fdist.max()                  # sample with the greatest count
fdist.tabulate()             # display the frequency distribution as a table
fdist.plot()                 # plot the frequency distribution
fdist.plot(cumulative=True)  # plot cumulative frequencies
fdist1 < fdist2              # test whether samples in fdist1 occur less frequently than in fdist2
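
# Since samples is undefined above, the cheat-sheet is not runnable as written.
# A concrete NLTK 3 rendition (fdist[sample] += 1 replaces the removed fdist.inc):
from nltk import FreqDist

samples = ["a", "b", "a", "c", "a", "b"]
fdist = FreqDist(samples)
fdist["d"] += 1          # NLTK 3 replacement for fdist.inc("d")
print(fdist["a"])        # 3
print(fdist.freq("a"))   # 3/7 ~= 0.4286
print(fdist.N())         # 7
print(fdist.max())       # 'a'
fdist.tabulate()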
#!/usr/bin/env python
from nltk.corpus import brown
from nltk import FreqDist, ConditionalFreqDist

fd = FreqDist()
cfd = ConditionalFreqDist()

# for each tagged sentence in the corpus, get the (token, tag) pair and update
# both count(tag) and count(tag given token)
for sentence in brown.tagged_sents():
    for (token, tag) in sentence:
        fd[tag] += 1
        cfd[token][tag] += 1

# The most frequent tag is ...
print(fd.max())

# Initialize a list to hold (numtags, word) tuples
wordbins = []
# Append each (n(unique tags for token), token) tuple to the list
for token in cfd.conditions():
    wordbins.append((cfd[token].B(), token))
# Sort tuples by number of unique tags (highest first)
wordbins.sort(reverse=True)
# The token with max. no. of tags is ...
print(wordbins[0])

# masculine pronouns
Author: RodriguesFAS
Email: <*****@*****.**> | <*****@*****.**>
Website: <http://rodriguesfas.com.br> | <http://clubedosgeeks.com.br>
Github: <https://github.com/rodriguesfas>
'''

import nltk
from nltk import FreqDist
import matplotlib.pyplot as plt
#import matplotlib

text_src = open('corpus.txt').read()

'''
nltk.word_tokenize(text)
'''
print "=====================[word_tokenize]"
tokens = nltk.word_tokenize(text_src)
print tokens

'''
FreqDist() builds a frequency distribution; note that passing the string 'are'
counts its characters, not words.
'''
print "=====================[FreqDist]"
frequency_word = FreqDist('are')
print frequency_word

'''
Returns the most frequent sample.
'''
print "=====================[max]"
print frequency_word.max()
#!/usr/bin/python3
# coding: utf-8
import nltk
from nltk.corpus import gutenberg  # import the gutenberg corpus

##################################################################
## FreqDist tracks sample frequencies in a distribution
from nltk import FreqDist  # import the FreqDist class
fd = FreqDist(gutenberg.words('austen-persuasion.txt'))  # build a frequency distribution over the tokens of the text
print(fd)  # <FreqDist with 6132 samples and 98171 outcomes>; 6132 distinct samples, 98171 tokens
print(type(fd))  # <class 'nltk.probability.FreqDist'>
print(fd['the'])  # 3120; occurrence count of a word; a FreqDist behaves like a dict
print(fd.N())  # 98171; counts words (not letters), repeats included
print(fd.B())  # 6132; number of bins, i.e. unique samples; identical samples share a bin
print(len(fd.keys()), type(fd.keys()))  # 6132 <class 'dict_keys'>
print(fd.keys())  # fd.B() only gives the count; this prints the whole vocabulary
print(fd.max())  # the most frequent word
print(fd.freq('the'))  # 0.03178127960395636; relative frequency, 3120 / 98171
print(fd.hapaxes())  # ['[', 'Persuasion', 'Jane', ...] rare words that occur only once
# The most frequent words are mostly function words, while the rarest (hapaxes) can only be
# interpreted from context; neither extreme of the distribution really characterizes the text.
for idx, word in enumerate(fd):  # a FreqDist can be iterated with enumerate, in insertion order
    if idx == 5:
        break
    print(idx, word)  # 0 [; 1 Persuasion; 2 by; 3 Jane; 4 Austen

##################################################################
## Frequency of word lengths
fdist = FreqDist(len(w) for w in gutenberg.words('austen-persuasion.txt'))
print(fdist)  # <FreqDist with 16 samples and 98171 outcomes>
print(fdist.items())  # dict_items([(1, 16274), (10, 1615), (2, 16165), (4, 15613), (6, 6538), (7, 5714), (3, 20013), (8, 3348), (13, 230), (9, 2887), (5, 8422), (11, 768), (12, 486), (14, 69), (15, 25), (16, 4)])
print(fdist.most_common(3))  # [(3, 20013), (1, 16274), (2, 16165)]

##################################################################
## Counting English characters
fdist = nltk.FreqDist(ch.lower() for ch in gutenberg.raw('austen-persuasion.txt') if ch.isalpha())  # a generator works directly; no need to wrap it in [] to build a list
def most_frequent_sense_accuracy(self):
    """
    Computes the accuracy of always predicting the overall most frequent
    sense for all instances in the dataset.
    """
    label_list = [inst.label for inst in self.instance_list]
    freq_dist = FreqDist(label_list)
    return freq_dist[freq_dist.max()] / len(label_list)
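
# A quick check of most_frequent_sense_accuracy's arithmetic on an invented
# label list: the majority label covers 3 of 5 instances, so the baseline
# accuracy is 0.6.
from nltk import FreqDist

labels = ["river", "river", "bank", "river", "bank"]
freq_dist = FreqDist(labels)
print(freq_dist[freq_dist.max()] / len(labels))   # 0.6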
text1)  # bigramsText1[0] is the tuple containing the first bigram

# Collocations are frequent bigrams formed from words that are not so common as unigrams.
# This function returns nothing, just prints the collocations to screen
text1.collocations()

# Computing the frequency distribution of word lengths. Returns a dict-like FreqDist.
fdistWordLength = FreqDist([len(w) for w in text1])
fdistWordLength.keys()    # The different word lengths
fdistWordLength.values()  # The frequency of each word length
fdistWordLength.items()   # Shows both keys and values at the same time

fdist1['the']
fdist1.freq('the')  # Frequency of the word 'the'
fdist1.max()

# String methods
s = "MatTias"
s.lower()
s.upper()
s.startswith("ma")
"T" in s

# Find all the words in Moby Dick that end with -ableness. Sort them alphabetically.
from nltk.book import text2, text3, text5, text7
##################################################################
## FreqDist tracks sample frequencies in a distribution
from nltk import FreqDist  # import the FreqDist class
fd = FreqDist(gutenberg.words('austen-persuasion.txt'))  # build a frequency distribution over the tokens of the text
print(fd)  # <FreqDist with 6132 samples and 98171 outcomes>; 6132 distinct samples, 98171 tokens
print(type(fd))  # <class 'nltk.probability.FreqDist'>
print(fd['the'])  # 3120; occurrence count of a word; a FreqDist behaves like a dict
print(fd.N())  # 98171; counts words (not letters), repeats included
print(fd.B())  # 6132; number of bins, i.e. unique samples; identical samples share a bin
print(len(fd.keys()), type(fd.keys()))  # 6132 <class 'dict_keys'>
print(fd.keys())  # fd.B() only gives the count; this prints the whole vocabulary
print(fd.max())  # the most frequent word
print(fd.freq('the'))  # 0.03178127960395636; relative frequency, 3120 / 98171
print(fd.hapaxes())  # ['[', 'Persuasion', 'Jane', ...] rare words that occur only once
# The most frequent words are mostly function words, while the rarest (hapaxes) can only be
# interpreted from context; neither extreme of the distribution really characterizes the text.
for idx, word in enumerate(fd):  # a FreqDist can be iterated with enumerate, in insertion order
    if idx == 5:
        break
    print(idx, word)  # 0 [; 1 Persuasion; 2 by; 3 Jane; 4 Austen

##################################################################
## Frequency of word lengths
fdist = FreqDist(len(w) for w in gutenberg.words('austen-persuasion.txt'))
print(fdist)  # <FreqDist with 16 samples and 98171 outcomes>
print(fdist.items())  # dict_items([(1, 16274), (10, 1615), (2, 16165), (4, 15613), (6, 6538), (7, 5714), (3, 20013), (8, 3348), (13, 230), (9, 2887), (5, 8422), (11, 768), (12, 486), (14, 69), (15, 25), (16, 4)])
print(fdist.most_common(3))  # [(3, 20013), (1, 16274), (2, 16165)]

##################################################################