def test_tabulate(self):
    empty = ConditionalFreqDist()
    self.assertEqual(empty.conditions(), [])
    with pytest.raises(ValueError):
        empty.tabulate(conditions="BUG")  # nonexistent keys shouldn't be added
    self.assertEqual(empty.conditions(), [])
def test_tabulate(self):
    empty = ConditionalFreqDist()
    self.assertEqual(empty.conditions(), [])
    try:
        empty.tabulate(conditions="BUG")  # nonexistent keys shouldn't be added
    except ValueError:
        pass
    self.assertEqual(empty.conditions(), [])
def calculate_vector_spaces(self, k=16):
    cfd = ConditionalFreqDist((word, doc['document'])
                              for doc in self.mongo[CORPUS_CLN].find()
                              for word in self.interestingWords(doc['document']))
    cfd.tabulate()
    # matrix dimensions
    terms = [c for c in cfd.conditions()]  # conditions = words
    docs = sorted(set(v for c in cfd.conditions() for v in cfd[c]))
    self.log("terms: %s" % str(terms))
    self.log("docs: %s" % str(docs))
    term_by_doc_mat = np.zeros(shape=(len(terms), len(docs)))
    self.log("Term-by-ref-document matrix shape is: %d X %d" % (len(terms), len(docs)))
    for i, term in enumerate(terms):
        term_by_doc_mat[i] = np.array([cfd[term][doc] for doc in docs])
    self.log("Matrix\n%s" % str(term_by_doc_mat))
    # perform singular value decomposition
    u, sigma, vh = self._do_svd(term_by_doc_mat, k)
    del term_by_doc_mat  # don't need the matrix anymore
    # map terms to svd space
    terms_space = np.zeros(shape=(len(terms), k))
    for i in range(len(terms)):
        terms_space[i] = np.array([u[i][j] * sigma[j] for j in range(k)])
    # map docs to svd space
    docs_space = np.zeros(shape=(len(docs), k))
    for i in range(len(docs)):
        docs_space[i] = np.array([vh[i][j] * sigma[j] for j in range(k)])
    # store matrix data, updating the existing row if one exists
    row = self.mongo['data'].find_one() or {}
    row.update({'terms': terms,
                'documents': docs,
                'terms_subspace': terms_space.tolist(),
                'docs_subspace': docs_space.tolist(),
                'u': u.tolist(),
                'sigma': sigma.tolist(),
                'vh': vh.tolist(),
                'date': datetime.utcnow()})
    self.mongo['data'].save(row)
    self.log("Saved matrix data")
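`_do_svd` is not shown in this snippet. A minimal sketch of what it might look like, inferred only from how `u`, `sigma`, and `vh` are indexed above (a hypothetical helper, assuming a plain numpy truncated SVD):

import numpy as np

def _do_svd(self, matrix, k):
    # Hypothetical implementation: full SVD, truncated to the top-k singular
    # triplets. Returned shapes match the caller's indexing:
    # u is (n_terms, k), sigma is (k,), vh is (n_docs, k).
    u, sigma, vh = np.linalg.svd(matrix, full_matrices=False)
    return u[:, :k], sigma[:k], vh[:k].T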
def tabulateWordsInAllGeners(self, theWords):
    """
    Find the distribution of words across all Brown corpus genres.
    @param theWords: the word/list of words to find info about
    """
    cdf = ConditionalFreqDist((genre, word)
                              for genre in brown.categories()
                              for word in brown.words(categories=genre))
    cdf.tabulate(samples=theWords, conditions=brown.categories())
def tabulateWordsInPeriods(self, theWords):
    """
    Find the distribution of words over the years, based on the Inaugural corpus.
    @param theWords: the word/list of words to find info about
    """
    cdf = ConditionalFreqDist((textid[:4], target)
                              for textid in inaugural.fileids()
                              for word in inaugural.words(textid)
                              for target in theWords
                              if word.lower().startswith(target)
                              or word.lower().endswith(target))
    cdf.tabulate()
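A standalone, runnable sketch of the same idea, with the method body lifted out of its class and invoked with the NLTK book's classic inaugural query (assumes the inaugural corpus data is downloaded):

from nltk import ConditionalFreqDist
from nltk.corpus import inaugural

def tabulate_words_in_periods(the_words):
    # condition on the four-digit year prefix of each file id,
    # e.g. '1789-washington.txt' -> '1789'
    cfd = ConditionalFreqDist(
        (fileid[:4], target)
        for fileid in inaugural.fileids()
        for word in inaugural.words(fileid)
        for target in the_words
        if word.lower().startswith(target) or word.lower().endswith(target))
    cfd.tabulate()

tabulate_words_in_periods(['america', 'citizen'])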
def learn(self, A):
    total_y = float(len(A))
    self.cls_fd = cls_fd = FreqDist()
    self.feature_fd = feature_fd = FreqDist()
    pairs = []
    for x, y in A:
        cls_fd[y] += 1  # FreqDist.inc() was removed in NLTK 3
        for feature in set(get_words(x)):
            pairs.append((y, feature))
            feature_fd[feature] += 1
    cfd = ConditionalFreqDist(pairs)
    if DEBUG:
        print(cfd)
        print(cfd.conditions())
        # cfd.tabulate(samples=['gbs', 'build', 'spec', 'repo', 'config'])
        cfd.tabulate()
        for author in cfd.conditions():
            print('AUTHOR:', author)
            for word, count in cfd[author].items():
                print('%5d %20s' % (count, word))
    self.voc = voc = list(feature_fd.keys())
    self.cls_feature_prob = cls_feature_prob = {}
    self.cls_and_feature_prob = cls_and_feature_prob = {}
    for cls, total in cls_fd.items():
        fd = cfd[cls]
        for word in voc:
            if word in fd:
                cls_feature_prob[(cls, word)] = float(fd[word]) / total
                cls_and_feature_prob[(cls, word)] = float(fd[word]) / total_y
            else:
                # unseen (class, word) pairs get a smoothed pseudo-count of 1
                cls_feature_prob[(cls, word)] = 1. / total
                cls_and_feature_prob[(cls, word)] = 1. / total_y
    self.feature_prob = feature_prob = {}
    for word, count in feature_fd.items():
        feature_prob[word] = count / total_y
def modal_analysis(keyword_list, modals_list):
    # ConditionalFreqDist takes one iterable of (condition, sample) pairs, not
    # two positional lists; assuming the inputs are parallel observations,
    # zip them into (keyword, modal) pairs.
    cfd = ConditionalFreqDist(zip(keyword_list, modals_list))
    # tabulate() prints the table and returns None, so return the cfd itself
    cfd.tabulate(conditions=sorted(set(keyword_list)),
                 samples=sorted(set(modals_list)))
    return cfd
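A hedged usage sketch for the repaired function above; the observation lists are invented for illustration and assume the parallel-lists reading:

from nltk import ConditionalFreqDist

# Invented parallel observations: each keyword paired with a modal that
# co-occurred with it (purely illustrative data).
keywords = ['news', 'news', 'romance', 'romance', 'news']
modals = ['will', 'can', 'could', 'might', 'will']
cfd = modal_analysis(keywords, modals)
print(cfd['news']['will'])  # 2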
from nltk.corpus import brown
from nltk import ConditionalFreqDist as CondFreqDist

categories = brown.categories()
# Note: the second assignment replaces the first word list.
words = ["likely", "perhaps", "probably", "maybe"]
words = ["female", "male", "gentleman", "lady", "boy", "girl"]
cfd = CondFreqDist([(cat, word)
                    for cat in categories
                    for word in brown.words(categories=cat)])
cfd.tabulate(conditions=categories, samples=words)
def frequency_table(words):
    cfd = ConditionalFreqDist([(genre, word)
                               for genre in brown.categories()
                               for word in brown.words(categories=genre)])
    genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
    cfd.tabulate(conditions=genres, samples=words)
from nltk.corpus import brown
from nltk import FreqDist

brown.categories()
news_text = brown.words(categories="news")
gov = brown.words(categories='government')
fdist = FreqDist([w.lower() for w in news_text])
fdist_gov = FreqDist([w.lower() for w in gov])
modals = ["can", "could", "may", "might", "must", "will"]
for m in modals:
    # frequency of each modal relative to 'can', in government and news text
    print(m + ': ' + str(fdist_gov[m] / fdist_gov[modals[0]]) + " "
          + str(fdist[m] / fdist[modals[0]]))

from nltk import ConditionalFreqDist
cfd = ConditionalFreqDist((genre, word)
                          for genre in brown.categories()
                          for word in [word.lower() for word in brown.words(categories=genre)])
days = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
cfd.tabulate(conditions=['news', 'romance'], samples=days)

sent = "In the beginning God created the heaven and the earth".split(sep=" ") + ["."]
from nltk import bigrams
list(bigrams(sent))

# random text generator
import random

def generate_model(cfdist, word, num=15):
    for i in range(num):
        print(word, end=' ')
        # expand the distribution into a weighted list of successors,
        # then draw the next word at random
        l1 = list(cfdist[word].keys())
        l2 = list(cfdist[word].values())
        temp = []
        for j in range(len(l1)):
            temp = temp + [l1[j]] * l2[j]
        word = random.choice(temp)
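The generator above needs a bigram-based ConditionalFreqDist to walk. A minimal driver, following the familiar NLTK-book Genesis example (assumes the genesis corpus data is installed):

from nltk import ConditionalFreqDist, bigrams
from nltk.corpus import genesis

text = genesis.words('english-kjv.txt')
bigram_cfd = ConditionalFreqDist(bigrams(text))
generate_model(bigram_cfd, 'living')  # prints a ~15-word chain starting at 'living'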
words = [word
         for word in [word.lower() for word in text]
         if word not in stopwords.words("english")]
bigrams = list(nltk.bigrams(words))
freqdist = sorted(FreqDist(bigrams).items(), key=itemgetter(1), reverse=True)

# 4 -----------------------------------------------------
confreqdist = ConditionalFreqDist((genre, word)
                                  for genre in brown.categories()
                                  for word in brown.words(categories=genre))
words = ["mountain", "monster", "river", "eat", "run", "keys", "paper", "joke", "war"]
confreqdist.tabulate(samples=words)

# 5 -----------------------------------------------------
def freqOfWord(word, genre):
    fd = FreqDist(brown.words(categories=genre))
    print(word, 'in', genre, ':', fd[word])

freqOfWord('war', 'news')  # illustrative arguments; the snippet left a bare reference

for genre in brown.categories():
    s = 0
    for type in confreqdist[genre]:  # total token count for the genre
        s += confreqdist[genre][type]
from nltk import ConditionalFreqDist
from nltk.corpus import brown
import matplotlib

words = ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said']
cfd = ConditionalFreqDist((category, word)
                          for category in brown.categories()
                          for word in brown.words(categories=category))
cfd.tabulate(samples=words)
def inspect(self, missed):
    """
    Inspect a testing session, and print data about tag accuracy

    :param missed: list of tuples of missed tags like:
        (hmm_tagged_word, gold_tagged_word, hmm_context, gold_context)
    """
    # create a CFD so we can examine a matrix of incorrect vs correct tags
    # ms[1][1] = tag of a gold_tagged_word
    # ms[0][1] = tag of an hmm_tagged_word
    cfd = ConditionalFreqDist((ms[1][1], ms[0][1]) for ms in missed)

    # initialize a hash to store mistakes by frequency
    mistakes = {}

    # print a table showing mistake frequency
    cfd.tabulate()
    msg("\n")

    # loop through mistake frequencies by gold standard tag, i.e., if we are
    # examining gold-standard 'IN', count what we incorrectly tagged it as
    conds = cfd.conditions()
    for g_tag in conds:
        for hmm_tag in cfd[g_tag].keys():
            # how many times did we incorrectly say g_tag was hmm_tag?
            count = cfd[g_tag][hmm_tag]
            # add these mistakes to the count
            if count not in mistakes.keys():
                mistakes[count] = []
            mistakes[count].append((hmm_tag, g_tag))

    # get a list of all mistake types that occurred over a threshold, worst first
    mistake_counts = set([count for (count, mistake_set) in mistakes.items()
                          if count > Tagger.mistake_threshold])
    mistake_counts = reversed(sorted(mistake_counts))

    # now create a list of mistake types to show the user, i.e., loop
    # through all types and if they are of a high-frequency type, add to list
    mistakes_to_halt = []
    for count in mistake_counts:
        mistake_set = mistakes[count]
        for mistake_tuple in mistake_set:
            mistakes_to_halt.append(mistake_tuple)
            msg("%d\t%s\twas really\t%s\n" % (count, mistake_tuple[0],
                                              mistake_tuple[1]))
    msg("\n")

    # create separators used when outputting missed word contexts
    sep_big = "---------------------------------------------------\n"
    sep_small = "\n-----------------------------------------\n"

    # loop through individual mistakes and, if they match the kind of error
    # we want to halt for, show the user the mistake as well as the sentence
    # context for both the gold-standard sentence and the hmm-tagged sentence
    response = None
    for missed_set in missed:
        if response not in ['q', 'Q']:
            (hmm_tagged_word, gold_tagged_word, hmm_tagged_sent,
             gold_tagged_sent) = missed_set
            should_halt = False
            # determine whether the current mistake matches a mistake type
            # we want to halt for
            for pair in mistakes_to_halt:
                if hmm_tagged_word[1] == pair[0] and \
                        gold_tagged_word[1] == pair[1]:
                    should_halt = True
            if should_halt:
                msg("%sTagged '%s' with %s when it should have been %s.%s" %
                    (sep_big, hmm_tagged_word[0], hmm_tagged_word[1],
                     gold_tagged_word[1], sep_small))
                msg("Gold: " + (' '.join([(w[0] + "/" + w[1]) for w in
                                          gold_tagged_sent])))
                msg(sep_small)
                msg("Mine: " + (' '.join([(w[0] + "/" + w[1]) for w in
                                          hmm_tagged_sent])))
                # get user input to decide whether to keep going
                response = input("\n\nEnter to continue, Q to quit: ")
#!/usr/bin/python3
# coding: utf-8
import nltk
from nltk import ConditionalFreqDist
from nltk.corpus import brown
from nltk.corpus import names
from nltk.corpus import inaugural
from nltk.corpus import toolbox
from nltk.corpus import udhr

##################################################################
## A simple ConditionalFreqDist application: text sentiment analysis
word = ['实惠', '快', '也好', '快', '也好']
anls = ['1', '1', '1', '-1', '1']
tmp_Con = ConditionalFreqDist(zip(word, anls))
print(tmp_Con)  # <ConditionalFreqDist with 3 conditions>; identical words are merged
tmp_Con.tabulate()  # tabulate() prints the table itself and returns None
print(tmp_Con.conditions())  # ['实惠', '快', '也好']
print(tmp_Con['快'].most_common())  # [('1', 1), ('-1', 1)]
print(tmp_Con['快'].keys())  # dict_keys(['1', '-1'])
print(len(tmp_Con['快'].keys()))  # 2; shows how many distinct labels each word received
print(len(tmp_Con['也好'].keys()))  # 1; duplicates are collapsed, as in a set()
print([condition for condition in tmp_Con.conditions() if len(tmp_Con[condition].keys()) > 1])  # ['快']
tmp_Con.plot()
tmp_Con_1 = ConditionalFreqDist(zip(anls, word))
print(tmp_Con_1.conditions())  # ['1', '-1']

##################################################################
## Categorizing words in the Brown corpus
print(brown.categories())  # ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
cfd = nltk.ConditionalFreqDist((genre, word)
                               for genre in brown.categories()
                               for word in brown.words(categories=genre))  # the categories=genre argument cannot be omitted here
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']  # chosen from brown.categories()
modals = ['can', 'could', 'may', 'might', 'must', 'will']  # a few arbitrarily chosen words
from nltk.corpus import brown

# Introduction to the Brown Corpus
print(brown.categories())

# Accessing words of the Brown Corpus
print(brown.words(categories='lore'))

# Introduction to Conditional Frequency Distribution
from nltk import ConditionalFreqDist  # import statement

# pair_list: [(condition, word)]
pair_list = [(category, word)
             for category in brown.categories()
             for word in brown.words(categories=category)]
print(pair_list[:10])

freqdist = ConditionalFreqDist(pair_list)
print(freqdist['lore']['the'])  # conditional access

# tabulate function
category = ['adventure', 'lore', 'news']
samples = ['the', 'and', 'man']
freqdist.tabulate(conditions=category, samples=samples)
def clean_text(data):  # hypothetical name and signature; the def line is missing from this excerpt
    clean_data = []
    for i in data:
        clean_data.append(i.translate(translator))
    return clean_data

def preprocess_text(sequences):
    # preprocessed: one padded token iterator per input sequence
    preprocessed = [pad_both_ends(s.split(' '), n=2) for s in sequences]
    return list(flatten(preprocessed))

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("file", default=sys.stdin, help='corpus text file to analyse')
    args = parser.parse_args()
    sequences = get_text(args.file)
    tokens = preprocess_text(sequences)
    fd = FreqDist(tokens)
    model = bigrams(tokens)
    cfd = ConditionalFreqDist(model)
    cfd.tabulate()  # tabulate() prints the table itself and returns None
        if token['tag'] is None:
            short_tag = '--'
        else:
            short_tag = token['tag'][:2] + token['tag'][-1:]
        long_tag = token['tag']
        tag_types.add(long_tag)
        if token['lemma']:
            lemma_pos = token['lemma'] + '.' + get_wordnet_pos(token['pos'])
            lemma_pairs.append((token['lemma'], short_tag))
            lemma_long_pairs.append((token['lemma'], long_tag))
        tagged_pairs.append((token['textlc'], short_tag))

# Print vocabularies for each tag type
for tag_type in tag_types:
    vocabulary_cfd = ConditionalFreqDist([(lemma, long_tag)
                                          for (lemma, long_tag) in lemma_long_pairs
                                          if long_tag == tag_type])
    vocabulary_cfd.tabulate()  # tabulate() prints and returns None

# events_cfd = ConditionalFreqDist(tagged_pairs)
# Conditional frequency distribution for (lemma, tag) pairs
events_cfd = ConditionalFreqDist(lemma_pairs)
unambiguous_words = [word for word in events_cfd.conditions()
                     if len(events_cfd[word].items()) < 2]
ambiguous_words = [word for word in events_cfd.conditions()
                   if len(events_cfd[word].items()) > 1]
print("Unambiguous Words")
events_cfd.tabulate(conditions=unambiguous_words)
print("Ambiguous Words")
events_cfd.tabulate(conditions=ambiguous_words)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 20 01:01:54 2018

@author: vpapg
"""

# Download some text from a language that has vowel harmony (e.g. Hungarian),
# extract the vowel sequences of words, and create a vowel bigram table.

from nltk.corpus import brown
from nltk import ConditionalFreqDist, Index
import re

romance = brown.words(categories='romance')
# re.findall returns non-overlapping matches, so each word contributes its
# adjacent vowel pairs ('eau' yields only 'ea')
vowel_seqs_list = [(pair, w) for w in romance
                   for pair in re.findall(r'[aeiou][aeiou]', w)]
vowel_seqs_index = Index(vowel_seqs_list)
# print(vowel_seqs_index['aa'])

# each two-character string like 'ea' unpacks into the (condition, sample)
# pair ('e', 'a'), giving a vowel-by-vowel bigram table
pairs = [pair for (pair, w) in vowel_seqs_list]
cfd = ConditionalFreqDist(pairs)
cfd.tabulate()
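The regex above misses overlapping pairs and vowels separated by consonants. A hedged alternative sketch that reads "vowel sequence" as all vowels of a word and counts every adjacent pair in that sequence; this is one interpretation of the exercise, not part of the original solution:

from nltk import ConditionalFreqDist, bigrams
from nltk.corpus import brown
import re

romance = brown.words(categories='romance')
# For each word, extract its full vowel sequence (consonants skipped), then
# count all adjacent vowel pairs in that sequence, overlaps included.
cfd_overlap = ConditionalFreqDist(
    (v1, v2)
    for w in romance
    for v1, v2 in bigrams(re.findall(r'[aeiou]', w)))
cfd_overlap.tabulate()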
from nltk.corpus import brown

# Introduction to the Brown Corpus
print(brown.categories())

# Accessing words of the Brown Corpus
print(brown.words(categories='adventure'))

# Introduction to Conditional Frequency Distribution
from nltk import ConditionalFreqDist

pair_list = [(genre, word)
             for genre in brown.categories()
             for word in brown.words(categories=genre)]
cfd = ConditionalFreqDist(pair_list)
genres = ['news', 'romance', 'religion', 'humor']
modals = ['it', 'could', 'may', 'might', 'must']
cfd.tabulate(conditions=genres, samples=modals)

# conditions method
print(cfd.conditions())
print(cfd['romance']['could'])
from nltk.corpus import brown
from nltk import ConditionalFreqDist as CondFreqDist

cfd = CondFreqDist([(genre, word.lower())
                    for genre in brown.categories()
                    for target in ["romance", "news"]
                    if genre.lower().startswith(target)
                    for word in brown.words(categories=target)])
days = ["monday", "tuesday", "wednesday", "thursday", "friday",
        "saturday", "sunday", "love", "political"]
cfd.tabulate(samples=days)