Example #1
    def test_tabulate(self):
        empty = ConditionalFreqDist()
        self.assertEqual(empty.conditions(), [])
        with pytest.raises(ValueError):
            # tabulating nonexistent conditions must not add them
            empty.tabulate(conditions="BUG")
        self.assertEqual(empty.conditions(), [])
    def test_tabulate(self):
        empty = ConditionalFreqDist()
        self.assertEqual(empty.conditions(), [])
        try:
            empty.tabulate(conditions="BUG")  # nonexistent keys shouldn't be added
        except ValueError:
            pass
        self.assertEqual(empty.conditions(), [])
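Both versions guard the same regression: ConditionalFreqDist subclasses defaultdict, so merely indexing an unseen condition creates it, which is why tabulate() must validate its conditions argument rather than index blindly. A minimal standalone demonstration:

from nltk import ConditionalFreqDist

cfd = ConditionalFreqDist()
_ = cfd['oops']          # indexing silently creates the condition
print(cfd.conditions())  # ['oops']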
    def calculate_vector_spaces(self,k=16):
        cfd = ConditionalFreqDist(
                   (word, doc['document'])
                   for doc in self.mongo[CORPUS_CLN].find()
                   for word in self.interestingWords(doc['document']))
        cfd.tabulate()

        # matrix dimensions
        terms = list(cfd.conditions())  # conditions = words
        docs  = sorted(set(v for c in cfd.conditions() for v in cfd[c]))
        self.log("terms: %s"%str(terms))
        self.log("docs: %s"%str(docs))
        term_by_doc_mat = np.zeros(shape=(len(terms),len(docs)))
        self.log("Term-by-ref-document matrix shape is: %d X %d"%(len(terms),len(docs)))
        for i, term in enumerate(terms):
            li = np.array([cfd[term][doc] for doc in docs])
            term_by_doc_mat[i] = li
        self.log("Matrix\n%s"%str(term_by_doc_mat))

        # perform singular value decomposition
        u,sigma,vh = self._do_svd(term_by_doc_mat,k) 
        del term_by_doc_mat # don't need the matrix anymore

        # map terms to svd space
        terms_space = np.zeros(shape=(len(terms), k))
        for i in range(len(terms)):
            # term i's coordinates: row i of U, scaled by the singular values
            vals = [u[i][j] * sigma[j] for j in range(k)]
            terms_space[i] = np.array(vals)

        # map docs to svd space
        docs_space = np.zeros(shape=(len(docs), k))
        for i in range(len(docs)):
            # doc i's coordinates: column i of Vh, scaled by the singular values
            # (the rows of Vh are the right singular vectors, one per dimension)
            vals = [vh[j][i] * sigma[j] for j in range(k)]
            docs_space[i] = np.array(vals)

        # store matrix data
        row = self.mongo['data'].find_one()
        if not row:
            row = {'terms': terms, 
                   'documents':docs,
                   'terms_subspace':terms_space.tolist(),
                   'docs_subspace':docs_space.tolist(),
                   'u':u.tolist(),
                   'sigma':sigma.tolist(),
                   'vh':vh.tolist(),
                   'date':datetime.utcnow()}
        else:
            row['terms'] = terms
            row['documents'] = docs
            row['terms_subspace'] = terms_space.tolist()
            row['docs_subspace'] = docs_space.tolist()
            row['u'] = u.tolist()
            row['sigma'] = sigma.tolist()
            row['vh'] = vh.tolist()
            row['date'] = datetime.utcnow()

        self.mongo['data'].save(row)
        self.log("Saved matrix data")
Example #4
    def tabulateWordsInAllGeners(self, theWords):
        """
		find the distribution of a word within all Brown corpus genres
		@params theWord: the word/list of words to find info about
		"""
        cdf = ConditionalFreqDist((genre, word)
                                  for genre in brown.categories()
                                  for word in brown.words(categories=genre))
        cdf.tabulate(samples=theWords, conditions=brown.categories())
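A sample call, assuming an instance named analyzer of the enclosing class (the instance name and word list are illustrative):

analyzer.tabulateWordsInAllGeners(['can', 'could', 'may', 'might'])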
Example #5
    def tabulateWordsInPeriods(self, theWords):
        """
		find the distribution of words within the years, based in Inaugural corpus
		@params theWords: the word/list of words to find info about
		"""
        cdf = ConditionalFreqDist((textid[:4], target)
                                  for textid in inaugural.fileids()
                                  for word in inaugural.words(textid)
                                  for target in theWords
                                  if word.lower().startswith(target)
                                  or word.lower().endswith(target))
        cdf.tabulate()
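As above, an illustrative call (the instance name is assumed; 'america' and 'citizen' are the classic NLTK book targets for this corpus):

analyzer.tabulateWordsInPeriods(['america', 'citizen'])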
Example #6
    def learn(self, A):
        total_y = float(len(A))
        self.cls_fd = cls_fd = FreqDist()
        self.feature_fd = feature_fd = FreqDist()
        pairs = []
        for x, y in A:
            cls_fd[y] += 1  # FreqDist.inc() was removed in NLTK 3
            for feature in set(get_words(x)):
                pairs.append((y, feature))
                feature_fd[feature] += 1
        cfd = ConditionalFreqDist(pairs)

        if DEBUG:
            print(cfd)
            print(cfd.conditions())
            #cfd.tabulate(samples=['gbs', 'build', 'spec', 'repo', 'config'])
            cfd.tabulate()
            for author in cfd.conditions():
                print('AUTHOR:', author)
                for word, count in cfd[author].items():
                    print('%5d %20s' % (count, word))

        self.voc = voc = list(feature_fd.keys())

        self.cls_feature_prob = cls_feature_prob = {}
        self.cls_and_feature_prob = cls_and_feature_prob = {}
        for cls, total in cls_fd.items():
            fd = cfd[cls]

            for word in voc:
                if word in fd:
                    cls_feature_prob[(cls, word)] = float(fd[word]) / total
                    cls_and_feature_prob[(cls, word)] = float(fd[word]) / total_y
                else:
                    # smooth unseen (class, word) pairs with a pseudo-count of 1
                    cls_feature_prob[(cls, word)] = 1. / total
                    cls_and_feature_prob[(cls, word)] = 1. / total_y

        self.feature_prob = feature_prob = {}
        for word, count in feature_fd.items():
            feature_prob[word] = count / total_y
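learn() depends on a get_words tokenizer that is outside this excerpt; a minimal stand-in consistent with the call above (the implementation is assumed):

def get_words(text):
    # naive whitespace tokenization; the original may use a real tokenizer
    return text.lower().split()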
Example #7
def modal_analysis(keyword_list, modals_list):
    # ConditionalFreqDist expects (condition, sample) pairs, not two separate lists
    cfd = ConditionalFreqDist(zip(keyword_list, modals_list))
    return cfd.tabulate(conditions=keyword_list, samples=modals_list)
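An illustrative call with made-up data (tabulate() prints its table and returns None):

modal_analysis(['news', 'news', 'romance'], ['can', 'may', 'can'])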
Example #8
from nltk.corpus import brown
from nltk import ConditionalFreqDist as CondFreqDist

categories = brown.categories()
words = ["likely" , "perhaps" , "probably" , "maybe" ]
words = ["female" , "male" , "gentleman" , "lady" , "boy" , "girl"]
cfd = CondFreqDist([(cat , word) for cat in categories\
					for word in brown.words(categories = cat)])
cfd.tabulate(conditions = categories , samples = words)

Example #9
def frequency_table(words):
    cfd = ConditionalFreqDist([(genre, word)
                               for genre in brown.categories()
                               for word in brown.words(categories=genre)])
    genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
    cfd.tabulate(conditions=genres, samples=words)
Example #10
from nltk.corpus import brown
from nltk import FreqDist
brown.categories()
news_text=brown.words(categories="news")
gov=brown.words(categories='government')
fdist=FreqDist([w.lower() for w in news_text])
fdist_gov=FreqDist([w.lower() for w in gov])
modals=["can","could", "may", "might","must", "will"]
for m in modals:
    # each modal's frequency relative to "can", in government vs news text
    print(m + ': ' + str(fdist_gov[m]/fdist_gov[modals[0]]) + " " + str(fdist[m]/fdist[modals[0]]))
from nltk import ConditionalFreqDist
cfd=ConditionalFreqDist((genre, word) for genre in brown.categories()
                        for word in [word.lower() for word in brown.words(categories=genre)])
days=['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
cfd.tabulate(conditions=['news', 'romance'], samples=days)
sent="In the beginning God created the heaven and the earth".split(sep=" ")+["."]
from nltk import bigrams
list(bigrams(sent))
# random text generator
import random
def generate_model(cfdist, word, num=15):
    for i in range(num):
        print(word, end=' ')
        # sample the next word in proportion to how often it follows the current one
        successors = list(cfdist[word].keys())
        weights = list(cfdist[word].values())
        word = random.choices(successors, weights=weights)[0]
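To exercise generate_model, build a bigram ConditionalFreqDist over a real corpus; Genesis and the seed word 'living' follow the NLTK book's example and are assumptions here:

from nltk.corpus import genesis
from nltk import bigrams, ConditionalFreqDist
text = genesis.words('english-kjv.txt')
cfd_bigrams = ConditionalFreqDist(bigrams(text))
generate_model(cfd_bigrams, 'living')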
Example #11
words = [
    word for word in [word.lower() for word in text]
    if word not in stopwords.words("english")
]
bigrams = list(nltk.bigrams(words))
freqdist = sorted(FreqDist(bigrams).items(), key=itemgetter(1), reverse=True)

#4 -----------------------------------------------------

confreqdist = ConditionalFreqDist((genre, word)
                                  for genre in brown.categories()
                                  for word in brown.words(categories=genre))
words = [
    "mountain", "monster", "river", "eat", "run", "keys", "paper", "joke",
    "war"
]
confreqdist.tabulate(samples=words)

#5 -----------------------------------------------------


def freqOfWord(word, genre):
    fd = FreqDist(brown.words(categories=genre))
    print(word, 'in', genre, ':', fd[word])


freqOfWord('war', 'news')  # illustrative arguments; the original line was a bare, no-op reference

for genre in brown.categories():
    s = 0
    for type in confreqdist[genre]:
        s += confreqdist[genre][type]  # s ends as the genre's total token count
Example #12
from nltk import ConditionalFreqDist
from nltk.corpus import brown
import matplotlib
words = ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said']
CondFreqDist = ConditionalFreqDist(
    (categorie, word) for categorie in brown.categories()
    for word in brown.words(categories=categorie))
CondFreqDist.tabulate(samples=words)
Example #13
    def inspect(self, missed):
        """
        Inspect a testing session, and print data about tag accuracy
        
        :param missed: list of tuples of missed tags like:
            (hmm_tagged_word, gold_tagged_word, hmm_context, gold_context)
        """

        # create a CFD so we can examine a matrix of incorrect vs correct tags
        # ms[1][1] = tag of a gold_tagged_word
        # ms[0][1] = tag of an hmm_tagged_word
        cfd = ConditionalFreqDist((ms[1][1], ms[0][1]) for ms in missed)

        # initialize a hash to store mistakes by frequency
        mistakes = {}

        # print a table showing mistake frequency
        cfd.tabulate()
        msg("\n")

        # loop through mistake frequencies by gold standard tag, i.e., if we are
        # examining gold-standard 'IN', count what we incorrectly tagged it as
        conds = cfd.conditions()
        for g_tag in conds:
            for hmm_tag in cfd[g_tag].keys():
                # how many times did we incorrectly say g_tag was hmm_tag?
                count = cfd[g_tag][hmm_tag]

                # add these mistakes to the count
                if count not in mistakes.keys():
                    mistakes[count] = []
                mistakes[count].append((hmm_tag, g_tag))

        # get a list of all mistake types that occurred over a threshold, worst first
        mistake_counts = set([count for (count, mistake_set) in
            mistakes.items() if count > Tagger.mistake_threshold])
        mistake_counts = reversed(sorted(mistake_counts))

        # now create a list of mistake types to show the user, i.e., loop
        # through all types and if they are of a high-frequency type, add to list
        mistakes_to_halt = []
        for count in mistake_counts:
            mistake_set = mistakes[count]
            for mistake_tuple in mistake_set:
                mistakes_to_halt.append(mistake_tuple)
                msg("%d\t%s\twas really\t%s\n" % (count, mistake_tuple[0], \
                    mistake_tuple[1]))
        msg("\n")

        # create separators used when outputting missed word contexts
        sep_big = "---------------------------------------------------\n"
        sep_small = "\n-----------------------------------------\n"

        # loop through individual mistakes and, if they match the kind of error
        # we want to halt for, show the user the mistake as well as the sentence
        # context for both the gold-standard sentence and the hmm-tagged sentence
        response = None
        for missed_set in missed:
            if response not in ['q', 'Q']:
                (hmm_tagged_word, gold_tagged_word, hmm_tagged_sent, \
                    gold_tagged_sent) = missed_set
                should_halt = False
                # determine whether the current mistake matches a mistake type
                # we want to halt for
                for pair in mistakes_to_halt:
                    if hmm_tagged_word[1] == pair[0] and \
                        gold_tagged_word[1] == pair[1]:
                        should_halt = True
                if should_halt:
                    msg("%sTagged '%s' with %s when it should have been %s.%s" %\
                    (sep_big, hmm_tagged_word[0], hmm_tagged_word[1],\
                        gold_tagged_word[1], sep_small))

                    msg("Gold: " + (' '.join([(w[0] + "/" + w[1]) for w in \
                        gold_tagged_sent])))
                    msg(sep_small)
                    msg("Mine: " + (' '.join([(w[0] + "/" + w[1]) for w in \
                        hmm_tagged_sent])))

                    # get user input to decide whether to keep going
                    response = input("\n\nEnter to continue, Q to quit: ")
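inspect() reports through a msg helper defined elsewhere; a minimal stand-in consistent with the call sites (the body is an assumption):

import sys

def msg(text):
    # write without appending a newline; callers include their own \n
    sys.stdout.write(text)
    sys.stdout.flush()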
#!/usr/bin/python3
# coding: utf-8
import nltk
from nltk import ConditionalFreqDist
from nltk.corpus import brown
from nltk.corpus import names
from nltk.corpus import inaugural
from nltk.corpus import toolbox
from nltk.corpus import udhr
##################################################################
## A simple ConditionalFreqDist application: text sentiment analysis
word = ['实惠', '快', '也好', '快', '也好']
anls = ['1', '1', '1', '-1', '1']
tmp_Con = ConditionalFreqDist(zip(word, anls))
print(tmp_Con)  # <ConditionalFreqDist with 3 conditions>; identical conditions are merged
tmp_Con.tabulate()  # tabulate() prints its table and returns None
print(tmp_Con.conditions())  # ['实惠', '快', '也好']
print(tmp_Con['快'].most_common())  # [('1', 1), ('-1', 1)]
print(tmp_Con['快'].keys())  # dict_keys(['1', '-1'])
print(len(tmp_Con['快'].keys()))  # 2; how many distinct labels each word received
print(len(tmp_Con['也好'].keys()))  # 1; repeated (word, label) pairs collapse into one key
print([condition for condition in tmp_Con.conditions() if len(tmp_Con[condition].keys()) > 1])  # ['快']
tmp_Con.plot()
tmp_Con_1 = ConditionalFreqDist(zip(anls, word))
print(tmp_Con_1.conditions())  # ['1', '-1']
##################################################################
## Categorizing words in the Brown corpus
print(brown.categories())  # ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
cfd = nltk.ConditionalFreqDist((genre, word) for genre in brown.categories() for word in brown.words(categories=genre))  # categories=genre here must not be omitted
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']  # picked from brown.categories()
modals = ['can', 'could', 'may', 'might', 'must', 'will']  # a few arbitrarily chosen words
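The natural closing step, mirroring the NLTK book's modal-by-genre table (this call is not in the excerpt and is assumed):

cfd.tabulate(conditions=genres, samples=modals)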
Example #15
from nltk.corpus import brown

# Introduction to the Brown Corpus
print(brown.categories())

# Accessing words of the Brown Corpus

print(brown.words(categories='lore'))

# Introduction to Conditional Frequency Distribution
from nltk import ConditionalFreqDist  # import statement

# pair_list [ (condition, word) ]
pair_list = [(category, word) for category in brown.categories()
             for word in brown.words(categories=category)]

print(pair_list[:10])

freqdist = ConditionalFreqDist(pair_list)
print(freqdist['lore']['the'])

# The tabulate() method

category = ['adventure', 'lore', 'news']
samples = ['the', 'and', 'man']
freqdist.tabulate(conditions=category, samples=samples)
Example #17
import argparse
import string
import sys
from nltk import ConditionalFreqDist, FreqDist, bigrams
from nltk.lm.preprocessing import flatten, pad_both_ends

def clean_text(data):  # name assumed; the original def line is not in the excerpt
    translator = str.maketrans('', '', string.punctuation)  # strip punctuation
    clean_data = []
    for i in data:
        clean_data.append(i.translate(translator))

    return clean_data


def preprocess_text(sequences):
    # Preprocessed : iterator on sequence
    preprocessed = [pad_both_ends(s.split(' '), n=2) for s in sequences]
    return list(flatten(preprocessed))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("file",
                        nargs='?',
                        default=sys.stdin,
                        help='corpus text file to analyse')

    args = parser.parse_args()

    sequences = get_text(args.file)

    tokens = preprocess_text(sequences)

    fd = FreqDist(tokens)
    model = bigrams(tokens)
    cfd = ConditionalFreqDist(model)

    cfd.tabulate()  # tabulate() prints directly and returns None
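The get_text helper called above is not included; a minimal stand-in consistent with the argparse setup (the name comes from the call site, the body is assumed):

def get_text(source):
    # accept either a path or an already-open stream such as sys.stdin
    f = open(source) if isinstance(source, str) else source
    return [line.strip() for line in f if line.strip()]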
         if token['tag'] is None:
             short_tag = '--'
         else:
             short_tag = token['tag'][:2] + token['tag'][-1:]
         long_tag = token['tag']
     tag_types.add(long_tag)
     if token['lemma']:
         lemma_pos = token['lemma']+'.'+get_wordnet_pos(token['pos'])
         lemma_pairs.append((token['lemma'], short_tag))
         lemma_long_pairs.append((token['lemma'], long_tag))
     tagged_pairs.append((token['textlc'], short_tag))
 
 # Print vocabularies for each tag type
 for tag_type in tag_types:
     vocabulary_cfd = ConditionalFreqDist([(lemma, long_tag) for (lemma, long_tag) in lemma_long_pairs if long_tag == tag_type])
     vocabulary_cfd.tabulate()  # tabulate() prints directly
 
 #events_cfd = ConditionalFreqDist(tagged_pairs)
 # Conditional frequency distribution for (lemma, tag) pairs
 events_cfd = ConditionalFreqDist(lemma_pairs)
 
 unambiguous_words = [word for word in events_cfd.conditions() if len(events_cfd[word].items()) < 2]
 
 ambiguous_words = [word for word in events_cfd.conditions() if len(events_cfd[word].items()) > 1]
 
 print "Unambiguous Words"
 print events_cfd.tabulate(conditions=unambiguous_words)
 
 print "Ambiguous Words"
 print events_cfd.tabulate(conditions=ambiguous_words)
 
Example #20
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 20 01:01:54 2018

@author: vpapg
"""

# Download some text from a language that has vowel harmony (e.g. Hungarian), extract the vowel sequences of words, and create a vowel bigram table.

from nltk.corpus import brown
from nltk import ConditionalFreqDist, Index
import re

romance = brown.words(categories='romance')

vowel_seqs_list = [(pair, w) for w in romance for pair in re.findall(r'[aeiou][aeiou]', w)]
vowel_seqs_index = Index(vowel_seqs_list)
#print(vowel_seqs_index['aa'])

l = [element[0] for element in vowel_seqs_list]

cfd = ConditionalFreqDist(l)
cfd.tabulate()
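One caveat: re.findall only returns non-overlapping matches, so a three-vowel run like 'eau' contributes 'ea' but not 'au'. A lookahead variant counts every overlapping vowel bigram (a refinement, not part of the original exercise):

pairs = [bg for w in romance for bg in re.findall(r'(?=([aeiou]{2}))', w)]
cfd_overlapping = ConditionalFreqDist(pairs)  # a 2-char string unpacks into (condition, sample)
cfd_overlapping.tabulate()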
from nltk.corpus import brown

# Introduction to the Brown Corpus

print(brown.categories())

# Accessing words of Brown Corpus

print(brown.words(categories='adventure'))

# Introduction to Conditional Frequency Distribution

from nltk import ConditionalFreqDist

pair_list = [(genre, word) for genre in brown.categories()
             for word in brown.words(categories=genre)]

cfd = ConditionalFreqDist(pair_list)
genres = ['news', 'romance', 'religion', 'humor']
modals = ['it', 'could', 'may', 'might', 'must']

cfd.tabulate(conditions=genres, samples=modals)

# conditions method

print(cfd.conditions())

print(cfd['romance']['could'])
from nltk.corpus import brown
from nltk import ConditionalFreqDist as CondFreqDist

cfd = CondFreqDist(
    [
        (genre, word.lower())
        for genre in brown.categories()
        for target in ["romance", "news"]
        if genre.lower().startswith(target)
        for word in brown.words(categories=target)
    ]
)
days = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "love", "political"]
cfd.tabulate(samples=days)