Example #1
 def test_tabulate(self):
     empty = ConditionalFreqDist()
     self.assertEqual(empty.conditions(), [])
     with pytest.raises(ValueError):
         empty.tabulate(
             conditions="BUG")  # nonexistent keys shouldn't be added
     self.assertEqual(empty.conditions(), [])
def visualize_monthly_news_stats2(csvfolder=metacorpus.statspath, csvname=metacorpus.prunedmetafilename,
                                 imgoutpath=metacorpus.imgfolder,
                                 rescatmap=metacorpus.resourcecategorymap2):
    colldf = IOtools.readcsv(csvfolder+os.sep+csvname)
    
    numoftexts, _ = colldf.values.shape
    
    
    # daily news counts for resources
    cfddailyresourcecount = ConditionalFreqDist((colldf.loc[i,"date"], colldf.loc[i,"resource"].strip()) for i in range(numoftexts))
    CFDhelpers.cfd2csv(cfddailyresourcecount, csvfolder+os.sep+"cfddailyresourcecount2.csv", colnames=['date','resource','count'])
    #cfdresourcesdaycount = ConditionalFreqDist((resource, day) for day in cfddailyresourcecount.conditions() for resource in list(cfddailyresourcecount[day]))
    
    
    # daily news counts for categories
    cfddailycategorycount = ConditionalFreqDist((colldf.loc[i,"date"], 
                                                 "_".join(map(lambda x : str(x).strip(), [colldf.loc[i, "resource"], colldf.loc[i, "category"]]))) for i in range(numoftexts)) 
    CFDhelpers.cfd2csv(cfddailycategorycount, csvfolder+os.sep+"cfddailycategorycount2.csv", ["date", "category", 'count'])
    #cfdcatsdaycount = ConditionalFreqDist((category, date) for date in cfddailycategorycount.conditions() for category in list(cfddailycategorycount[date]))

    
    
    # visualize monthly --- assuming the dates are of the form yyyy-mm-dd (they were recorded in that format)
    
    cfdmonthlyresourcecount = ConditionalFreqDist((colldf.loc[i,"date"][:-3], colldf.loc[i,"resource"].strip()) for i in range(numoftexts))
    CFDhelpers.cfd2csv(cfdmonthlyresourcecount, csvfolder+os.sep+"cfdmonthlyresourcecount.csv", colnames=['month','resource','count'])
    #cfdresourcesmonthcount = ConditionalFreqDist((resource, month) for month in cfdmonthlyresourcecount.conditions() for resource in list(cfdmonthlyresourcecount[month]))
    imgpath = IOtools.ensure_dir(os.path.join(imgoutpath, "resourcebasednewscount"))
    visualize_monthly_cfd(cfd=cfdmonthlyresourcecount, figuretitle="Monthly news count for each resource", ylabel="news published", imgoutpath=imgpath)



    
    cfdmonthlycategorycount = ConditionalFreqDist((colldf.loc[i,"date"][:-3], 
                                                   "-".join(map(lambda x : str(x).strip(), [colldf.loc[i, "resource"], colldf.loc[i, "category"]]))) 
                                                  for i in range(numoftexts)) 
    CFDhelpers.cfd2csv(cfdmonthlycategorycount, csvfolder+os.sep+"cfdmonthlycategorycount.csv", ["month", "category", 'count'])
    #cfdcatsmonthcount = ConditionalFreqDist((category, month) for month in cfdmonthlycategorycount.conditions() for category in list(cfdmonthlycategorycount[month]))
    
    imgpath = IOtools.ensure_dir(os.path.join(imgoutpath, "categorybasednewscount"))
    for canoniccatname, rescatnamedct in rescatmap.iteritems():
        monthresourcepairs = []
        
        for resourcename, origcats in rescatnamedct.iteritems(): 
        
            for origcatname in origcats:
                #resourcename = rescat.split("-")[0]
                rescat = "-".join([resourcename, origcatname])
                for month in cfdmonthlycategorycount.conditions():
                    numofoccurrences = cfdmonthlycategorycount[month][rescat]
                    #print resourcename," had ",numofoccurrences," times texts in :",rescat," during ",month
                    for i in range(numofoccurrences):
                        monthresourcepairs.append((month, resourcename))
                        
        cfdmonthlyresourcecount_percat = ConditionalFreqDist(monthresourcepairs) 
            
        print canoniccatname,resourcename," * ",rescat," : ",len(cfdmonthlyresourcecount_percat.conditions()),"  ",cfdmonthlyresourcecount_percat.N()
        figuretitle = "Monthly news count of each resource over category "+canoniccatname.upper()
        visualize_monthly_cfd(cfdmonthlyresourcecount_percat, figuretitle, ylabel="news published", imgoutpath=imgpath)
Example #3
 def test_plot(self):
     empty = ConditionalFreqDist()
     self.assertEqual(empty.conditions(), [])
     try:
         empty.plot(conditions="BUG")  # nonexistent keys shouldn't be added
     except:
         pass
     self.assertEqual(empty.conditions(), [])
    def calculate_vector_spaces(self,k=16):
        cfd = ConditionalFreqDist(
                   (word, doc['document'])
                   for doc in self.mongo[CORPUS_CLN].find()
                   for word in self.interestingWords(doc['document']))
        cfd.tabulate()

        # matrix dimensions
        terms = [c for c in cfd.conditions()] # conditions = words
        docs  = sorted(set(v for c in cfd.conditions() for v in cfd[c]))
        self.log("terms: %s"%str(terms))
        self.log("docs: %s"%str(docs))
        term_by_doc_mat = np.zeros(shape=(len(terms),len(docs)))
        self.log("Term-by-ref-document matrix shape is: %d X %d"%(len(terms),len(docs)))
        for i, term in enumerate(terms):
            li = np.array([cfd[term][doc] for doc in docs])
            term_by_doc_mat[i] = li
        self.log("Matrix\n%s"%str(term_by_doc_mat))

        # perform singular value decomposition
        u,sigma,vh = self._do_svd(term_by_doc_mat,k) 
        del term_by_doc_mat # don't need the matrix anymore

        # map terms to svd space
        terms_space = np.zeros(shape=(len(terms),k))
        for i in xrange(len(terms)):
            vals = [u[i][j] * sigma[j] for j in range(k)] # x-coord = row i, column 1
            terms_space[i] = np.array(vals)

        # map docs to svd space
        docs_space = np.zeros(shape=(len(docs),k))
        for i in xrange(len(docs)):
            vals = [ vh[i][j] * sigma[j] for j in range(k)]
            docs_space[i] = np.array(vals)

        # store matrix data
        row = self.mongo['data'].find_one()
        if not row:
            row = {'terms': terms, 
                   'documents':docs,
                   'terms_subspace':terms_space.tolist(),
                   'docs_subspace':docs_space.tolist(),
                   'u':u.tolist(),
                   'sigma':sigma.tolist(),
                   'vh':vh.tolist(),
                   'date':datetime.utcnow()}
        else:
            row['terms'] = terms
            row['documents'] = docs
            row['terms_subspace'] = terms_space.tolist()
            row['docs_subspace'] = docs_space.tolist()
            row['u'] = u.tolist()
            row['sigma'] = sigma.tolist()
            row['vh'] = vh.tolist()
            row['date'] = datetime.utcnow()

        self.mongo['data'].save(row)
        self.log("Saved matrix data")
Example #5
def findBestWords(wordsInCategories, scoreFunction=BigramAssocMeasures.chi_sq, max_words=1000):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for category, words in wordsInCategories:
        word_fd.update(words)
        label_word_fd[category].update(words)

    word_counts = {}
    for condition in label_word_fd.conditions():
        word_counts[condition] = label_word_fd[condition].N()

    total_word_count = 0
    for condition, count in word_counts.items():
        total_word_count += count

    word_scores = {}

    for word, freq in word_fd.items():
        score = 0
        for condition, count in word_counts.items():
            score += scoreFunction(label_word_fd[condition][word], (freq, word_counts[condition]), total_word_count)
        word_scores[word] = score

    best = sorted(word_scores.items(), key=lambda t: t[1], reverse=True)[:max_words]
    return set([w for w, s in best])
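
A minimal usage sketch for findBestWords, assuming NLTK's movie_reviews corpus is installed; the corpus choice and the max_words value are illustrative, not from the original source:

from nltk.corpus import movie_reviews

# one (category, word list) pair per document
wordsInCategories = [(category, movie_reviews.words(fileid))
                     for category in movie_reviews.categories()
                     for fileid in movie_reviews.fileids(category)]

best_words = findBestWords(wordsInCategories, max_words=500)
print(len(best_words))  # at most 500 high-information words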
Example #6
class NgramModel(object):
    """A simple N-gram model."""
    def __init__(self, n, training_data):
        """Create an n order model using training_data."""
        # Set n and train
        self._n = n
        train_ngrams = _make_ngram_tuples(training_data, self._n)
        self._cfd = ConditionalFreqDist(
            (context, event) for (context, event) in train_ngrams)
        self._estimators = dict((context, self._cfd[context])
                                for context in self._cfd.conditions())

    def prob(self, event, context):
        """Return the probability for an event in the provided context"""
        context = tuple(context)
        try:
            return self._estimators[context].freq(event)
        except KeyError:
            return 0.0

    def seqprob(self, seq):
        """Return the probability of a sequence."""
        prob = 1.0
        for context, event in _make_ngram_tuples(seq, self._n):
            prob *= self.prob(event, context)
        return prob

    def allngrams(self):
        """Return all N-grams observed by the model and their probabilities."""
        ngram_probs = ((event, context, self.prob(event, context))
                       for context, dist in self._estimators.items()
                       for event in dist)
        return sorted(ngram_probs, key=itemgetter(1))
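
The class above relies on a _make_ngram_tuples helper that is not shown in this example. A minimal sketch of what such a helper might look like follows (the '<s>' padding symbol and the ((context), event) return format are assumptions), together with an illustrative call:

def _make_ngram_tuples(tokens, n):
    # hypothetical helper: pad the sequence and yield ((context...), event)
    # pairs, where the context is the (n-1)-tuple of preceding tokens
    padded = ('<s>',) * (n - 1) + tuple(tokens)
    return [(padded[i:i + n - 1], padded[i + n - 1])
            for i in range(len(padded) - n + 1)]

model = NgramModel(2, "the cat sat on the mat".split())
print(model.prob("cat", ("the",)))  # relative frequency of "cat" after "the" -> 0.5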
Example #7
    def constructTransitionMatrix(self, sourceFilesList: list):
        #construction of the transition matrix
        for fileName in sourceFilesList:
            file = open(fileName, 'r', encoding="windows-1256")
            fileFinal = ""
            for line in file:
                line = line.upper()
                if (len(line) > 1):
                    if not line.startswith("<S>"):
                        fileFinal += '<S> ' + line[:-1] + ' <E>\n'
                    else:
                        fileFinal += line[:-1] + '\n'
            file.close()

        tokens = [el for el in re.split("[\s\n]+", fileFinal) if el != '']
        self.initialProbabilities = FreqDist([
            tokens[i] for i in range(1, len(tokens)) if tokens[i - 1] == '<S>'
        ])

        self.tags = list(set(tokens))
        self.bigramDist = FreqDist(list(bigrams(tokens)))
        Trigrams = list(trigrams(tokens))
        cfd = ConditionalFreqDist(((el[2], (el[0], el[1])) for el in Trigrams))

        for word in cfd.conditions():
            for bigram in cfd[word]:
                cfd[word][bigram] = round(
                    float("{0:.6f}".format(cfd[word].freq(bigram))), 6)

        self.TRANSITION_MATRIX = cfd
        return cfd
Example #8
class NgramModel(object):
    """A simple N-gram model."""

    def __init__(self, n, training_data):
        """Create an n order model using training_data."""
        # Set n and train
        self._n = n
        train_ngrams = _make_ngram_tuples(training_data, self._n)
        self._cfd = ConditionalFreqDist((context, event) for (context, event) in train_ngrams)
        self._estimators = dict((context, self._cfd[context]) for context in self._cfd.conditions())

    def prob(self, event, context):
        """Return the probability for an event in the provided context"""
        context = tuple(context)
        try:
            return self._estimators[context].freq(event)
        except KeyError:
            return 0.0

    def seqprob(self, seq):
        """Return the probability of a sequence."""
        prob = 1.0
        for context, event in _make_ngram_tuples(seq, self._n):
            prob *= self.prob(event, context)
        return prob

    def allngrams(self):
        """Return all N-grams observed by the model and their probabilities."""
        ngram_probs = (
            (event, context, self.prob(event, context)) for context, dist in self._estimators.items() for event in dist
        )
        return sorted(ngram_probs, key=itemgetter(1))
Example #9
    def test_increment(self):
        # make sure that we can still mutate cfd normally
        text = "cow cat mouse cat tiger"
        cfd = ConditionalFreqDist()

        # create cfd with word length as condition 
        for word in tokenize.word_tokenize(text):
            condition = len(word)
            cfd[condition][word] += 1

        self.assertEqual(cfd.conditions(), [3,5])

        # incrementing previously unseen key is still possible
        cfd[2]['hi'] += 1
        self.assertEqual(set(cfd.conditions()),set([3,5,2])) # new condition added
        self.assertEqual(cfd[2]['hi'], 1) # key's frequency incremented from 0 (unseen) to 1
Example #10
    def _train(self, tagged_corpus, cutoff=0, verbose=False):
        """
        Initialize this C{ContextTagger}'s L{_context_to_tag} table
        based on the given training data.  In particular, for each
        context C{I{c}} in the training data, set
        C{_context_to_tag[I{c}]} to the most frequent tag for that
        context.  However, exclude any contexts that are already
        tagged perfectly by the backoff tagger(s).

        The old value of C{self._context_to_tag} (if any) is discarded.

        @param tagged_corpus: A tagged corpus.  Each item should be
            a C{list} of C{(word, tag)} tuples.
        @param cutoff: If the most likely tag for a context occurs
            fewer than C{cutoff} times, then exclude it from the
            context-to-tag table for the new tagger.
        """

        token_count = hit_count = 0

        # A context is considered 'useful' if it's not already tagged
        # perfectly by the backoff tagger.
        useful_contexts = set()
        
        # Count how many times each tag occurs in each context.
        fd = ConditionalFreqDist()
        for sentence in tagged_corpus:
            tokens, tags = zip(*sentence)
            for index, (token, tag) in enumerate(sentence):
                # Record the event.
                token_count += 1
                context = self.context(tokens, index, tags[:index])
                if context is None: continue
                fd[context].inc(tag)
                # If the backoff got it wrong, this context is useful:
                if (self.backoff is None or
                    tag != self.backoff.tag_one(tokens, index, tags[:index])):
                    useful_contexts.add(context)

        # Build the context_to_tag table -- for each context, figure
        # out what the most likely tag is.  Only include contexts that
        # we've seen at least `cutoff` times.
        for context in useful_contexts:
            best_tag = fd[context].max()
            hits = fd[context][best_tag]
            if hits > cutoff:
                self._context_to_tag[context] = best_tag
                hit_count += hits

        # Display some stats, if requested.
        if verbose:
            size = len(self._context_to_tag)
            backoff = 100 - (hit_count * 100.0)/ token_count
            pruning = 100 - (size * 100.0) / len(fd.conditions())
            print "[Trained Unigram tagger:",
            print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
                size, backoff, pruning)
Example #11
def nltk_test_3():
    # For each token, count current word given previous word.
    # Create distribution object.
    # cfd = ConditionalFreqDist()
    # for word in word_tokenize(sent):
    #     condition = len(word)
    #     cfd[condition][word] += 1
    words = gutenberg.words('austen-persuasion.txt')
    cfd = ConditionalFreqDist((words[i], words[i + 1]) for i in range(len(words) - 1))
    # Start predicting at the given word, say 'therefore'
    word = 'therefore'
    i = 1
    print cfd.N()
    print cfd.conditions()
    # Find all words that can possibly follow the current word and choose one at random
    while i <= 20:
        print word,
        lwords = list(cfd[word])
        follower = choice(lwords)
        word = follower
        i += 1
Example #12
    def learn(self, A):
        total_y = float(len(A))
        self.cls_fd = cls_fd = FreqDist()
        self.feature_fd = feature_fd = FreqDist()
        pairs = []
        for x, y in A:
            cls_fd.inc(y)
            for feature in set(get_words(x)):
                pairs.append((y, feature))
                feature_fd.inc(feature)
        cfd = ConditionalFreqDist(pairs)

        if DEBUG:
            print cfd
            print cfd.conditions()
            #cfd.tabulate(samples=['gbs', 'build', 'spec', 'repo', 'config'])
            cfd.tabulate()
            for author in cfd.conditions():
                print 'AUTHOR:', author
                for word, count in cfd[author].items():
                    print '%5d %20s' % (count, word)

        self.voc = voc = feature_fd.keys()

        self.cls_feature_prob = cls_feature_prob = {}
        self.cls_and_feature_prob = cls_and_feature_prob = {}
        for cls, total in cls_fd.items():
            fd = cfd[cls]

            cls_feature_prob[cls] = wc = {}
            for word in voc:
                if word in fd:
                    cls_feature_prob[(cls, word)] = float(fd[word]) / total
                    cls_and_feature_prob[(cls, word)] = float(fd[word]) / total_y
                else:
                    cls_feature_prob[(cls, word)] = 1. / total
                    cls_and_feature_prob[(cls, word)] = 1. / total_y

        self.feature_prob = feature_prob = {}
        for word, count in feature_fd.items():
            feature_prob[word] = count / total_y
Example #13
def subword_char_ngram(text_fileid_map, n):
    corpus_ngramitems = []
    for tid, text in text_fileid_map.iteritems():
        words = text.split()
        ngramitems = []
        for w in words:
            ngramitems.extend(ngrams(w, n))
        for ngramitem in ngramitems:
            corpus_ngramitems.append((tid, ngramitem))
    cfd = ConditionalFreqDist(corpus_ngramitems)
    print cfd.N()," ",len(cfd.conditions())
    return cfd
    # NOTE: the call below is unreachable leftover code (csvpath is undefined at this point):
    # CFDhelpers.cfd2csv(cfd=cfd, csvpath=csvpath)
Example #14
    def postags(self,
                pos=None,
                sort=False,
                top=0,
                universal_tagset=False,
                ret_cond=False):
        '''Builds frequency dictionaries, or frequency-sorted lists,
        of parts of speech.'''
        def merge(tags):
            result = FreqDist()
            for tag in tags:
                result += cfd[tag]
            return result

        maps = {
            'NOUN': {'NN', 'NNS', 'NNP', 'NNPS'},
            'VERB': {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'},
            'ADJ': {'JJ', 'JJR', 'JJS'},
            'ADV': {'RB', 'RBR', 'RBS'},
        }

        cfd = ConditionalFreqDist()

        for sent in self._sents:
            #tokens = sent.untagging()
            tokens = sent.tags
            for tok, tag, lemma in tokens:
                cfd[tag][lemma.lower()] += 1
        cond = cfd.conditions()

        result = cfd

        if pos:
            if not universal_tagset and pos in maps:
                result = merge(maps[pos])
            else:
                result = cfd[pos]

        if top:
            result = _top(result, top)
        else:
            result = _sort(result, sort)

        if ret_cond:
            result = result, cond

        return result
Example #15
def init_prob_unit():
    # initialize uniform prob distribution to t(e|f)
    print("Initializing Uniform Prob distribution")
    N = len(de_inp)
    if N != len(en_inp):
        print("number of lines in src and target don't match!")
    ten_de = CondFDist()
    for num in range(N):
        for de_word in de_inp[num].split():
            for en_word in en_inp[num].split():
                ten_de[de_word].inc(en_word)
    # make probs uniform
    for de_word in ten_de.conditions():
        for key in ten_de[de_word].keys():
            ten_de[de_word][key] = 1.0 / len(ten_de[de_word])
            # print(ten_de[de_word][key])
    return ten_de
def visualize_monthly_cfd(cfd, figuretitle, ylabel, imgoutpath):
    cfd_reverse = ConditionalFreqDist((entity, month) for month in cfd.conditions() for entity in list(cfd[month]))
    
    months = cfd.conditions()
    months.sort()
    
    barlabels = cfd_reverse.conditions()
    #print months
    print barlabels
    
    
    yitemsmatrix = []
    
    for entity in barlabels:
        row = []
        for month in months:
            row.append(cfd[month][entity])
        yitemsmatrix.append(row)
    
    
    if len(barlabels) == 0 or len(yitemsmatrix) == 0:
        return
    
    yitemsmatrix = np.array(yitemsmatrix)
    #yitemsmatrix = yitemsmatrix.T
    print yitemsmatrix.shape
    
    colors = plotter.get_n_colors(len(barlabels))
    months = map(lambda x : str(x), months)
    
    
    # partition the figure in case x axis gets too large by the number of months
    numofxitems = 5
    numoffigures = (len(months) / numofxitems ) + 1
    for i in range(numoffigures):
        matrix = yitemsmatrix[:, (i*numofxitems) : ((i+1)*numofxitems)] 
        print matrix
        xlabels = months[(i*numofxitems) : ((i+1)*numofxitems)]
        # save fig. pass img path with i
        figurename = figuretitle + " "+ str(i)
        cfdplotter.multiplebargraphs(barlabels, matrix.tolist(), colors, figurename, xlabels, ylabel, imgpath=imgoutpath)
Example #17
from load_data import get_df, select_columns
from itertools import combinations
from nltk import ConditionalFreqDist

# get the data as a dataframe
df = get_df(shortname='clean_apx')
mask_data_source = df['DataSource'] == 'APX'
df_select = df[mask_data_source]

# choose subset of columns and cast all values as string
df_select = df_select[select_columns].astype(str)

# choose a smaller subset of columns to analyse
report_columns = select_columns[1:5]

# a list of all pairwise combinations
combo_count = 2
groupby_columns = list(combinations(report_columns, combo_count))

# create a list of tuples
groupby_column = list(groupby_columns[0])
arr = df_select[list(groupby_column)].values
pairs = list(tuple(map(tuple, arr)))

# and now for the good stuff
cfd = ConditionalFreqDist(pairs)
conditions = cfd.conditions()

import pdb
pdb.set_trace()
Example #18
    def inspect(self, missed):
        """
        Inspect a testing session, and print data about tag accuracy
        
        :param missed: list of tuples of missed tags like:
            (hmm_tagged_word, gold_tagged_word, hmm_context, gold_context)
        """

        # create a CFD so we can examine a matrix of incorrect vs correct tags
        # ms[1][1] = tag of a gold_tagged_word
        # ms[0][1] = tag of an hmm_tagged_word
        cfd = ConditionalFreqDist((ms[1][1], ms[0][1]) for ms in missed)

        # initialize a hash to store mistakes by frequency
        mistakes = {}

        # print a table showing mistake frequency
        cfd.tabulate()
        msg("\n")

        # loop through mistake frequencies by gold standard tag, i.e., if we are
        # examining gold-standard 'IN', count what we incorrectly tagged it as
        conds = cfd.conditions()
        for g_tag in conds:
            for hmm_tag in cfd[g_tag].keys():
                # how many times did we incorrectly say g_tag was hmm_tag?
                count = cfd[g_tag][hmm_tag]

                # add these mistakes to the count
                if count not in mistakes.keys():
                    mistakes[count] = []
                mistakes[count].append((hmm_tag, g_tag))

        # get a list of all mistake types that occurred over a threshold, worst first
        mistake_counts = set([count for (count, mistake_set) in \
            mistakes.iteritems() if count > Tagger.mistake_threshold])
        mistake_counts = reversed(sorted(mistake_counts))

        # now create a list of mistake types to show the user, i.e., loop
        # through all types and if they are of a high-frequency type, add to list
        mistakes_to_halt = []
        for count in mistake_counts:
            mistake_set = mistakes[count]
            for mistake_tuple in mistake_set:
                mistakes_to_halt.append(mistake_tuple)
                msg("%d\t%s\twas really\t%s\n" % (count, mistake_tuple[0], \
                    mistake_tuple[1]))
        msg("\n")

        # create separators used when outputting missed word contexts
        sep_big = "---------------------------------------------------\n"
        sep_small = "\n-----------------------------------------\n"

        # loop through individual mistakes and, if they match the kind of error
        # we want to halt for, show the user the mistake as well as the sentence
        # context for both the gold-standard sentence and the hmm-tagged sentence
        response = None
        for missed_set in missed:
            if response not in ['q', 'Q']:
                (hmm_tagged_word, gold_tagged_word, hmm_tagged_sent, \
                    gold_tagged_sent) = missed_set
                should_halt = False
                # determine whether the current mistake matches a mistake type
                # we want to halt for
                for pair in mistakes_to_halt:
                    if hmm_tagged_word[1] == pair[0] and \
                        gold_tagged_word[1] == pair[1]:
                        should_halt = True
                if should_halt:
                    msg("%sTagged '%s' with %s when it should have been %s.%s" %\
                    (sep_big, hmm_tagged_word[0], hmm_tagged_word[1],\
                        gold_tagged_word[1], sep_small))

                    msg("Gold: " + (' '.join([(w[0] + "/" + w[1]) for w in \
                        gold_tagged_sent])))
                    msg(sep_small)
                    msg("Mine: " + (' '.join([(w[0] + "/" + w[1]) for w in \
                        hmm_tagged_sent])))

                    # get user input to decide whether to keep going
                    response = raw_input("\n\nEnter to continue, Q to quit: ")
Example #19
from load_data import get_df, select_columns
from itertools import combinations
from nltk import ConditionalFreqDist

# get the data as a dataframe
df = get_df(shortname='clean_apx')
mask_data_source = df['DataSource'] == 'APX'
df_select = df[mask_data_source]

# choose subset of columns and cast all values as string
df_select = df_select[select_columns].astype(str)

# choose a smaller subset of columns to analyse
report_columns = select_columns[1:5]

# a list of all pairwise combinations
combo_count = 2
groupby_columns = list(combinations(report_columns, combo_count))

# create a list of tuples
groupby_column = list(groupby_columns[0])
arr = df_select[list(groupby_column)].values
pairs = list(tuple(map(tuple, arr)))

# and now for the good stuff
cfd = ConditionalFreqDist(pairs)
conditions = cfd.conditions()

import pdb; pdb.set_trace()
        if token['lemma']:
            lemma_pos = token['lemma']+'.'+get_wordnet_pos(token['pos'])
            lemma_pairs.append((token['lemma'], short_tag))
            lemma_long_pairs.append((token['lemma'], long_tag))
        tagged_pairs.append((token['textlc'], short_tag))
    
    # Print vocabularies for each tag type
    for tag_type in tag_types:
        vocabulary_cfd = ConditionalFreqDist([(lemma, long_tag) for (lemma, long_tag) in lemma_long_pairs if long_tag == tag_type])
        print vocabulary_cfd.tabulate()
    
    #events_cfd = ConditionalFreqDist(tagged_pairs)
    # Conditional frequency distribution for (lemma, tag) pairs
    events_cfd = ConditionalFreqDist(lemma_pairs)
    
    unambiguous_words = [word for word in events_cfd.conditions() if len(events_cfd[word].items()) < 2]
    
    ambiguous_words = [word for word in events_cfd.conditions() if len(events_cfd[word].items()) > 1]
    
    print "Unambiguous Words"
    print events_cfd.tabulate(conditions=unambiguous_words)
    
    print "Ambiguous Words"
    print events_cfd.tabulate(conditions=ambiguous_words)
    
    sum_ambiguous_words = (sum(events_cfd[word].N() for word in ambiguous_words))
    sum_unambiguous_words = (sum(events_cfd[word].N() for word in unambiguous_words))
    
    total = sum_ambiguous_words + sum_unambiguous_words
    percentage = float(sum_ambiguous_words) / float(total)
 def find_word_matrices(self, newsidlist, processcontent=True, prepend="content"):
     dateroots = []
     datePOStag = []
     
     titleexclamation = [("newsid", "title_exclamation")]
     
     textPOStag = []
     textroots = [] 
     textrootsWpostag = []
     textliterals = []
     
     print prepend, " processing:"
     for newsid in newsidlist:
         print "newsid ",newsid
         filepath = extractnewsmetadata.newsid_to_filepath(newsid)
         content, title, date = extractnewsmetadata.get_news_article2(filepath)
         text = ""
         if processcontent:
             text = content
         else:
             text = title
             if "!" in title:
                 titleexclamation.append((newsid, 1))
             else:
                 titleexclamation.append((newsid, 0))
         
         words = texter.getwords(text)
         lemmata = SAKsParser.lemmatize_lexicon(words)
         for (literal, literalPOS, root, rootPOS) in lemmata:
             
             root = texter.cleanword(root)
             if (len(root) > 0) or (not root.isspace()):
                 #print root,
                 textPOStag.append((newsid, literalPOS))
                 textroots.append((newsid, root))
                 textrootsWpostag.append((newsid, root+" Wpostag "+rootPOS))
                 textliterals.append((newsid, literal+" Wpostag "+literalPOS))
                 dateroots.append((date, root))
                 datePOStag.append((date, literalPOS))
     
       
     cfd_dateroots = ConditionalFreqDist(dateroots)
     cfd_datepostag = ConditionalFreqDist(datePOStag)
     cfd_textpostag = ConditionalFreqDist(textPOStag)
     cfd_textroots = ConditionalFreqDist(textroots)
     cfd_textrootWpostag = ConditionalFreqDist(textrootsWpostag)
     cfd_textliterals = ConditionalFreqDist(textliterals)
     
     print "some id's", cfd_textroots.conditions()
     
     cfd_roottext = ConditionalFreqDist((word, docid) for docid in cfd_textroots.conditions()
                                        for word in list(cfd_textroots[docid])) 
             
     
     # cfd to csv -- TODO: fix so that the condition items become columns:
     csvpath = os.path.join(self.matrixpath, prepend+"-dateroot.csv")
     CFDhelpers.cfd_to_matrix(cfd_dateroots, csvpath)
     
     csvpath = os.path.join(self.matrixpath, prepend+"-datepostag.csv")
     CFDhelpers.cfd_to_matrix(cfd_datepostag, csvpath)
     
     csvpath = os.path.join(self.matrixpath, prepend+"-postagCOUNT.csv")
     CFDhelpers.cfd_to_matrix(cfd_textpostag, csvpath)
     
     termcountcsvpath = os.path.join(self.matrixpath, prepend+"termCOUNT.csv")
     CFDhelpers.cfd_to_matrix(cfd_textroots, termcountcsvpath)
     tfidfcsvpath = os.path.join(self.matrixpath, prepend+"termTFIDF.csv")
     texter.compute_tfidf_ondisc(termcountcsvpath, tfidfcsvpath)
             
     csvpath = os.path.join(self.matrixpath, prepend+"-rootcountindex.csv")
     CFDhelpers.cfd_to_matrix(cfd_roottext, csvpath)
     
     csvpath = os.path.join(self.matrixpath, prepend+"rootWpostagCOUNT.csv")
     CFDhelpers.cfd_to_matrix(cfd_textrootWpostag, csvpath)
     
     csvpath = os.path.join(self.matrixpath, prepend+"literalWpostagCOUNT.csv")
     CFDhelpers.cfd_to_matrix(cfd_textliterals, csvpath)
     
     
     # TODO: continue with the remaining CSV files
     
     if not processcontent:
         print "keep exclamation !"
         IOtools.tocsv_lst(titleexclamation, os.path.join(self.matrixpath, prepend+"-exclamation.csv"))
class HMMTagger(object):
    global START_TAG
    START_TAG = "<s>"
    global END_TAG
    END_TAG = "</s>"
    global UNK
    UNK = "UNK"

    def __init__(self, training_sents, n=2, smoothing=None):
        self.n = n
        self.smoothing = smoothing
        self.tagged_sents = self.addStartAndEndMarkers(
            training_sents)  # this takes a lot of time
        self.train()  # this takes almost 4 seconds

    def train(self):
        """ Construct the conditional frequencies and probabilities """
        #extract tags from sentences

        tags = [tag for (_, tag) in self.tagged_sents]
        self.replaceUnique()
        self.emission_frequencies = ConditionalFreqDist(
            [tup[::-1] for tup in self.tagged_sents])
        self.tagset_size = len(self.emission_frequencies.conditions())

        # emission - probability that a certain tag is a certain word
        # e.g. probability that a VB is 'race'
        self.emission_probabilities = ConditionalProbDist(
            self.emission_frequencies, MLEProbDist)
        self.transition_frequencies = ConditionalFreqDist(bigrams(tags))
        self.transition_probabilities = ConditionalProbDist(
            self.transition_frequencies, MLEProbDist)
        self.word_tag_frequencies = ConditionalFreqDist(self.tagged_sents)

    def replaceUnique(self):
        """ Replaces unique words with the UNK label """
        word_frequencies = FreqDist([word for (word, _) in self.tagged_sents])
        self.lexicon_size = len(word_frequencies)
        hap = set(word_frequencies.hapaxes())
        res = [(UNK, tag) if word in hap else (word, tag)
               for (word, tag) in self.tagged_sents]
        self.tagged_sents = res

    def addStartAndEndMarkers(self, training_sents):
        """ returns a flat list of tokens """
        res = []
        for sent in training_sents:
            res += [(START_TAG, START_TAG)]
            res += sent
            res += [(END_TAG, END_TAG)]
        return res

    def get_transition_probability(self, prev_tag, tag):
        """ Returns probability of prev_tag being followed by tag.
		 Performs smoothing if specified in the command line."""
        if self.smoothing == "LAP":
            prev_tag_count = self.transition_frequencies[prev_tag].N()
            bigram_count = self.transition_frequencies[prev_tag].freq(
                tag) * prev_tag_count
            return (bigram_count + 1) / (1.0 * prev_tag_count +
                                         self.lexicon_size)
        else:
            return self.transition_probabilities[prev_tag].prob(tag)

    def viterbi_col(self, word, prev=None):
        """ General algorithm for a viterbi table column.
		This is only called once for every word. """
        vit = {}
        back = {}
        for tag in self.word_tag_frequencies[word].keys():
            if tag != START_TAG:
                if prev:

                    best_prev_tag = self.get_prev_tag(tag, prev, word)
                    transition_prob = self.get_transition_probability(
                        best_prev_tag, tag)
                    vit[tag] = prev[
                        best_prev_tag] * transition_prob * self.emission_probabilities[
                            tag].prob(word)
                    back[tag] = best_prev_tag

                else:
                    transition_prob = self.get_transition_probability(
                        START_TAG, tag)
                    vit[tag] = transition_prob * self.emission_probabilities[
                        tag].prob(word)
                    back[tag] = START_TAG

        return (vit, back)

    def viterbi(self, words_to_tag):
        """ Viterbi algorithm """
        res = [
        ]  # a list of dicts denoting probability of best path to get to state q after scanning input up to pos i
        backpointers = []  # a list of dicts
        for wordindex in range(len(words_to_tag)):
            current_word = words_to_tag[wordindex]
            if self.is_unknown(current_word):
                current_word = UNK
            if wordindex == 0:
                vit, back = self.viterbi_col(current_word)
            else:
                vit, back = self.viterbi_col(current_word, res[-1])

            res.append(vit)
            backpointers.append(back)

        prev = res[-1]
        backpointers.reverse()
        return self.construct_solution(backpointers, prev)

    def is_unknown(self, word):
        """ Checks if the word is unknown """
        for tag in set(self.emission_probabilities.conditions()):
            pr = self.emission_probabilities[tag]
            if pr.prob(word) > 0:
                return False
        return True

    def construct_solution(self, back, prev):
        """ Constructs solution by following the back pointers on a ready viterbi table """
        current_best_tag = self.get_prev_tag(END_TAG, prev)
        best_seq = [END_TAG, current_best_tag]
        for p in back:
            to_append = p[current_best_tag]
            best_seq.append(to_append)
            current_best_tag = p[current_best_tag]
        best_seq.reverse()
        return best_seq

    def get_prev_tag(self, tag, prev, curr_word=None):
        """ Finds a previous tag A for the current tag B s.t. the probability of AB was the highest
		for the current word.
		Called for every word and every tag """
        best_prev = list(prev.keys())[0]  # assign something to avoid a None exception
        best_prob = 0.0
        for prevtag in prev.keys():
            # find the maximum probability
            prob = prev[prevtag] * self.transition_probabilities[prevtag].prob(
                tag)

            if curr_word:
                prob *= self.emission_probabilities[tag].prob(curr_word)

            if prob > best_prob:
                best_prob = prob
                best_prev = prevtag

        return best_prev

    def tag_sents(self, test_sents):
        """Tag the given text sentence by sentence"""
        res = []
        for sent in test_sents:
            res.append(self.viterbi(sent)[1:-1])  # remove start and end tags
        return res
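
A small usage sketch for the HMMTagger class above, using the Brown corpus as illustrative training data; it assumes the class and the NLTK names it references (ConditionalFreqDist, ConditionalProbDist, MLEProbDist, FreqDist, bigrams) are importable in the current module:

from nltk.corpus import brown

tagged = brown.tagged_sents(categories='news')
train_sents, test_sents = tagged[:3000], tagged[3000:3005]

tagger = HMMTagger(train_sents, n=2, smoothing="LAP")
word_sents = [[word for word, _ in sent] for sent in test_sents]
print(tagger.tag_sents(word_sents)[0])  # predicted tag sequence for the first held-out sentence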
Example #23
from nltk import ConditionalFreqDist
from nltk.corpus import brown

# cfd = ConditionalFreqDist(
#     (genre, word)
#     for genre in brown.categories()
#     for word in brown.words(categories=genre)
# )
# print(len(cfd))  # 15 (categories)

cfd = ConditionalFreqDist((genre, word) for genre in ['news', 'romance']
                          for word in brown.words(categories=genre))
print(cfd)  # 2 (categories)
print(cfd.conditions())
print(cfd['romance'])  # FreqDist with 8452 samples and 70022 outcomes
Example #24
#!/usr/bin/python
# coding: utf-8

# 2013/03/20

from nltk import ConditionalFreqDist

cfdist = ConditionalFreqDist(pairs)   # build a conditional frequency distribution from the (condition, event) pairs in pairs
cfdist.conditions()                   # the list of conditions
cfdist['condition']                   # the frequency distribution for the given condition
cfdist['condition'][sample]           # frequency of sample under the given condition
cfdist.tabulate()                     # tabulate the conditional frequency distribution
cfdist.tabulate(samples, conditions)  # tabulation limited to the given samples and conditions
cfdist.plot()                         # plot the conditional frequency distribution
cfdist.plot(samples, conditions)      # plot limited to the given samples and conditions
cfdist1 < cfdist2                     # test if samples in cfdist1 occur less frequently than in cfdist2


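
A minimal runnable illustration of the calls listed above, with made-up (condition, event) pairs:

pairs = [('a', 'x'), ('a', 'y'), ('a', 'x'), ('b', 'x')]
cfdist = ConditionalFreqDist(pairs)
print(cfdist.conditions())   # ['a', 'b']
print(cfdist['a'])           # <FreqDist with 2 samples and 3 outcomes>
print(cfdist['a']['x'])      # 2
cfdist.tabulate()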
Example #25
def pos_percentages(words, tag='NN'):
    cfd = ConditionalFreqDist((tag,1) for word,tag in  tagger.tag(words))
    relevant_tags = filter(lambda c: re.match(tag,c), cfd.conditions())
    sum_tags = sum([ cfd[c].N() for c in  relevant_tags ])
    return float(sum_tags)/float(len(words))
Example #26
print([word for word in sen_words if len(word) == 6
       ])  # ['没有物美价廉', '不会心平气和', '不是别出心裁', '不是结实耐用', ...]; these are whole negated phrases rather than single words
# sentiment scores, positive/negative
anls = [
    word for words in df[df.columns[4]] for word in words.split(';')
    if len(word)
]
print(len(anls))  # 45648
## combine sen_words and anls; pair the sentiment words with their scores and find words that received more than one label
print(sen_words[:10]
      )  # ['实惠', '快', '也好', '太长', '太贵', '不方便', '差', '无语', '满意', '好']
print(anls[:10])  # ['1', '1', '1', '-1', '-1', '-1', '-1', '-1', '1', '1']
con = ConditionalFreqDist(zip(sen_words, anls))
print(con)  # <ConditionalFreqDist with 3032 conditions>; identical keys are merged
print([
    condition for condition in con.conditions()
    if len(con[condition].keys()) > 1
])  # ['不容易', '不高']; only two words carry more than one sentiment value (-1, 0, 1)
## save theme, sentiment_word and anls to disk
with open('./tmp_dataset/BDCI2017-taiyi/theme.txt', 'w') as f:
    f.write('\n'.join(themes))
with open('./tmp_dataset/BDCI2017-taiyi/word.txt', 'w') as f:
    f.write('\n'.join(sen_words))
with open('./tmp_dataset/BDCI2017-taiyi/word_score.txt', 'w') as f:
    f.write('\n'.join(word + ' ' + anls
                      for word, anls in zip(sen_words, anls)))
##################################################################
## Part 2: data preprocessing; split the DataFrame into four lists and save each
# df = xlsx.parse("Sheet1")  # NaN was replaced with NULL above, so re-import here; turned out to be unnecessary -- just drop the NULL values when using the data
contents = [str(word) for word in list(df[df.columns[1]].values)]
print(contents[:10])
Example #27
# coding: utf-8
import nltk
from nltk import ConditionalFreqDist
from nltk.corpus import brown
from nltk.corpus import names
from nltk.corpus import inaugural
from nltk.corpus import toolbox
from nltk.corpus import udhr
##################################################################
## A simple ConditionalFreqDist application: text sentiment analysis
word = ['实惠', '快', '也好', '快', '也好']
anls = ['1', '1', '1', '-1', '1']
tmp_Con = ConditionalFreqDist(zip(word, anls))
print(tmp_Con)  # <ConditionalFreqDist with 3 conditions>; identical conditions are merged
print(tmp_Con.tabulate())
print(tmp_Con.conditions())  # ['实惠', '快', '也好']
print(tmp_Con['快'].most_common())  # [('1', 1), ('-1', 1)]
print(tmp_Con['快'].keys())  # dict_keys(['1', '-1'])
print(len(tmp_Con['快'].keys()))  # 2; shows how many distinct labels each word has
print(len(tmp_Con['也好'].keys()))  # 1; duplicates are collapsed, set()-style
print([
    condition for condition in tmp_Con.conditions()
    if len(tmp_Con[condition].keys()) > 1
])  # ['快']
tmp_Con.plot()
tmp_Con_1 = ConditionalFreqDist(zip(anls, word))
print(tmp_Con_1.conditions())  # ['1', '-1']
##################################################################
## Word-by-genre analysis of the Brown corpus
print(brown.categories())
Example #28
 def test_plot(self):
     empty = ConditionalFreqDist()
     self.assertEqual(empty.conditions(), [])
     empty.plot(conditions=["BUG"])  # nonexistent keys shouldn't be added
     self.assertEqual(empty.conditions(), [])
print(set([len(word) for word in themes]))  # {1, 2, 3, 4, 5, 6}; the longest theme has length 6
print([word for word in themes if len(word) == 6])  # ['iphone', 'iphone', 'iphone', 'iphone']; surprisingly these are in English
# sentiment keywords
sen_words = [word for words in df[df.columns[3]] for word in words.split(';') if len(word)]
print(len(sen_words))  # 45648
print(set([len(word) for word in sen_words]))  # {1, 2, 3, 4, 5, 6}; the longest word has length 6
print([word for word in sen_words if len(word) == 6])  # ['没有物美价廉', '不会心平气和', '不是别出心裁', '不是结实耐用', ...]; these are whole negated phrases rather than single words
# sentiment scores, positive/negative
anls = [word for words in df[df.columns[4]] for word in words.split(';') if len(word)]
print(len(anls))  # 45648
## combine sen_words and anls; pair the sentiment words with their scores and find words that received more than one label
print(sen_words[:10])  # ['实惠', '快', '也好', '太长', '太贵', '不方便', '差', '无语', '满意', '好']
print(anls[:10])  # ['1', '1', '1', '-1', '-1', '-1', '-1', '-1', '1', '1']
con = ConditionalFreqDist(zip(sen_words, anls))
print(con)  # <ConditionalFreqDist with 3032 conditions>; identical keys are merged
print([condition for condition in con.conditions() if len(con[condition].keys()) > 1])  # ['不容易', '不高']; only two words carry more than one sentiment value (-1, 0, 1)
## save theme, sentiment_word and anls to disk
with open('./tmp_dataset/BDCI2017-taiyi/theme.txt', 'w') as f: f.write('\n'.join(themes))
with open('./tmp_dataset/BDCI2017-taiyi/word.txt', 'w') as f: f.write('\n'.join(sen_words))
with open('./tmp_dataset/BDCI2017-taiyi/word_score.txt', 'w') as f: f.write('\n'.join(word + ' ' + anls for word, anls in zip(sen_words, anls)))
##################################################################
## Part 2: data preprocessing; split the DataFrame into four lists and save each
# df = xlsx.parse("Sheet1")  # NaN was replaced with NULL above, so re-import here; turned out to be unnecessary -- just drop the NULL values when using the data
contents = [str(word) for word in list(df[df.columns[1]].values)]; print(contents[:10])
themes = [str(word) for word in list(df[df.columns[2]].values)]; print(themes[:10])
words = [str(word) for word in list(df[df.columns[3]].values)]; print(words[:10])
anls = [str(word) for word in list(df[df.columns[4]].values)]; print(anls[:10])
print('len of contents:', len(contents))  # len of contents: 20000
print('len of words:', len(words))  # len of words: 20000
## add themes and words to the jieba segmentation dictionary
dict_themes = [word for line in themes for word in line.strip().split(';') if len(word) and word != 'NULL']
#!/usr/bin/python
#coding=utf-8

from nltk import ConditionalFreqDist
from nltk.corpus import brown

words = brown.tagged_words(tagset = 'universal')

# Which word has the largest number of distinct POS tags?
maximumTagNumber = 0
result = ''
cfd = ConditionalFreqDist((word.lower(), tag) for (word, tag) in words)
for word in cfd.conditions():
    if len(cfd[word]) > maximumTagNumber:
        maximumTagNumber = len(cfd[word])
        result = word + ' (' + ', '.join(tag for (tag, _) in cfd[word].most_common()) + ')'
    elif len(cfd[word]) == maximumTagNumber:
        result += '\n' + word + ' (' + ', '.join(tag for (tag, _) in cfd[word].most_common()) + ')'
print result
Example #31
from nltk import FreqDist, ConditionalFreqDist
from nltk.corpus import brown

fd = FreqDist()                # overall tag counts
cfd = ConditionalFreqDist()    # tag counts conditioned on the token

# for each tagged sentence in the corpus, get the (token, tag) pair and update
# both count(tag) and count(tag given token)
for sentence in brown.tagged_sents():
    for (token, tag) in sentence:
        fd[tag] += 1
        cfd[token][tag] += 1

# The most frequent tag is ...
print(fd.max())

# Initialize a list to hold (numtags,word) tuple
wordbins = []

# Append each (n(unique tags for token),token) tuple to list
for token in cfd.conditions():
    wordbins.append((cfd[token].B(), token))

# Sort tuples by number of unique tags (highest first)
wordbins.sort(reverse=True)

# The token with max. no. of tags is ...
print(wordbins[0])

# masculine pronouns
male = ['he', 'his', 'him', 'himself']

# feminine pronouns
female = ['she', 'hers', 'her', 'herself']

# initialize counters
Example #32
from scipy.sparse import lil_matrix,csr_matrix

fids = [reuters.fileids()[0]]
docs = [[nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(reuters.raw(fileids=[fid]))] for fid in fids]
word_list = sorted(set(word for doc in docs for sent in doc for word in sent))
word_dict = dict((word,i) for i,word in enumerate(word_list))
idx_docs = [[[word_dict[word] for word in sent] for sent in doc]for doc in docs]
trigram_docs = [[discount.ngrams2(sent,3) for sent in doc] for doc in idx_docs]
tri_fd = CFreqDist(gram for doc in trigram_docs for sent in doc for gram in sent)
l = len(word_list)

# maximum likelihood estimation (illustrative): P(word | cond)
float(tri_fd[cond][word]) / tri_fd[cond].N()

A = lil_matrix(((l+1)**2,l+1))
for cond in tri_fd.conditions():
    n = float(tri_fd[cond].N())
    for word,val in tri_fd[cond].items():
        A[cond[0]*(l+1)+cond[1],word] = val/ n

A = lil_matrix(((l+1)**2,l+1),dtype=int)
for cond in tri_fd.conditions():
    for word,val in tri_fd[cond].items():
        A[cond[0]*(l+1)+cond[1],word] = val

# maximum likelihood estimation
B = A.tocsr()
B.toarray().astype(float)/B.sum(1)

# additive smoothing
a=0.5
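
A sketch of how the additive smoothing started above might continue, reusing the raw-count matrix B; the add-a estimate (count + a) / (row total + a * V), with V = l + 1 columns, is the standard formula, and this continuation is not from the original source:

# add-a smoothed trigram probabilities from the raw counts in B
smoothed = (B.toarray().astype(float) + a) / (B.sum(1) + a * (l + 1))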
Example #33
 def inspect(self, missed):
     """
     Inspect a testing session, and print data about tag accuracy
     
     :param missed: list of tuples of missed tags like:
         (hmm_tagged_word, gold_tagged_word, hmm_context, gold_context)
     """
     
     # create a CFD so we can examine a matrix of incorrect vs correct tags
     # ms[1][1] = tag of a gold_tagged_word
     # ms[0][1] = tag of an hmm_tagged_word
     cfd = ConditionalFreqDist((ms[1][1], ms[0][1]) for ms in missed)
     
     # initialize a hash to store mistakes by frequency
     mistakes = {}
     
     # print a table showing mistake frequency
     cfd.tabulate()
     msg("\n")
     
     # loop through mistake frequencies by gold standard tag, i.e., if we are
     # examining gold-standard 'IN', count what we incorrectly tagged it as
     conds = cfd.conditions()
     for g_tag in conds:
         for hmm_tag in cfd[g_tag].keys():
             # how many times did we incorrectly say g_tag was hmm_tag?
             count = cfd[g_tag][hmm_tag]
             
             # add these mistakes to the count
             if count not in mistakes.keys():
                 mistakes[count] = []
             mistakes[count].append((hmm_tag, g_tag))
             
     # get a list of all mistake types that occurred over a threshold, worst first
     mistake_counts = set([count for (count, mistake_set) in \
         mistakes.iteritems() if count > Tagger.mistake_threshold])
     mistake_counts = reversed(sorted(mistake_counts))
     
     # now create a list of mistake types to show the user, i.e., loop 
     # through all types and if they are of a high-frequency type, add to list
     mistakes_to_halt = []
     for count in mistake_counts:
         mistake_set = mistakes[count]
         for mistake_tuple in mistake_set:
             mistakes_to_halt.append(mistake_tuple)
             msg("%d\t%s\twas really\t%s\n" % (count, mistake_tuple[0], \
                 mistake_tuple[1]))
     msg("\n")
     
     # create separators used when outputting missed word contexts
     sep_big = "---------------------------------------------------\n"
     sep_small = "\n-----------------------------------------\n"
     
     # loop through individual mistakes and, if they match the kind of error
     # we want to halt for, show the user the mistake as well as the sentence
     # context for both the gold-standard sentence and the hmm-tagged sentence
     response = None
     for missed_set in missed:
         if response not in ['q','Q']:
             (hmm_tagged_word, gold_tagged_word, hmm_tagged_sent, \
                 gold_tagged_sent) = missed_set
             should_halt = False
             # determine whether the current mistake matches a mistake type
             # we want to halt for
             for pair in mistakes_to_halt:
                 if hmm_tagged_word[1] == pair[0] and \
                     gold_tagged_word[1] == pair[1]:
                     should_halt = True
             if should_halt:
                 msg("%sTagged '%s' with %s when it should have been %s.%s" %\
                 (sep_big, hmm_tagged_word[0], hmm_tagged_word[1],\
                     gold_tagged_word[1], sep_small))
                 
                 msg("Gold: " + (' '.join([(w[0] + "/" + w[1]) for w in \
                     gold_tagged_sent])))
                 msg(sep_small)
                 msg("Mine: " + (' '.join([(w[0] + "/" + w[1]) for w in \
                     hmm_tagged_sent])))
                 
                 # get user input to decide whether to keep going
                 response = raw_input("\n\nEnter to continue, Q to quit: ")
# coding: utf-8
import nltk
from nltk import ConditionalFreqDist
from nltk.corpus import brown
from nltk.corpus import names
from nltk.corpus import inaugural
from nltk.corpus import toolbox
from nltk.corpus import udhr
##################################################################
## A simple ConditionalFreqDist application: text sentiment analysis
word = ['实惠', '快', '也好', '快', '也好']
anls = ['1', '1', '1', '-1', '1']
tmp_Con = ConditionalFreqDist(zip(word, anls))
print(tmp_Con)  # <ConditionalFreqDist with 3 conditions>; identical conditions are merged
print(tmp_Con.tabulate())
print(tmp_Con.conditions())  # ['实惠', '快', '也好']
print(tmp_Con['快'].most_common())  # [('1', 1), ('-1', 1)]
print(tmp_Con['快'].keys())  # dict_keys(['1', '-1'])
print(len(tmp_Con['快'].keys()))  # 2; shows how many distinct labels each word has
print(len(tmp_Con['也好'].keys()))  # 1; duplicates are collapsed, set()-style
print([condition for condition in tmp_Con.conditions() if len(tmp_Con[condition].keys()) > 1])  # ['快']
tmp_Con.plot()
tmp_Con_1 = ConditionalFreqDist(zip(anls, word))
print(tmp_Con_1.conditions())  # ['1', '-1']
##################################################################
## Word-by-genre analysis of the Brown corpus
print(brown.categories())  # ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
cfd = nltk.ConditionalFreqDist((genre, word) for genre in brown.categories() for word in brown.words(categories=genre))  # the categories=genre argument cannot be omitted here
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']  # picked from brown.categories()
modals = ['can', 'could', 'may', 'might', 'must', 'will']  # a few arbitrarily chosen modal verbs
print(cfd.tabulate(conditions=genres, samples=modals))  # Observe that the most frequent modal in the news genre is will, while the most frequent modal in the romance genre is could