Example No. 1
    def buildTransitionMatrix(self, tagged_corpus: list, train_size):
        train = tagged_corpus[:int(train_size * len(tagged_corpus))]
        random.shuffle(train)
        #construction of the transition matrix
        transition = ConditionalFreqDist()
        for (tag1, tag2) in train:

            if tag1 not in transition:
                transition[tag1] = FreqDist()
            if tag2 not in transition[tag1]:
                transition[tag1][tag2] = 0.0

            transition[tag1][tag2] += 1

        for tag in transition.keys():
            somme = 0.0
            for value in transition[tag].values():
                somme += value
            for successor in transition[tag].keys():
                transition[tag][successor] = round(transition[tag][successor] / somme, 6)

        self.TRANSITION_MATRIX = transition
        return transition
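The method consumes (tag1, tag2) pairs directly, so its argument is presumably a sequence of consecutive tag bigrams rather than (word, tag) tuples. A minimal, hypothetical call (class name and data invented for illustration):

tag_bigrams = [('DT', 'NN'), ('NN', 'VB'), ('DT', 'JJ'), ('JJ', 'NN')]
tagger = HmmTagger()                                   # hypothetical class defining the method above
matrix = tagger.buildTransitionMatrix(tag_bigrams, train_size=0.75)
print(matrix['DT']['NN'])                              # estimated probability of NN following DT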
Example No. 2
def train_model():
    """Create ngram model from Project Gutenberg texts"""
    text = ''
    for corpus in CORPORA:
        with open(corpus, 'r') as file_:
            text += file_.read().replace('\n', '')

    sents = sent_tokenize(text.lower())
    tokens = []
    # prepend a START token and append an END token to each sentence
    for sent in sents:
        sent = 'START ' + sent + ' END'
        tokens += word_tokenize(sent)

    ngrams_ = tuple(ngrams(tokens, N_VAL))

    # bigram frequency distribution
    bi_cfdist = ConditionalFreqDist((ngram[0], ngram[:2]) for ngram in ngrams_)

    # bigram probability distribution
    bi_cpdist = ConditionalProbDist(bi_cfdist, LaplaceProbDist)

    # conditional frequency distribution
    cfdist = ConditionalFreqDist(
        (ngram[:N_MINUS1], ngram) for ngram in ngrams_)

    # conditional probability
    cpdist = ConditionalProbDist(cfdist, LaplaceProbDist)

    return bi_cpdist, cpdist
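Assuming the module constants are N_VAL = 3 and N_MINUS1 = 2 (they are not shown in the excerpt), the returned distributions would be queried roughly like this:

bi_cpdist, cpdist = train_model()
# P(bigram | its first word), Laplace-smoothed
print(bi_cpdist['the'].prob(('the', 'project')))
# P(trigram | its first two words)
print(cpdist[('the', 'project')].prob(('the', 'project', 'gutenberg')))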
Example No. 3
    def constructTransitionMatrix(self, sourceFilesList: list):
        # construction of the transition matrix
        # accumulate the tagged lines of every source file (resetting per file
        # would silently keep only the last file)
        fileFinal = ""
        for fileName in sourceFilesList:
            with open(fileName, 'r', encoding="windows-1256") as file:
                for line in file:
                    line = line.upper()
                    if len(line) > 1:
                        if not line.startswith("<S>"):
                            fileFinal += '<S> ' + line[:-1] + ' <E>\n'
                        else:
                            fileFinal += line[:-1] + '\n'

        tokens = [el for el in re.split(r"[\s\n]+", fileFinal) if el != '']
        self.initialProbabilities = FreqDist([
            tokens[i] for i in range(1, len(tokens)) if tokens[i - 1] == '<S>'
        ])

        self.tags = list(set(tokens))
        self.bigramDist = FreqDist(list(bigrams(tokens)))
        Trigrams = list(trigrams(tokens))
        cfd = ConditionalFreqDist(((el[2], (el[0], el[1])) for el in Trigrams))

        for word in cfd.conditions():
            for bigram in cfd[word]:
                cfd[word][bigram] = round(
                    float("{0:.6f}".format(cfd[word].freq(bigram))), 6)

        self.TRANSITION_MATRIX = cfd
        return cfd
Example No. 4
    def __init__(self, fileid):
        try:
            # Reads the UDHR file
            corpus = udhr.raw(fileid)
        except:
            print("UDHR language file " + fileid + " does not exist",
                  file=sys.stderr)
            sys.exit(1)

        # Generate training dataset, lowercase and newlines converted to space
        self.train = re.sub(r'[\n]+', ' ', corpus[0:1000].strip().lower())
        # Generate dev dataset
        self.dev = corpus[1000:1100]

        # Convert training words to single characters
        tokens = list(self.train)
        self.unigram = tokens
        self.bigram = list(nltk.bigrams(tokens))
        self.trigram = list(nltk.trigrams(tokens))
        # Generate unigram frequency distribution
        self.unigramFreq = FreqDist(self.unigram)
        # Generate bigram frequency distribution
        self.bigramFreq = ConditionalFreqDist(self.bigram)
        # Generate trigram frequency distribution
        self.trigramFreq = ConditionalFreqDist(
            list(((w0, w1), w2) for w0, w1, w2 in self.trigram))
    def build_top_words(self):
        pos_reviews = [(review, c) for (review, c) in self.documents
                       if c == 'pos']
        neg_reviews = [(review, c) for (review, c) in self.documents
                       if c == 'neg']

        pos_words = [token for (review, c) in pos_reviews for token in review]
        neg_words = [token for (review, c) in neg_reviews for token in review]

        fd_all = FreqDist(pos_words + neg_words)
        pos_class_words = [('pos', word) for word in pos_words]
        neg_class_words = [('neg', word) for word in neg_words]
        cfd_pos = ConditionalFreqDist(pos_class_words)
        cfd_neg = ConditionalFreqDist(neg_class_words)

        pos_word_count = len(pos_words)
        neg_word_count = len(neg_words)
        total_word_count = pos_word_count + neg_word_count

        word_scores = {}

        for (word, freq) in fd_all.items():
            pos_score = BigramAssocMeasures.chi_sq(cfd_pos['pos'][word],
                                                   (freq, pos_word_count),
                                                   total_word_count)
            neg_score = BigramAssocMeasures.chi_sq(cfd_neg['neg'][word],
                                                   (freq, neg_word_count),
                                                   total_word_count)
            word_scores[word] = pos_score + neg_score

        best = sorted(word_scores.items(), reverse=True,
                      key=lambda x: x[1])[:1000]
        self.top_words = set([w for w, s in best])
Example No. 6
def findBestWords(wordsInCategories, scoreFunction=BigramAssocMeasures.chi_sq, max_words=1000):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for category, words in wordsInCategories:
        word_fd.update(words)
        label_word_fd[category].update(words)

    word_counts = {}
    for condition in label_word_fd.conditions():
        word_counts[condition] = label_word_fd[condition].N()

    total_word_count = 0
    for condition, count in word_counts.items():
        total_word_count += count

    word_scores = {}

    for word, freq in word_fd.items():
        score = 0
        for condition, count in word_counts.items():
            score += scoreFunction(label_word_fd[condition][word], (freq, word_counts[condition]), total_word_count)
        word_scores[word] = score

    best = sorted(word_scores.items(), key=lambda t: t[1], reverse=True)[:max_words]
    return set([w for w, s in best])
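A hedged usage sketch: wordsInCategories is an iterable of (label, token_list) pairs, so a toy call (data invented for illustration) could be:

words_in_categories = [
    ('pos', ['great', 'fun', 'great', 'excellent']),
    ('neg', ['boring', 'awful', 'dull', 'awful']),
]
top_words = findBestWords(words_in_categories, max_words=3)
print(top_words)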
def visualize_monthly_news_stats2(csvfolder=metacorpus.statspath, csvname=metacorpus.prunedmetafilename,
                                 imgoutpath=metacorpus.imgfolder,
                                 rescatmap=metacorpus.resourcecategorymap2):
    colldf = IOtools.readcsv(csvfolder+os.sep+csvname)
    
    numoftexts, _ = colldf.values.shape
    
    
    # daily news counts for resources
    cfddailyresourcecount = ConditionalFreqDist((colldf.loc[i,"date"], colldf.loc[i,"resource"].strip()) for i in range(numoftexts))
    CFDhelpers.cfd2csv(cfddailyresourcecount, csvfolder+os.sep+"cfddailyresourcecount2.csv", colnames=['date','resource','count'])
    #cfdresourcesdaycount = ConditionalFreqDist((resource, day) for day in cfddailyresourcecount.conditions() for resource in list(cfddailyresourcecount[day]))
    
    
    # daily news counts for categories
    cfddailycategorycount = ConditionalFreqDist((colldf.loc[i,"date"], 
                                                 "_".join(map(lambda x : str(x).strip(), [colldf.loc[i, "resource"], colldf.loc[i, "category"]]))) for i in range(numoftexts)) 
    CFDhelpers.cfd2csv(cfddailycategorycount, csvfolder+os.sep+"cfddailycategorycount2.csv", ["date", "category", 'count'])
    #cfdcatsdaycount = ConditionalFreqDist((category, date) for date in cfddailycategorycount.conditions() for category in list(cfddailycategorycount[date]))

    
    
    # visualize monthly   --- assuming the dates are of the form yyyy-mm-dd -we did it so while recording
    
    cfdmonthlyresourcecount = ConditionalFreqDist((colldf.loc[i,"date"][:-3], colldf.loc[i,"resource"].strip()) for i in range(numoftexts))
    CFDhelpers.cfd2csv(cfdmonthlyresourcecount, csvfolder+os.sep+"cfdmonthlyresourcecount.csv", colnames=['month','resource','count'])
    #cfdresourcesmonthcount = ConditionalFreqDist((resource, month) for month in cfdmonthlyresourcecount.conditions() for resource in list(cfdmonthlyresourcecount[month]))
    imgpath = IOtools.ensure_dir(os.path.join(imgoutpath, "resourcebasednewscount"))
    visualize_monthly_cfd(cfd=cfdmonthlyresourcecount, figuretitle="Monthly news count for each resource", ylabel="news published", imgoutpath=imgpath)



    
    cfdmonthlycategorycount = ConditionalFreqDist((colldf.loc[i,"date"][:-3], 
                                                   "-".join(map(lambda x : str(x).strip(), [colldf.loc[i, "resource"], colldf.loc[i, "category"]]))) 
                                                  for i in range(numoftexts)) 
    CFDhelpers.cfd2csv(cfdmonthlycategorycount, csvfolder+os.sep+"cfdmonthlycategorycount.csv", ["month", "category", 'count'])
    #cfdcatsmonthcount = ConditionalFreqDist((category, month) for month in cfdmonthlycategorycount.conditions() for category in list(cfdmonthlycategorycount[month]))
    
    imgpath = IOtools.ensure_dir(os.path.join(imgoutpath, "categorybasednewscount"))
    for canoniccatname, rescatnamedct in rescatmap.iteritems():
        monthresourcepairs = []
        
        for resourcename, origcats in rescatnamedct.iteritems(): 
        
            for origcatname in origcats:
                #resourcename = rescat.split("-")[0]
                rescat = "-".join([resourcename, origcatname])
                for month in cfdmonthlycategorycount.conditions():
                    numofoccurrences = cfdmonthlycategorycount[month][rescat]
                    #print resourcename," had ",numofoccurrences," times texts in :",rescat," during ",month
                    for i in range(numofoccurrences):
                        monthresourcepairs.append((month, resourcename))
                        
        cfdmonthlyresourcecount_percat = ConditionalFreqDist(monthresourcepairs) 
            
        print canoniccatname,resourcename," * ",rescat," : ",len(cfdmonthlyresourcecount_percat.conditions()),"  ",cfdmonthlyresourcecount_percat.N()
        figuretitle = "Monthly news count of each resource over category "+canoniccatname.upper()
        visualize_monthly_cfd(cfdmonthlyresourcecount_percat, figuretitle, ylabel="news published", imgoutpath=imgpath)
Example No. 8
    def __init__(self, corpus):
        """Initializer of the BigramWordCandidateProvider.

        Args:
            corpus: An iterable of word strings.
        """
        _bigrams = bigrams(corpus)
        self._cfd = ConditionalFreqDist(_bigrams)
    def calculate_vector_spaces(self,k=16):
        cfd = ConditionalFreqDist(
                   (word, doc['document'])
                   for doc in self.mongo[CORPUS_CLN].find()
                   for word in self.interestingWords(doc['document']))
        cfd.tabulate()

        # matrix dimensions
        terms = [c for c in cfd.conditions()] # conditions = words
        docs  = sorted(set(v for c in cfd.conditions() for v in cfd[c]))
        self.log("terms: %s"%str(terms))
        self.log("docs: %s"%str(docs))
        term_by_doc_mat = np.zeros(shape=(len(terms),len(docs)))
        self.log("Term-by-ref-document matrix shape is: %d X %d"%(len(terms),len(docs)))
        for i, term in enumerate(terms):
            li = np.array([cfd[term][doc] for doc in docs])
            term_by_doc_mat[i] = li
        self.log("Matrix\n%s"%str(term_by_doc_mat))

        # perform singular value decomposition
        u,sigma,vh = self._do_svd(term_by_doc_mat,k) 
        del term_by_doc_mat # don't need the matrix anymore

        # map terms to svd space
        terms_space = np.zeros(shape=(len(terms),k))
        for i in xrange(len(terms)):
            vals = [u[i][j] * sigma[j] for j in range(k)] # x-coord = row i, column 1
            terms_space[i] = np.array(vals)

        # map docs to svd space
        docs_space = np.zeros(shape=(len(docs),k))
        for i in xrange(len(docs)):
            vals = [ vh[i][j] * sigma[j] for j in range(k)]
            docs_space[i] = np.array(vals)

        # store matrix data
        row = self.mongo['data'].find_one()
        if not row:
            row = {'terms': terms, 
                   'documents':docs,
                   'terms_subspace':terms_space.tolist(),
                   'docs_subspace':docs_space.tolist(),
                   'u':u.tolist(),
                   'sigma':sigma.tolist(),
                   'vh':vh.tolist(),
                   'date':datetime.utcnow()}
        else:
            row['terms'] = terms
            row['documents'] = docs
            row['terms_subspace'] = terms_space.tolist()
            row['docs_subspace'] = docs_space.tolist()
            row['u'] = u.tolist()
            row['sigma'] = sigma.tolist()
            row['vh'] = vh.tolist()
            row['date'] = datetime.utcnow()

        self.mongo['data'].save(row)
        self.log("Saved matrix data")
Example No. 10
 def __init__(self, n, training_data):
     """Create an n order model using training_data."""
     # Set n and train
     self._n = n
     train_ngrams = _make_ngram_tuples(training_data, self._n)
     self._cfd = ConditionalFreqDist(
         (context, event) for (context, event) in train_ngrams)
     self._estimators = dict((context, self._cfd[context])
                             for context in self._cfd.conditions())
Example No. 11
    def conditional_freq(self):
        result = []
        cfd = ConditionalFreqDist(self.bigram_list)

        for key, values in cfd.items():
            for word, freq in values.items():
                result.append((key, word, freq))

        return result
Example No. 12
    def tabulateWordsInAllGeners(self, theWords):
        """
		find the distribution of a word within all Brown corpus genres
		@params theWord: the word/list of words to find info about
		"""
        cdf = ConditionalFreqDist((genre, word)
                                  for genre in brown.categories()
                                  for word in brown.words(categories=genre))
        cdf.tabulate(samples=theWords, conditions=brown.categories())
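As a usage sketch (the surrounding class is not shown, so the instance below is illustrative), passing a few modal verbs prints a genre-by-word count table:

# hypothetical instance of the class that defines the method above
analyzer = BrownAnalyzer()
# prints one row per Brown genre and one column per queried word
analyzer.tabulateWordsInAllGeners(['can', 'could', 'may', 'might', 'must', 'will'])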
Example No. 13
    def _train(self, tagged_corpus, cutoff=0, verbose=False):
        """
        Initialize this C{ContextTagger}'s L{_context_to_tag} table
        based on the given training data.  In particular, for each
        context C{I{c}} in the training data, set
        C{_context_to_tag[I{c}]} to the most frequent tag for that
        context.  However, exclude any contexts that are already
        tagged perfectly by the backoff tagger(s).

        The old value of C{self._context_to_tag} (if any) is discarded.

        @param tagged_corpus: A tagged corpus.  Each item should be
            a C{list} of C{(word, tag)} tuples.
        @param cutoff: If the most likely tag for a context occurs
            fewer than C{cutoff} times, then exclude it from the
            context-to-tag table for the new tagger.
        """

        token_count = hit_count = 0

        # A context is considered 'useful' if it's not already tagged
        # perfectly by the backoff tagger.
        useful_contexts = set()
        
        # Count how many times each tag occurs in each context.
        fd = ConditionalFreqDist()
        for sentence in tagged_corpus:
            tokens, tags = zip(*sentence)
            for index, (token, tag) in enumerate(sentence):
                # Record the event.
                token_count += 1
                context = self.context(tokens, index, tags[:index])
                if context is None: continue
                fd[context].inc(tag)
                # If the backoff got it wrong, this context is useful:
                if (self.backoff is None or
                    tag != self.backoff.tag_one(tokens, index, tags[:index])):
                    useful_contexts.add(context)

        # Build the context_to_tag table -- for each context, figure
        # out what the most likely tag is.  Only include contexts that
        # we've seen at least `cutoff` times.
        for context in useful_contexts:
            best_tag = fd[context].max()
            hits = fd[context][best_tag]
            if hits > cutoff:
                self._context_to_tag[context] = best_tag
                hit_count += hits

        # Display some stats, if requested.
        if verbose:
            size = len(self._context_to_tag)
            backoff = 100 - (hit_count * 100.0)/ token_count
            pruning = 100 - (size * 100.0) / len(fd.conditions())
            print "[Trained Unigram tagger:",
            print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
                size, backoff, pruning)
Example No. 14
    def freq_dist_words(self):
        from nltk import ConditionalFreqDist
        from nltk.model import NgramModel
        categories = ['rev_neg.pos','rev_pos.pos']
        cfd = ConditionalFreqDist((category, word) for category in categories for word in c.ngrams(c.reader.words(category)))       
        genres = ['rev_neg.pos', 'rev_pos.pos']
        modals = ['location','room','size','staff','excellent','poor','good','bad']

        print 'neg :', cfd['rev_neg.pos']
        print 'pos :', cfd['rev_pos.pos']
Example No. 15
    def train_supervised(self, labelled_sequences, **kwargs):
        """
        Supervised training maximising the joint probability of the symbol and
        state sequences. This is done via collecting frequencies of
        transitions between states, symbol observations while within each
        state and which states start a sentence. These frequency distributions
        are then normalised into probability estimates, which can be
        smoothed if desired.

        @return: the trained model
        @rtype: HiddenMarkovModelTagger
        @param labelled_sequences: the training data, a set of
            labelled sequences of observations
        @type labelled_sequences: list
        @param kwargs: may include an 'estimator' parameter, a function taking
            a C{FreqDist} and a number of bins and returning a C{ProbDistI};
            otherwise a MLE estimate is used
        """

        # default to the MLE estimate
        estimator = kwargs.get('estimator')
        if estimator == None:
            estimator = lambda fdist, bins: MLEProbDist(fdist)

        # count occurrences of starting states, transitions out of each state
        # and output symbols observed in each state
        starting = FreqDist()
        transitions = ConditionalFreqDist()
        outputs = ConditionalFreqDist()
        for sequence in labelled_sequences:
            lasts = None
            for token in sequence:
                state = token[_TAG]
                symbol = token[_TEXT]
                if lasts == None:
                    starting.inc(state)
                else:
                    transitions[lasts].inc(state)
                outputs[state].inc(symbol)
                lasts = state

                # update the state and symbol lists
                if state not in self._states:
                    self._states.append(state)
                if symbol not in self._symbols:
                    self._symbols.append(symbol)

        # create probability distributions (with smoothing)
        N = len(self._states)
        pi = estimator(starting, N)
        A = ConditionalProbDist(transitions, estimator, False, N)
        B = ConditionalProbDist(outputs, estimator, False, len(self._symbols))
                               
        return HiddenMarkovModelTagger(self._symbols, self._states, A, B, pi)
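This method sits on NLTK's HMM trainer; a hedged sketch of how it is typically driven through the public API (the corpus and estimator below are illustrative choices, not taken from the excerpt):

from nltk.corpus import treebank
from nltk.probability import LidstoneProbDist
from nltk.tag.hmm import HiddenMarkovModelTrainer

train_sents = treebank.tagged_sents()[:3000]
trainer = HiddenMarkovModelTrainer()
hmm_tagger = trainer.train_supervised(
    train_sents,
    estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))
print(hmm_tagger.tag(['The', 'market', 'rallied', 'today', '.']))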
Example No. 16
 def __init__(self, file):
     corpus = udhr.raw(file)
     self.training_set = corpus[0:1000]
     token = list(self.training_set)
     self.unigram = token
     self.bigram = list(nltk.bigrams(token))
     self.trigram = list(nltk.trigrams(token))
     self.unigram_frequency = FreqDist(self.unigram)
     self.bigram_frequency = ConditionalFreqDist(self.bigram)
     self.trigram_frequency = ConditionalFreqDist(
         list(((x, y), z) for x, y, z in self.trigram))
Example No. 17
    def tabulateWordsInPeriods(self, theWords):
        """
		find the distribution of words across the years, based on the Inaugural corpus
		@param theWords: the word/list of words to find info about
		"""
        cdf = ConditionalFreqDist((textid[:4], target)
                                  for textid in inaugural.fileids()
                                  for word in inaugural.words(textid)
                                  for target in theWords
                                  if word.lower().startswith(target)
                                  or word.lower().endswith(target))
        cdf.tabulate()
Example No. 18
def subword_char_ngram(text_fileid_map, n):
    corpus_ngramitems = []
    for tid, text in text_fileid_map.iteritems():
        words = text.split()
        ngramitems = []
        for w in words:
            ngramitems.extend(ngrams(w, n))
        for ngramitem in ngramitems:
            corpus_ngramitems.append((tid, ngramitem))
    cfd = ConditionalFreqDist(corpus_ngramitems)
    print cfd.N()," ",len(cfd.conditions())
    return cfd
    def __init__(self, corpora):

        corpus = udhr.raw(corpora)

        self.TrainingSet = corpus[0:1000]
        token = list(self.TrainingSet)

        self.Uni = token
        self.Bi = list(nltk.bigrams(token))
        self.Tri = list(nltk.trigrams(token))

        self.UniFreq = FreqDist(self.Uni)
        self.BiFreq = ConditionalFreqDist(self.Bi)
        self.TriFreq = ConditionalFreqDist(
            list(((w1, w2), w3) for w1, w2, w3 in self.Tri))
Example No. 20
    def postags(self,
                pos=None,
                sort=False,
                top=0,
                universal_tagset=False,
                ret_cond=False):
        '''Builds frequency dictionaries, or frequency-sorted lists, of
        parts of speech'''
        def merge(tags):
            result = FreqDist()
            for tag in tags:
                result += cfd[tag]
            return result

        maps = {
            'NOUN': {'NN', 'NNS', 'NNP', 'NNPS'},
            'VERB': {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'},
            'ADJ': {'JJ', 'JJR', 'JJS'},
            'ADV': {'RB', 'RBR', 'RBS'},
        }

        cfd = ConditionalFreqDist()

        for sent in self._sents:
            #tokens = sent.untagging()
            tokens = sent.tags
            for tok, tag, lemma in tokens:
                cfd[tag][lemma.lower()] += 1
        cond = cfd.conditions()

        result = cfd

        if pos:
            if not universal_tagset and pos in maps:
                result = merge(maps[pos])
            else:
                result = cfd[pos]

        if top:
            result = _top(result, top)
        else:
            result = _sort(result, sort)

        if ret_cond:
            result = result, cond

        return result
Example No. 21
 def __init__(self, sentences):
     # FIXME should use smoothing here. I tried SimpleGoodTuringProbDist but
     # it returns zero probability for event with freq=1. Possibly due to
     # too small test corpus
     self.cfd = ConditionalFreqDist(
         (ngram[:-1], ngram[-1]) for sentence in sentences
         for ngram in ngrams(sentence, 3, pad_left=True))
Example No. 22
 def __init__(self, n, training_data):
     """Create an n order model using training_data."""
     # Set n and train
     self._n = n
     train_ngrams = _make_ngram_tuples(training_data, self._n)
     self._cfd = ConditionalFreqDist((context, event) for (context, event) in train_ngrams)
     self._estimators = dict((context, self._cfd[context]) for context in self._cfd.conditions())
Example No. 23
class NgramModel(object):
    """A simple N-gram model."""

    def __init__(self, n, training_data):
        """Create an n order model using training_data."""
        # Set n and train
        self._n = n
        train_ngrams = _make_ngram_tuples(training_data, self._n)
        self._cfd = ConditionalFreqDist((context, event) for (context, event) in train_ngrams)
        self._estimators = dict((context, self._cfd[context]) for context in self._cfd.conditions())

    def prob(self, event, context):
        """Return the probability for an event in the provided context"""
        context = tuple(context)
        try:
            return self._estimators[context].freq(event)
        except KeyError:
            return 0.0

    def seqprob(self, seq):
        """Return the probability of a sequence."""
        prob = 1.0
        for context, event in _make_ngram_tuples(seq, self._n):
            prob *= self.prob(event, context)
        return prob

    def allngrams(self):
        """Return all N-grams observed by the model and their probabilities."""
        ngram_probs = (
            (event, context, self.prob(event, context)) for context, dist in self._estimators.items() for event in dist
        )
        return sorted(ngram_probs, key=itemgetter(1))
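The helper _make_ngram_tuples is referenced but not included in the listing; a minimal sketch of what it presumably returns (left-padded (context, event) pairs), plus a toy run of the class, could look like this:

from operator import itemgetter
from nltk import ngrams

def _make_ngram_tuples(tokens, n):
    # hypothetical helper: one (context, event) pair per order-n ngram,
    # where context is the (n-1)-token tuple preceding each token
    return [(gram[:-1], gram[-1])
            for gram in ngrams(tokens, n, pad_left=True, left_pad_symbol='<s>')]

model = NgramModel(2, 'the cat sat on the mat'.split())
print(model.prob('cat', ('the',)))   # relative frequency of 'cat' after 'the'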
Example No. 24
 def get_bigrams(self, text):
     list_bigrams = bigrams(text)
     cfd = ConditionalFreqDist(list_bigrams)
     result = []
     for i in cfd:
         result.append(cfd[i])
     return result
Example No. 25
class NgramModel(object):
    """A simple N-gram model."""
    def __init__(self, n, training_data):
        """Create an n order model using training_data."""
        # Set n and train
        self._n = n
        train_ngrams = _make_ngram_tuples(training_data, self._n)
        self._cfd = ConditionalFreqDist(
            (context, event) for (context, event) in train_ngrams)
        self._estimators = dict((context, self._cfd[context])
                                for context in self._cfd.conditions())

    def prob(self, event, context):
        """Return the probability for an event in the provided context"""
        context = tuple(context)
        try:
            return self._estimators[context].freq(event)
        except KeyError:
            return 0.0

    def seqprob(self, seq):
        """Return the probability of a sequence."""
        prob = 1.0
        for context, event in _make_ngram_tuples(seq, self._n):
            prob *= self.prob(event, context)
        return prob

    def allngrams(self):
        """Return all N-grams observed by the model and their probabilities."""
        ngram_probs = ((event, context, self.prob(event, context))
                       for context, dist in self._estimators.items()
                       for event in dist)
        return sorted(ngram_probs, key=itemgetter(1))
def train_model_get_cosine_matrix(statements):
    statements = [statement.split() for statement in statements]

    frequencies = FreqDist(w for word in statements for w in word)

    conditionalFrequencies = ConditionalFreqDist(
                                (key,word)
                                for key in sorted(frequencies.keys())
                                for statement in statements
                                for word in statement 
                                if key in statement)
        
    pmi = [[npmi_scorer(frequencies[worda], 
                  frequencies[wordb], 
                  conditionalFrequencies[worda][wordb], 
                  len(frequencies.keys()),
                  2,
                  sum(frequencies[key] for key in frequencies.keys()))
        for wordb in sorted(frequencies.keys())]
        for worda in sorted(frequencies.keys())]
        
        
    pmi = np.array(pmi)
    pmi[np.isinf(pmi)] = -1
    pmi[np.where(pmi < 0)] = 0
        
    pmi = pd.DataFrame(pmi)
    pmi.columns = sorted(frequencies.keys())
    pmi.index = sorted(frequencies.keys())

    return pmi
Example No. 27
    def test_increment(self):
        # make sure that we can still mutate cfd normally
        text = "cow cat mouse cat tiger"
        cfd = ConditionalFreqDist()

        # create cfd with word length as condition 
        for word in tokenize.word_tokenize(text):
            condition = len(word)
            cfd[condition][word] += 1

        self.assertEqual(cfd.conditions(), [3,5])

        # incrementing previously unseen key is still possible
        cfd[2]['hi'] += 1
        self.assertEqual(set(cfd.conditions()),set([3,5,2])) # new condition added
        self.assertEqual(cfd[2]['hi'], 1) # key's frequency incremented from 0 (unseen) to 1
Example No. 28
def init_prob_unit():
    # initialize uniform prob distribution to t(e|f)
    print("Initializing Uniform Prob distribution")
    N = len(de_inp)
    if N != len(en_inp):
        print("number of lines in src and target don't match!")
    ten_de = CondFDist()
    for num in range(N):
        for de_word in de_inp[num].split():
            for en_word in en_inp[num].split():
                ten_de[de_word].inc(en_word)
    # make probs uniform
    for de_word in ten_de.conditions():
        for key in ten_de[de_word].keys():
            ten_de[de_word][key] = 1.0 / len(ten_de[de_word])
            # print(ten_de[de_word][key])
    return ten_de
Example No. 29
 def train(self):
     """
     This trains a simple baseline which just uses majority class voting for every word in vocabulary
     disregarding of its context
     """
     self.word_pos_cfd = ConditionalFreqDist(
         tp for seq_list in self.corpus.train
         for tp in seq_list.get_tag_word_tuples())
Example No. 30
def find_language(string):
    text=string.split(" ")
    text=[word for word in text if word.isalpha()]
    l=len(text)
    avail_langs=[file for file in udhr.fileids() if 'Latin1' in file]
    cfd=ConditionalFreqDist([(lang, word) for lang in avail_langs for word in [word for word in text if word in udhr.words(lang)]])
    ls=sorted([(lang,cfd[lang]) for lang in avail_langs], key=lambda tple: tple[1].N())
    print("The most probable language of the text is {0} with {1:3.3f}% probability.".format(ls[-1][0].replace('-Latin1',''), 100*ls[-1][1].N()/l))
Example No. 31
 def suf_tag_freq(self):
     cfd = ConditionalFreqDist()
     for w in set(self.wt_freq.keys()) - set(self.c_words):
         for t in self.wt_freq[w].keys():
             for suf_len in xrange(1, max(self.max_suf_len, len(w))):
                 suf = w[-suf_len:]
                 cfd[suf].inc(t, self.wt_freq[w][t])
             cfd[''].inc(t)
     return cfd
Example No. 32
def language_model(collection):
    from nltk import ConditionalProbDist
    from nltk import ConditionalFreqDist
    from nltk import bigrams
    from nltk import MLEProbDist
    words = tokenize_collection(collection)
    freq_model = ConditionalFreqDist(bigrams(words))
    prob_model = ConditionalProbDist(freq_model, MLEProbDist)
    return prob_model
    def train(self):
        """ Construct the conditional frequencies and probabilities """
        #extract tags from sentences

        tags = [tag for (_, tag) in self.tagged_sents]
        self.replaceUnique()
        self.emission_frequencies = ConditionalFreqDist(
            [tup[::-1] for tup in self.tagged_sents])
        self.tagset_size = len(self.emission_frequencies.conditions())

        # emission - probability that a certain tag is a certain word
        # e.g. probability that a VB is 'race'
        self.emission_probabilities = ConditionalProbDist(
            self.emission_frequencies, MLEProbDist)
        self.transition_frequencies = ConditionalFreqDist(bigrams(tags))
        self.transition_probabilities = ConditionalProbDist(
            self.transition_frequencies, MLEProbDist)
        self.word_tag_frequencies = ConditionalFreqDist(self.tagged_sents)
Example No. 34
def get_lookup_tagger_accuracy(test_set, lookup_tagger_basis, corpus):
    words = [word for sent in lookup_tagger_basis for word in sent]
    fd = FreqDist(words)
    cfd = ConditionalFreqDist(corpus.tagged_words())
    most_freq_words = fd.most_common(200)
    likely_tags = dict(
        (word, cfd[word].max()) for (word, _) in most_freq_words)
    baseline_tagger = UnigramTagger(model=likely_tags)
    result = baseline_tagger.evaluate(test_set)
    return result
Example No. 35
 def __get_conditional_freq_dist(self):
     t = trange(
         len(self.__ngram),
         desc=
         f'Creating Conditional frequency distributions for {len(self.__ngram[0])}-gram'
     )
     condition_pairs = []
     for i in t:
         words = self.__ngram[i]
         condition_pairs.append((tuple(words[:-1]), words[-1]))
     return ConditionalFreqDist(condition_pairs)
Example No. 36
def nltk_test_3():
	# For each token, count current word given previous word.
	# Create distribution object.
	# cfd = ConditionalFreqDist()
	# for word in word_tokenize(sent):
	# 	condition = len(word)
	# 	cfd[condition][word] += 1
	words = gutenberg.words('austen-persuasion.txt')
	# condition on the previous word so that cfd[word] holds its possible followers
	cfd = ConditionalFreqDist(zip(words, words[1:]))
	# Start predicting at the given word, say 'therefore'
	word = 'therefore'
	i = 1
	print cfd.N()
	print cfd.conditions()
	# Find all words that can possibly follow the current word and choose one at random
	while i <= 20:
		print word,
		lwords = list(cfd[word])
		follower = choice(lwords)
		word = follower
		i += 1
Example No. 37
def display():
    import pylab
    words_by_freq = FreqDist(brown.words(categories='news')).most_common(2**15)
    cfd = ConditionalFreqDist(brown.tagged_words(categories='news'))
    sizes = 2 ** pylab.arange(15)
    perfs = [performance(cfd, words_by_freq[:size]) for size in sizes]
    pylab.plot(sizes, perfs, '-bo')
    pylab.title('Lookup Tagger Performance with Varying Model Size')
    pylab.xlabel('Model Size')
    pylab.ylabel('Performance')
    pylab.show()
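The performance helper is not part of the excerpt; a sketch consistent with the lookup-tagger experiment in the NLTK book (the exact original may differ) would be:

from nltk import UnigramTagger, DefaultTagger

def performance(cfd, wordlist):
    # wordlist holds (word, count) pairs from most_common(); build a lookup model
    lt = dict((word, cfd[word].max()) for (word, _) in wordlist)
    baseline_tagger = UnigramTagger(model=lt, backoff=DefaultTagger('NN'))
    return baseline_tagger.evaluate(brown.tagged_sents(categories='news'))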
Example No. 38
def conditional_dist():
    cfdist = ConditionalFreqDist()
    fileids = corpus.gutenberg.fileids()
    for id in fileids:
        condition = id
        filteredText = freq_dist_filter(corpus.gutenberg.words(condition))
        for word in filteredText:
            if word not in cfdist[condition]:
                cfdist[condition][word] = 0
            cfdist[condition][word] += 1
    return cfdist
def visualize_monthly_cfd(cfd, figuretitle, ylabel, imgoutpath):
    cfd_reverse = ConditionalFreqDist((entity, month) for month in cfd.conditions() for entity in list(cfd[month]))
    
    months = cfd.conditions()
    months.sort()
    
    barlabels = cfd_reverse.conditions()
    #print months
    print barlabels
    
    
    yitemsmatrix = []
    
    for entity in barlabels:
        row = []
        for month in months:
            row.append(cfd[month][entity])
        yitemsmatrix.append(row)
    
    
    if len(barlabels) == 0 or len(yitemsmatrix) == 0:
        return
    
    yitemsmatrix = np.array(yitemsmatrix)
    #yitemsmatrix = yitemsmatrix.T
    print yitemsmatrix.shape
    
    colors = plotter.get_n_colors(len(barlabels))
    months = map(lambda x : str(x), months)
    
    
    # partition the figure in case x axis gets too large by the number of months
    numofxitems = 5
    numoffigures = (len(months) / numofxitems ) + 1
    for i in range(numoffigures):
        matrix = yitemsmatrix[:, (i*numofxitems) : ((i+1)*numofxitems)] 
        print matrix
        xlabels = months[(i*numofxitems) : ((i+1)*numofxitems)]
        # save fig. pass img path with i
        figurename = figuretitle + " "+ str(i)
        cfdplotter.multiplebargraphs(barlabels, matrix.tolist(), colors, figurename, xlabels, ylabel, imgpath=imgoutpath)
Example No. 40
 def __init__(self):
     """Initializes the del_probs and ins_probs variables to empty MLE probability distributions,
     and the sub_probs to an empty conditional probability distribution."""
     self.del_probs = MLEProbDist(
         FreqDist()
     )  # a MLE probability distribution representing how likely each character is to be deleted
     self.ins_probs = MLEProbDist(
         FreqDist()
     )  # a MLE probability distribution representing how likely each character is to be inserted
     self.sub_probs = ConditionalProbDist(
         ConditionalFreqDist(), MLEProbDist
     )  # a Conditional Probability Distribution representing how likely a given character is to be replaced by another character
Example No. 41
    def learn(self, A):
        total_y = float(len(A))
        self.cls_fd = cls_fd = FreqDist()
        self.feature_fd = feature_fd = FreqDist()
        pairs = []
        for x, y in A:
            cls_fd.inc(y)
            for feature in set(get_words(x)):
                pairs.append((y, feature))
                feature_fd.inc(feature)
        cfd = ConditionalFreqDist(pairs)

        if DEBUG:
            print cfd
            print cfd.conditions()
            #cfd.tabulate(samples=['gbs', 'build', 'spec', 'repo', 'config'])
            cfd.tabulate()
            for author in cfd.conditions():
                print 'AUTHOR:', author
                for word, count in cfd[author].items():
                    print '%5d %20s' % (count, word)

        self.voc = voc = feature_fd.keys()

        self.cls_feature_prob = cls_feature_prob = {}
        self.cls_and_feature_prob = cls_and_feature_prob = {}
        for cls, total in cls_fd.items():
            fd = cfd[cls]

            cls_feature_prob[cls] = wc = {}
            for word in voc:
                if word in fd:
                    cls_feature_prob[(cls, word)] = float(fd[word]) / total
                    cls_and_feature_prob[(cls, word)] = float(fd[word]) / total_y
                else:
                    cls_feature_prob[(cls, word)] = 1. / total
                    cls_and_feature_prob[(cls, word)] = 1. / total_y

        self.feature_prob = feature_prob = {}
        for word, count in feature_fd.items():
            feature_prob[word] = count / total_y
Example No. 42
#!/usr/bin/env python

from nltk.corpus import brown
from nltk import FreqDist, ConditionalFreqDist
fd = FreqDist()
cfd = ConditionalFreqDist()

# for each tagged sentence in the corpus, get the (token, tag) pair and update
# both count(tag) and count(tag given token)
for sentence in brown.tagged_sents():
    for (token, tag) in sentence:
        fd[tag] += 1
        cfd[token][tag] += 1

# The most frequent tag is ...
print(fd.max())

# Initialize a list to hold (numtags,word) tuple
wordbins = []

# Append each (n(unique tags for token),token) tuple to list
for token in cfd.conditions():
    wordbins.append((cfd[token].B(), token))

# Sort tuples by number of unique tags (highest first)
wordbins.sort(reverse=True)

# The token with max. no. of tags is ...
print(wordbins[0])

# masculine pronouns
Example No. 43
from nltk.corpus import brown
from nltk import ConditionalFreqDist as CondFreqDist

categories = brown.categories()
words = ["likely" , "perhaps" , "probably" , "maybe" ]
words = ["female" , "male" , "gentleman" , "lady" , "boy" , "girl"]
cfd = CondFreqDist([(cat , word) for cat in categories\
					for word in brown.words(categories = cat)])
cfd.tabulate(conditions = categories , samples = words)

#!/usr/bin/python
#coding=utf-8

from nltk import ConditionalFreqDist
from nltk.corpus import brown

words = brown.tagged_words(tagset = 'universal')

# Which word has the largest number of distinct POS tags?
maximumTagNumber = 0
result = ''
cfd = ConditionalFreqDist((word.lower(), tag) for (word, tag) in words)
for word in cfd.conditions():
    if len(cfd[word]) > maximumTagNumber:
        maximumTagNumber = len(cfd[word])
        result = word + ' (' + ', '.join(tag for (tag, _) in cfd[word].most_common()) + ')'
    elif len(cfd[word]) == maximumTagNumber:
        result += '\n' + word + ' (' + ', '.join(tag for (tag, _) in cfd[word].most_common()) + ')'
print result
Example No. 45
def modal_analysis(keyword_list, modals_list):
    # ConditionalFreqDist expects (condition, sample) pairs, so pair each keyword with its modal
    cfd = ConditionalFreqDist(zip(keyword_list, modals_list))
    return cfd.tabulate(conditions=keyword_list, samples=modals_list)
from nltk.corpus import brown
from nltk import ConditionalFreqDist as CondFreqDist

cfd = CondFreqDist(
    [
        (genre, word.lower())
        for genre in brown.categories()
        for target in ["romance", "news"]
        if genre.lower().startswith(target)
        for word in brown.words(categories=target)
    ]
)
days = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "love", "political"]
cfd.tabulate(samples=days)
themes = [word for words in df[df.columns[2]] for word in words.split(';') if len(word)]
print(len(themes))  # 45648; most rows turn out to have only 2 themes
print(set([len(word) for word in themes]))  # {1, 2, 3, 4, 5, 6}; the longest theme is 6 characters
print([word for word in themes if len(word) == 6])  # ['iphone', 'iphone', 'iphone', 'iphone']; they are actually English words...
# sentiment keywords
sen_words = [word for words in df[df.columns[3]] for word in words.split(';') if len(word)]
print(len(sen_words))  # 45648
print(set([len(word) for word in sen_words]))  # {1, 2, 3, 4, 5, 6}; the longest keyword is 6 characters
print([word for word in sen_words if len(word) == 6])  # ['没有物美价廉', '不会心平气和', '不是别出心裁', '不是结实耐用, ...]...; these are getting unwieldy...
# sentiment polarity values (positive/negative)
anls = [word for words in df[df.columns[4]] for word in words.split(';') if len(word)]
print(len(anls))  # 45648
## combine sen_words and anls; pair each sentiment word with its polarity value and look for words tagged with more than one polarity
print(sen_words[:10])  # ['实惠', '快', '也好', '太长', '太贵', '不方便', '差', '无语', '满意', '好']
print(anls[:10])  # ['1', '1', '1', '-1', '-1', '-1', '-1', '-1', '1', '1']
con = ConditionalFreqDist(zip(sen_words, anls))
print(con)  # <ConditionalFreqDist with 3032 conditions>; identical keys are merged
print([condition for condition in con.conditions() if len(con[condition].keys()) > 1])  # ['不容易', '不高']; only two words carry more than one polarity value (-1, 0, 1)
## save theme, sentiment_word and anls to files
with open('./tmp_dataset/BDCI2017-taiyi/theme.txt', 'w') as f: f.write('\n'.join(themes))
with open('./tmp_dataset/BDCI2017-taiyi/word.txt', 'w') as f: f.write('\n'.join(sen_words))
with open('./tmp_dataset/BDCI2017-taiyi/word_score.txt', 'w') as f: f.write('\n'.join(word + ' ' + anls for word, anls in zip(sen_words, anls)))
##################################################################
## Part 2: data preprocessing; split the DataFrame into four lists and save each one
# df = xlsx.parse("Sheet1")  # NaN was replaced with NULL above, so reload here; turned out to be unnecessary, just drop NULL when using the data
contents = [str(word) for word in list(df[df.columns[1]].values)]; print(contents[:10])
themes = [str(word) for word in list(df[df.columns[2]].values)]; print(themes[:10])
words = [str(word) for word in list(df[df.columns[3]].values)]; print(words[:10])
anls = [str(word) for word in list(df[df.columns[4]].values)]; print(anls[:10])
print('len of contents:', len(contents))  # len of contents: 20000
print('len of words:', len(words))  # len of words: 20000
Example No. 48
#!/usr/bin/python
# coding: utf-8

# 2013/03/20

from nltk import ConditionalFreqDist

cfdist = ConditionalFreqDist(pairs) # build the distribution from the data in `pairs`, a sequence of (condition, event) pairs
cfdist.conditions() # list of the conditions, alphabetically sorted
cfdist['condition'] # the frequency distribution for the given condition
cfdist['condition'][sample] # frequency of the given sample under that condition
cfdist.tabulate()
cfdist.tabulate(samples, conditions)
cfdist.plot()
cfdist.plot(samples, conditions)
cfdist1 < cfdist2


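A concrete run of the calls above, with a tiny invented pair list:

pairs = [('fruit', 'apple'), ('fruit', 'pear'), ('veg', 'leek'), ('fruit', 'apple')]
cfdist = ConditionalFreqDist(pairs)
print(cfdist.conditions())        # ['fruit', 'veg']
print(cfdist['fruit']['apple'])   # 2
cfdist.tabulate()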
Example No. 49
from load_data import get_df, select_columns
from itertools import combinations
from nltk import ConditionalFreqDist

# get the data as a dataframe
df = get_df(shortname='clean_apx')
mask_data_source = df['DataSource'] == 'APX'
df_select = df[mask_data_source]

# choose subset of columns and cast all values as string
df_select = df_select[select_columns].astype(str)

# choose a smaller subset of columns to analyse
report_columns = select_columns[1:5]

# a list of all pairwise combinations
combo_count = 2
groupby_columns = list(combinations(report_columns, combo_count))

# create a list of tuples
groupby_column = list(groupby_columns[0])
arr = df_select[list(groupby_column)].values
pairs = list(tuple(map(tuple, arr)))

# and now for the good stuff
cfd = ConditionalFreqDist(pairs)
conditions = cfd.conditions()

import pdb; pdb.set_trace()
#!/usr/bin/python3
# coding: utf-8
import nltk
from nltk import ConditionalFreqDist
from nltk.corpus import brown
from nltk.corpus import names
from nltk.corpus import inaugural
from nltk.corpus import toolbox
from nltk.corpus import udhr
##################################################################
## A simple ConditionalFreqDist application: text sentiment analysis
word = ['实惠', '快', '也好', '快', '也好']
anls = ['1', '1', '1', '-1', '1']
tmp_Con = ConditionalFreqDist(zip(word, anls))
print(tmp_Con)  # <ConditionalFreqDist with 3 conditions>; identical conditions are merged
print(tmp_Con.tabulate())
print(tmp_Con.conditions())  # ['实惠', '快', '也好']
print(tmp_Con['快'].most_common())  # [('1', 1), ('-1', 1)]
print(tmp_Con['快'].keys())  # dict_keys(['1', '-1'])
print(len(tmp_Con['快'].keys()))  # 2; shows how many distinct polarity values each word has...
print(len(tmp_Con['也好'].keys()))  # 1; duplicates are already collapsed, set()-style
print([condition for condition in tmp_Con.conditions() if len(tmp_Con[condition].keys()) > 1])  # ['快']
tmp_Con.plot()
tmp_Con_1 = ConditionalFreqDist(zip(anls, word))
print(tmp_Con_1.conditions())  # ['1', '-1']
##################################################################
## Grouping analysis of words in the Brown corpus
print(brown.categories())  # ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
cfd = nltk.ConditionalFreqDist((genre, word) for genre in brown.categories() for word in brown.words(categories=genre))  # the categories=genre argument must not be dropped here
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']  # picked from brown.categories()
modals = ['can', 'could', 'may', 'might', 'must', 'will']  # a few arbitrarily chosen words
 def find_word_matrices(self, newsidlist, processcontent=True, prepend="content"):
     dateroots = []
     datePOStag = []
     
     titleexclamation = [("newsid", "title_exclamation")]
     
     textPOStag = []
     textroots = [] 
     textrootsWpostag = []
     textliterals = []
     
     print prepend, " processing:"
     for newsid in newsidlist:
         print "newsid ",newsid
         filepath = extractnewsmetadata.newsid_to_filepath(newsid)
         content, title, date = extractnewsmetadata.get_news_article2(filepath)
         text = ""
         if processcontent:
             text = content
         else:
             text = title
             if "!" in title:
                 titleexclamation.append((newsid, 1))
             else:
                 titleexclamation.append((newsid, 0))
         
         words = texter.getwords(text)
         lemmata = SAKsParser.lemmatize_lexicon(words)
         for (literal, literalPOS, root, rootPOS) in lemmata:
             
             root = texter.cleanword(root)
             if (len(root) > 0) or (not root.isspace()):
                 #print root,
                 textPOStag.append((newsid, literalPOS))
                 textroots.append((newsid, root))
                 textrootsWpostag.append((newsid, root+" Wpostag "+rootPOS))
                 textliterals.append((newsid, literal+" Wpostag "+literalPOS))
                 dateroots.append((date, root))
                 datePOStag.append((date, literalPOS))
     
       
     cfd_dateroots = ConditionalFreqDist(dateroots)
     cfd_datepostag = ConditionalFreqDist(datePOStag)
     cfd_textpostag = ConditionalFreqDist(textPOStag)
     cfd_textroots = ConditionalFreqDist(textroots)
     cfd_textrootWpostag = ConditionalFreqDist(textrootsWpostag)
     cfd_textliterals = ConditionalFreqDist(textliterals)
     
     print "some id's", cfd_textroots.conditions()
     
     cfd_roottext = ConditionalFreqDist((word, docid) for docid in cfd_textroots.conditions()
                                        for word in list(cfd_textroots[docid])) 
             
     
     # cfd to csv; fix: condition items as columns:
     csvpath = os.path.join(self.matrixpath, prepend+"-dateroot.csv")
     CFDhelpers.cfd_to_matrix(cfd_dateroots, csvpath)
     
     csvpath = os.path.join(self.matrixpath, prepend+"-datepostag.csv")
     CFDhelpers.cfd_to_matrix(cfd_datepostag, csvpath)
     
     csvpath = os.path.join(self.matrixpath, prepend+"-postagCOUNT.csv")
     CFDhelpers.cfd_to_matrix(cfd_textpostag, csvpath)
     
     termcountcsvpath = os.path.join(self.matrixpath, prepend+"termCOUNT.csv")
     CFDhelpers.cfd_to_matrix(cfd_textroots, termcountcsvpath)
     tfidfcsvpath = os.path.join(self.matrixpath, prepend+"termTFIDF.csv")
     texter.compute_tfidf_ondisc(termcountcsvpath, tfidfcsvpath)
             
     csvpath = os.path.join(self.matrixpath, prepend+"-rootcountindex.csv")
     CFDhelpers.cfd_to_matrix(cfd_roottext, csvpath)
     
     csvpath = os.path.join(self.matrixpath, prepend+"rootWpostagCOUNT.csv")
     CFDhelpers.cfd_to_matrix(cfd_textrootWpostag, csvpath)
     
     csvpath = os.path.join(self.matrixpath, prepend+"literalWpostagCOUNT.csv")
     CFDhelpers.cfd_to_matrix(cfd_textliterals, csvpath)
     
     
     # continue from the other csv files
     
     if not processcontent:
         print "keep exclamation !"
         IOtools.tocsv_lst(titleexclamation, os.path.join(self.matrixpath, prepend+"-exclamation.csv"))
     if token.has_key('tag'):
         if token['tag'] == None:
             short_tag = '--'
         else:
             short_tag = token['tag'][:2]+token['tag'][-1:]
         long_tag = token['tag']
     tag_types.add(long_tag)
     if token['lemma']:
         lemma_pos = token['lemma']+'.'+get_wordnet_pos(token['pos'])
         lemma_pairs.append((token['lemma'], short_tag))
         lemma_long_pairs.append((token['lemma'], long_tag))
     tagged_pairs.append((token['textlc'], short_tag))
 
 # Print vocabularies for each tag type
 for tag_type in tag_types:
     vocabulary_cfd = ConditionalFreqDist([(lemma, long_tag) for (lemma, long_tag) in lemma_long_pairs if long_tag == tag_type])
     print vocabulary_cfd.tabulate()
 
 #events_cfd = ConditionalFreqDist(tagged_pairs)
 # Conditional frequency distribution for (lemma, tag) pairs
 events_cfd = ConditionalFreqDist(lemma_pairs)
 
 unambiguous_words = [word for word in events_cfd.conditions() if len(events_cfd[word].items()) < 2]
 
 ambiguous_words = [word for word in events_cfd.conditions() if len(events_cfd[word].items()) > 1]
 
 print "Unambiguous Words"
 print events_cfd.tabulate(conditions=unambiguous_words)
 
 print "Ambiguous Words"
 print events_cfd.tabulate(conditions=ambiguous_words)
Example No. 53
from nltk.corpus import names
from nltk import ConditionalFreqDist as CondFreqDist

g2n = CondFreqDist([(gender, name[0]) for gender in names.fileids() for name in names.words(gender)])
n2g = CondFreqDist([(name[0] , gender) for gender in names.fileids() for name in names.words(gender)])
g2n.plot()
n2g.plot()

Example No. 54
#!/usr/bin/python
#coding:utf-8

import nltk
from nltk.corpus import reuters
from nlp.discounting import discount
from nltk import ConditionalFreqDist as CFreqDist
from scipy.sparse import lil_matrix,csr_matrix

fids = [reuters.fileids()[0]]
docs = [[nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(reuters.raw(fileids=[fid]))] for fid in fids]
word_list = sorted(set(word for doc in docs for sent in doc for word in sent))
word_dict = dict((word,i) for i,word in enumerate(word_list))
idx_docs = [[[word_dict[word] for word in sent] for sent in doc]for doc in docs]
trigram_docs = [[discount.ngrams2(sent,3) for sent in doc] for doc in idx_docs]
tri_fd = CFreqDist(gram for doc in trigram_docs for sent in doc for gram in sent)
l = len(word_list)

# maximum likelihood estimate of P(word | cond) would be:
#   float(tri_fd[cond][word]) / tri_fd[cond].N()

A = lil_matrix(((l+1)**2,l+1))
for cond in tri_fd.conditions():
    n = float(tri_fd[cond].N())
    for word,val in tri_fd[cond].items():
        A[cond[0]*(l+1)+cond[1],word] = val/ n

A = lil_matrix(((l+1)**2,l+1),dtype=int)
for cond in tri_fd.conditions():
    for word,val in tri_fd[cond].items():
        A[cond[0]*(l+1)+cond[1],word] = val
Example No. 55
 def inspect(self, missed):
     """
     Inspect a testing session, and print data about tag accuracy
     
     :param missed: list of tuples of missed tags like:
         (hmm_tagged_word, gold_tagged_word, hmm_context, gold_context)
     """
     
     # create a CFD so we can examine a matrix of incorrect vs correct tags
     # ms[1][1] = tag of a gold_tagged_word
     # ms[0][1] = tag of an hmm_tagged_word
     cfd = ConditionalFreqDist((ms[1][1], ms[0][1]) for ms in missed)
     
     # initialize a hash to store mistakes by frequency
     mistakes = {}
     
     # print a table showing mistake frequency
     cfd.tabulate()
     msg("\n")
     
     # loop through mistake frequencies by gold standard tag, i.e., if we are
     # examining gold-standard 'IN', count what we incorrectly tagged it as
     conds = cfd.conditions()
     for g_tag in conds:
         for hmm_tag in cfd[g_tag].keys():
             # how many times did we incorrectly say g_tag was hmm_tag?
             count = cfd[g_tag][hmm_tag]
             
             # add these mistakes to the count
             if count not in mistakes.keys():
                 mistakes[count] = []
             mistakes[count].append((hmm_tag, g_tag))
             
     # get a list of all mistake types that occurred over a threshold, worst first
     mistake_counts = set([count for (count, mistake_set) in \
         mistakes.iteritems() if count > Tagger.mistake_threshold])
     mistake_counts = reversed(sorted(mistake_counts))
     
     # now create a list of mistake types to show the user, i.e., loop 
     # through all types and if they are of a high-frequency type, add to list
     mistakes_to_halt = []
     for count in mistake_counts:
         mistake_set = mistakes[count]
         for mistake_tuple in mistake_set:
             mistakes_to_halt.append(mistake_tuple)
             msg("%d\t%s\twas really\t%s\n" % (count, mistake_tuple[0], \
                 mistake_tuple[1]))
     msg("\n")
     
     # create separators used when outputting missed word contexts
     sep_big = "---------------------------------------------------\n"
     sep_small = "\n-----------------------------------------\n"
     
     # loop through individual mistakes and, if they match the kind of error
     # we want to halt for, show the user the mistake as well as the sentence
     # context for both the gold-standard sentence and the hmm-tagged sentence
     response = None
     for missed_set in missed:
         if response not in ['q','Q']:
             (hmm_tagged_word, gold_tagged_word, hmm_tagged_sent, \
                 gold_tagged_sent) = missed_set
             should_halt = False
             # determine whether the current mistake matches a mistake type
             # we want to halt for
             for pair in mistakes_to_halt:
                 if hmm_tagged_word[1] == pair[0] and \
                     gold_tagged_word[1] == pair[1]:
                     should_halt = True
             if should_halt:
                 msg("%sTagged '%s' with %s when it should have been %s.%s" %\
                 (sep_big, hmm_tagged_word[0], hmm_tagged_word[1],\
                     gold_tagged_word[1], sep_small))
                 
                 msg("Gold: " + (' '.join([(w[0] + "/" + w[1]) for w in \
                     gold_tagged_sent])))
                 msg(sep_small)
                 msg("Mine: " + (' '.join([(w[0] + "/" + w[1]) for w in \
                     hmm_tagged_sent])))
                 
                 # get user input to decide whether to keep going
                 response = raw_input("\n\nEnter to continue, Q to quit: ")
Example No. 56
from nltk.corpus import gutenberg
from nltk import ConditionalFreqDist
from random import choice

#create the distribution object
cfd = ConditionalFreqDist()

## for each token count the current word given the previous word
prev_word = None
for word in gutenberg.words('austen-persuasion.txt'):
    cfd[prev_word][word] += 1
    prev_word = word

## start predicting at given word, say "therefore"
word = "therefore"
i = 1

## find all words that can follow the given word and choose one at random
while i<20:
    print word,
    lwords = cfd.get(word).keys()
    follower = choice(lwords)
    word = follower
    i += 1
    

Output data: Graph. 
"""

import nltk
wnl = nltk.WordNetLemmatizer()

from nltk.corpus import PlaintextCorpusReader
from nltk import ConditionalFreqDist

corpus = PlaintextCorpusReader('C:/Data/Candidate_tweets/Processing_tweets/By_week_tweets/Cleaned_by_week/', '.*')
corpus.fileids()[0:3]
print len(corpus.words())

cfd = ConditionalFreqDist(
    (target, fileid)
    for fileid in corpus.fileids()
    for w in corpus.words(fileid)
    for target in ['obama', 'romney', 'opponent']
    if w==target)
        
cfd.plot()


cfd = nltk.ConditionalFreqDist(
    (target, fileid)
    for fileid in corpus.fileids()
    for w in corpus.words(fileid)
    for target in ['democrat', 'republican', 'independent']
    if w==target)
        
cfd.plot()
Example No. 58
def pos_percentages(words, tag='NN'):
    cfd = ConditionalFreqDist((tag,1) for word,tag in  tagger.tag(words))
    relevant_tags = filter(lambda c: re.match(tag,c), cfd.conditions())
    sum_tags = sum([ cfd[c].N() for c in  relevant_tags ])
    return float(sum_tags)/float(len(words))
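Here `tagger` is assumed to be a pre-built POS tagger defined elsewhere in that module; a hypothetical call could be:

import nltk
tagger = nltk.PerceptronTagger()                     # assumed stand-in for the module's tagger
tokens = nltk.word_tokenize('The quick brown fox jumps over the lazy dog')
print(pos_percentages(tokens, tag='NN'))             # share of tokens whose tag matches NN*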