Example #1
def get_high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    '''
    Gets the high-information words using the chi-square measure
    '''
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    
    for label, words in labelled_words:
        for word in words:
            word_fd[word] += 1
            label_word_fd[label][word] += 1
    
    n_xx = label_word_fd.N()
    high_info_words = set()
    
    for label in label_word_fd.conditions():
        n_xi = label_word_fd[label].N()
        word_scores = collections.defaultdict(int)
        
        for word, n_ii in label_word_fd[label].items():
            n_ix = word_fd[word]
            score = score_fn(n_ii, (n_ix, n_xi), n_xx)
            word_scores[word] = score
        
        bestwords = [word for word, score in word_scores.items() if score >= min_score]
        high_info_words |= set(bestwords)
    
    return high_info_words
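A minimal usage sketch for the function above (not from the original source): the imports and the toy labelled-words list below are assumptions for illustration only.

# Assumed imports and toy data for get_high_information_words (illustrative only):
import collections
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.metrics import BigramAssocMeasures

labelled = [('pos', ['great', 'fun', 'great', 'plot']),
            ('neg', ['boring', 'dull', 'plot'])]
print(get_high_information_words(labelled, min_score=1))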
Example #2
    def emission_model(self, train_data):
        """
        Compute an emission model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The emission probability distribution and a list of the states
        :rtype: Tuple[ConditionalProbDist, list(str)]
        """
        # raise NotImplementedError('HMM.emission_model')

        # Don't forget to lowercase the observations, otherwise they won't match the test data
        # Do NOT add <s> or </s> to the input sentences

        # Repack train_data into a flat list of (tag, lowercase_word) pairs
        tagged_words = chain.from_iterable(train_data)
        data = [(tag, word.lower()) for (word, tag) in tagged_words]

        # Train the emission probability model
        emission_FD = ConditionalFreqDist(data)
        # Wrap each FreqDist in a Lidstone estimator with gamma=0.01 and one extra bin for unseen words
        lidstone_PD = lambda FD: LidstoneProbDist(
            FD, gamma=0.01, bins=FD.B() + 1)
        self.emission_PD = ConditionalProbDist(emission_FD, lidstone_PD)
        # Store the tags as states
        self.states = emission_FD.conditions()

        return self.emission_PD, self.states
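For context, a self-contained sketch of the same Lidstone-smoothed emission estimate on toy data; the toy corpus and variable names below are illustrative, not part of the original class.

# Standalone sketch of the emission-model pipeline above on toy data:
from itertools import chain
from nltk.probability import ConditionalFreqDist, ConditionalProbDist, LidstoneProbDist

toy_train = [[('The', 'DET'), ('dog', 'NOUN'), ('barks', 'VERB')],
             [('A', 'DET'), ('dog', 'NOUN'), ('sleeps', 'VERB')]]
data = [(tag, word.lower()) for (word, tag) in chain.from_iterable(toy_train)]
emission_FD = ConditionalFreqDist(data)
emission_PD = ConditionalProbDist(
    emission_FD, lambda fd: LidstoneProbDist(fd, gamma=0.01, bins=fd.B() + 1))
print(emission_PD['NOUN'].prob('dog'))   # close to 1.0 on this toy corpus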
    def emission_model(self, train_data):
        """
        Compute an emission model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The emission probability distribution and a list of the states
        :rtype: Tuple[ConditionalProbDist, list(str)]
        """
        # raise NotImplementedError('HMM.emission_model')
        # TODO prepare data

        # Don't forget to lowercase the observation otherwise it mismatches the test data
        # Do NOT add <s> or </s> to the input sentences
        data = []
        for sent in train_data:
            sent_parsed = list(map(lambda x: (x[1], x[0].lower()), sent))
            data.extend(sent_parsed)

        # TODO compute the emission model

        #print('pair num:', len(data))
        cfdist = ConditionalFreqDist(data)
        #print(cfdist.conditions())
        #print(len(dict(cfdist['ADP'])))
        cpdist = ConditionalProbDist(cfdist, myProbDist1, 0.01)
        emission_FD = cpdist
        self.emission_PD = emission_FD
        self.states = list(cfdist.conditions())
        #print(self.elprob('VERB','is'))
        #exit()
        return self.emission_PD, self.states
Example #4
def high_information_words(labeled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    """
    To eliminate low-information feature words from a set of words, for EFFICIENCY
    :param labeled_words: list of 2-tuples [(label, words)]
                          label -> a classification label (pos / neg)
                          words -> a list of words that occur under that label
    :param score_fn: a scoring function that measures how informative a word is
    :param min_score: the minimum score for a word to be included as a MOST INFORMATIVE WORD
    :return: a set of high informative words
    """
    print "Counting Word Frequencies"
    word_fq = FreqDist()
    labeled_word_fq = ConditionalFreqDist()

    for label, words in labeled_words:
        for word in words:
            word_fq[word] += 1
            labeled_word_fq[label][word] += 1
    n_xx = labeled_word_fq.N()
    high_info_words = set()

    for label in labeled_word_fq.conditions():
        n_xi = labeled_word_fq[label].N()
        word_scores = collections.defaultdict(int)

        for word, n_ii in labeled_word_fq[label].iteritems():
            n_ix = word_fq[word]
            score = score_fn(n_ii, (n_ix, n_xi), n_xx)
            word_scores[word] = score

        bestwords = [word for word, score in word_scores.iteritems() if score >= min_score]
        high_info_words |= set(bestwords)

    return high_info_words
   def _setSelectedPOSTags(self):

      buff = self._loadData('selective_pos.bin')

      if buff:
         self.selective_pos = buff
         return

      #First get all (word, tag) pairs in the corpus
      sentences = brown.tagged_sents(simplify_tags=True)
      self.selected_tags = ["ADJ","ADV", "CNJ"]
      self.selective_pos = ConditionalFreqDist()
      temp_dist = ConditionalFreqDist()
      for sentence in sentences:
         for (word, tag) in sentence:
            if tag in self.selected_tags:
               temp_dist[tag].inc(str(word).lower())

      #Now, keep the words with frequency > 4
      for category in temp_dist.conditions():
         fredist = temp_dist[category]
         for key in fredist.keys():
            if fredist[key] > 4:
               self.selective_pos[category].inc(key)

      self._saveData('selective_pos.bin',self.selective_pos)
Example #6
def high_information_words(files, score_fn=BigramAssocMeasures.chi_sq, min_score=50):
    word_dict = FreqDist()
    ocean_word_dict = ConditionalFreqDist()
    hiw_categories = []

    for file in files:
        # For each token, add 1 to the overall FreqDist and 1 to the ConditionalFreqDist under the current personality trait
        for token in file[0]:
            for trait in file[1]:
                ocean_word_dict[trait][token] += 1
            word_dict[token] += 1

    n_xx = ocean_word_dict.N()  # Get the total number of recordings in the ConditionalFreqDist
    high_info_words = set()

    for condition in ocean_word_dict.conditions():
        n_xi = ocean_word_dict[condition].N()  # Get the number of recordings for each personality trait
        word_scores = defaultdict(int)

        for word, n_ii in ocean_word_dict[condition].items():
            n_ix = word_dict[word]  # Get total number of recordings of a token
            score = score_fn(n_ii, (n_ix, n_xi), n_xx)
            word_scores[word] = score

        bestwords = [word for word, score in word_scores.items() if score >= min_score]
        bw = [k for k, v in sorted(word_scores.items(), key=lambda x: x[1], reverse=True)]
        high_info_words |= set(bestwords)
        hiw_categories.append((condition, bw[:10]))

    return high_info_words, hiw_categories
Example #7
def get_high_information_words(lwords, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    labels = lwords.keys()
    labelled_words = [(l, lwords[l]) for l in labels]
    word_freq_dist = FreqDist()
    label_word_freq_dist = ConditionalFreqDist()

    for label, dwords in labelled_words:
        for words in dwords:
            for word in words:
                word_freq_dist[word] += 1
                label_word_freq_dist[label][word] += 1

    n_words_total = label_word_freq_dist.N()
    high_info_words = set()

    for label in label_word_freq_dist.conditions():
        n_words_label = label_word_freq_dist[label].N()
        word_scores = defaultdict(int)

        for word, word_freq_label in label_word_freq_dist[label].items():
            word_freq = word_freq_dist[word]
            score = score_fn(word_freq_label, (word_freq, n_words_label), n_words_total)
            word_scores[word] = score

        bestwords = [word for word, score in word_scores.items() if score >= min_score]
        high_info_words |= set(bestwords)
    return high_info_words
Example #8
def high_information_words(labelled_words,
                           score_fn=BigramAssocMeasures.chi_sq,
                           min_score=5):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for label, words in labelled_words:
        for word in words:
            word_fd[word] += 1
            label_word_fd[label][word] += 1

    n_xx = label_word_fd.N()
    high_info_words = set()

    for label in label_word_fd.conditions():
        n_xi = label_word_fd[label].N()
        word_scores = collections.defaultdict(int)

        for word, n_ii in label_word_fd[label].items():
            n_ix = word_fd[word]
            score = score_fn(
                n_ii, (n_ix, n_xi), n_xx
            )  # n_ii is the word's occurrences under this label, n_ix is its occurrences in total,
            # n_xi is the total word count for this label, n_xx is the total word count overall
            word_scores[word] = score

        bestwords = [
            word for word, score in word_scores.items() if score >= min_score
        ]

        high_info_words |= set(bestwords)  # set union

    return high_info_words
def readFormatedData(formatedData):
    #unigramFd = FreqDist()
    #bigramFd = FreqDist()
    cBigramFd1 = ConditionalFreqDist()
    cBigramFd2 = ConditionalFreqDist()
    #dict1 = Set([])
    #dict2 = Set([])
    for tuple in formatedData:
        words = tuple[0].split(' ')
        count = int(tuple[1])
        #unigramFd.inc(words[0])
        #unigramFd.inc(words[1])
        #bigramFd.inc((words[0], words[1]), count)
        word2 = words[1]
        if count < 5:
            word2 = "unknown"
        cBigramFd1[words[0]].inc(word2, count)
        #if words[0] not in dict1:
        #    dict1.add(words[0])
        #if words[1] not in dict2:
        #    dict2.add(words[1])
    for w1 in cBigramFd1.conditions():
        bigram_w1 = cBigramFd1[w1]
        for w2 in bigram_w1.samples():
            cBigramFd2[w2].inc(w1, bigram_w1[w2])
    return cBigramFd1, cBigramFd2#, dict1, dict2
def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
	word_fd = FreqDist()
	label_word_fd = ConditionalFreqDist()
	
	for label, words in labelled_words:
		for word in words:
			word_fd.inc(word)
			label_word_fd[label].inc(word)
	
	n_xx = label_word_fd.N()
	high_info_words = set()
	
	for label in label_word_fd.conditions():
		n_xi = label_word_fd[label].N()
		word_scores = collections.defaultdict(int)
		
		for word, n_ii in label_word_fd[label].iteritems():
			n_ix = word_fd[word]
			score = score_fn(n_ii, (n_ix, n_xi), n_xx)
			word_scores[word] = score
		
		bestwords = [word for word, score in word_scores.iteritems() if score >= min_score]
		high_info_words |= set(bestwords)
	
	return high_info_words
 def Ae_kappa(self, cA, cB):
     Ae = 0.0
     nitems = float(len(self.I))
     label_freqs = ConditionalFreqDist((x['labels'], x['coder']) for x in self.data)
     for k in label_freqs.conditions():
         Ae += (label_freqs[k][cA] / nitems) * (label_freqs[k][cB] / nitems)
     return Ae
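The method above appears to be an excerpt from NLTK's annotation-agreement code; as context, here is a minimal sketch of the surrounding public API. The coder/item/label triples are invented for illustration.

# Minimal sketch of nltk.metrics.agreement.AnnotationTask (toy data, illustrative only):
from nltk.metrics.agreement import AnnotationTask

task = AnnotationTask(data=[('c1', 'item1', 'v1'), ('c2', 'item1', 'v1'),
                            ('c1', 'item2', 'v1'), ('c2', 'item2', 'v2')])
print(task.kappa())   # Cohen's kappa, which uses the expected agreement Ae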
    def _train(self, tagged_corpus, cutoff=0, verbose=False):
        """
        Initialize this ContextTagger's ``_context_to_tag`` table
        based on the given training data.  In particular, for each
        context ``c`` in the training data, set
        ``_context_to_tag[c]`` to the most frequent tag for that
        context.  However, exclude any contexts that are already
        tagged perfectly by the backoff tagger(s).

        The old value of ``self._context_to_tag`` (if any) is discarded.

        :param tagged_corpus: A tagged corpus.  Each item should be
            a list of (word, tag) tuples.
        :param cutoff: If the most likely tag for a context occurs
            fewer than cutoff times, then exclude it from the
            context-to-tag table for the new tagger.
        """

        token_count = hit_count = 0

        # A context is considered 'useful' if it's not already tagged
        # perfectly by the backoff tagger.
        useful_contexts = set()

        # Count how many times each tag occurs in each context.
        fd = ConditionalFreqDist()
        for sentence in tagged_corpus:
            tokens, tags = zip(*sentence)
            for index, (token, tag) in enumerate(sentence):
                # Record the event.
                token_count += 1
                context = self.context(tokens, index, tags[:index])
                if context is None:
                    continue
                fd[context][tag] += 1
                # If the backoff got it wrong, this context is useful:
                if (self.backoff is None or
                        tag != self.backoff.tag_one(
                        tokens, index, tags[:index])):
                    useful_contexts.add(context)

        # Build the context_to_tag table -- for each context, figure
        # out what the most likely tag is.  Only include contexts that
        # we've seen at least `cutoff` times.
        for context in useful_contexts:
            best_tag = fd[context].max()
            hits = fd[context][best_tag]
            if hits > cutoff:
                self._context_to_tag[context] = best_tag
                hit_count += hits

        # Display some stats, if requested.
        if verbose:
            size = len(self._context_to_tag)
            backoff = 100 - (hit_count * 100.0) / token_count
            pruning = 100 - (size * 100.0) / len(fd.conditions())
            print("[Trained Unigram tagger:", end=' ')
            print("size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
                size, backoff, pruning))
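This `_train` belongs to NLTK's ContextTagger; a short sketch of how it is typically exercised through a subclass such as UnigramTagger follows. The corpus slice and cutoff are arbitrary choices for illustration.

# Sketch of driving this _train via a ContextTagger subclass (arbitrary slice/cutoff):
from nltk.corpus import brown
from nltk.tag import UnigramTagger

train_sents = brown.tagged_sents(categories='news')[:500]
tagger = UnigramTagger(train_sents, cutoff=1, verbose=True)
print(tagger.tag(['The', 'dog', 'barked']))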
Example #13
    def _train(self, tagged_corpus, cutoff=0, verbose=False):
        """
        Initialize this ContextTagger's ``_context_to_tag`` table
        based on the given training data.  In particular, for each
        context ``c`` in the training data, set
        ``_context_to_tag[c]`` to the most frequent tag for that
        context.  However, exclude any contexts that are already
        tagged perfectly by the backoff tagger(s).

        The old value of ``self._context_to_tag`` (if any) is discarded.

        :param tagged_corpus: A tagged corpus.  Each item should be
            a list of (word, tag) tuples.
        :param cutoff: If the most likely tag for a context occurs
            fewer than cutoff times, then exclude it from the
            context-to-tag table for the new tagger.
        """

        token_count = hit_count = 0

        # A context is considered 'useful' if it's not already tagged
        # perfectly by the backoff tagger.
        useful_contexts = set()

        # Count how many times each tag occurs in each context.
        fd = ConditionalFreqDist()
        for sentence in tagged_corpus:
            tokens, tags = zip(*sentence)
            for index, (token, tag) in enumerate(sentence):
                # Record the event.
                token_count += 1
                context = self.context(tokens, index, tags[:index])
                if context is None:
                    continue
                fd[context][tag] += 1
                # If the backoff got it wrong, this context is useful:
                if self.backoff is None or tag != self.backoff.tag_one(
                        tokens, index, tags[:index]):
                    useful_contexts.add(context)

        # Build the context_to_tag table -- for each context, figure
        # out what the most likely tag is.  Only include contexts that
        # we've seen at least `cutoff` times.
        for context in useful_contexts:
            best_tag = fd[context].max()
            hits = fd[context][best_tag]
            if hits > cutoff:
                self._context_to_tag[context] = best_tag
                hit_count += hits

        # Display some stats, if requested.
        if verbose:
            size = len(self._context_to_tag)
            backoff = 100 - (hit_count * 100.0) / token_count
            pruning = 100 - (size * 100.0) / len(fd.conditions())
            print("[Trained Unigram tagger:", end=" ")
            print("size={}, backoff={:.2f}%, pruning={:.2f}%]".format(
                size, backoff, pruning))
Example #14
def _dump_cfdist(cfdist: ConditionalFreqDist) -> dict:
    data: dict = {}
    for cond in cfdist.conditions():
        for k, v in cfdist[cond].items():
            if cond not in data:
                data[cond] = {}
            if k not in data[cond]:
                data[cond][k] = 0
            data[cond][k] += v
    return data
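A small usage sketch for the helper above; the toy word list is invented.

# Usage sketch for _dump_cfdist (toy data, illustrative only):
from nltk.probability import ConditionalFreqDist

cfd = ConditionalFreqDist((w[0].lower(), w.lower())
                          for w in ['To', 'be', 'or', 'not', 'to', 'be'])
print(_dump_cfdist(cfd))
# -> {'t': {'to': 2}, 'b': {'be': 2}, 'o': {'or': 1}, 'n': {'not': 1}}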
Example #15
class hmm:
    def __init__(self, name=0, tag=0):
        self.name = name
        self.tag = tag

        self.wsj = nltk.corpus.brown.tagged_words()
        self.sentences = nltk.corpus.brown.sents()
        #self.cfdTagAll = ConditionalFreqDist(tag, word for (word, tag) in self.wsj)

    def findTags(self, mostCommon=5):
        if self.tag != 0:
            self.tag_prefix = self.tag
            self.cfdTag = ConditionalFreqDist(
                (tag, word) for (word, tag) in self.wsj
                if tag.startswith(self.tag_prefix))
            return dict((tag, self.cfdTag[tag].most_common(mostCommon))
                        for tag in self.cfdTag.conditions())
        else:
            print("invalid method")

    def findAllTags(self, mostCommon=5):
        self.cfdTagAll = ConditionalFreqDist(
            (tag, word) for (word, tag) in self.wsj)
        for tag in sorted(self.cfdTagAll):
            print(tag, self.cfdTagAll[tag].most_common())
            #print(self.cfdTagAll)
        return dict(self.cfdTagAll)

    def findBigrams(self):
        self.bigram = bigrams([tag for word, tag in self.wsj])
        return self.bigram

    def biFrekvens(self, mostCommon=5):
        self.cfdBigram = ConditionalFreqDist(self.bigram)
        return dict((tag, self.cfdBigram[tag].most_common(mostCommon))
                    for tag in self.cfdBigram)

    def findName(self, mostCommon=5):
        if self.name != 0:
            self.cfdName = ConditionalFreqDist(
                (word.lower(), tag) for (word, tag) in self.wsj)
            return [self.name, self.cfdName[self.name].most_common(mostCommon)]
        else:
            print("invalid method")

    def findCPD(self, typecfd=None):
        if (typecfd == None):
            self.cpdTag = nltk.ConditionalProbDist(self.cfdTag,
                                                   nltk.MLEProbDist)
            return self.cpdTag
        elif (typecfd == "bi"):
            return ConditionalProbDist(self.cfdBigram, nltk.MLEProbDist)
        else:
            print("invalid method")
Example #16
def _train(self, tagged_corpus, cutoff=0, verbose=False): 
    token_count = hit_count = 0 
    useful_contexts = set() 
    fd = ConditionalFreqDist() 
    tag_prob = FreqDist()
    for sentence in tagged_corpus: 
        tokens, tags = zip(*sentence) 
        for index, (token, tag) in enumerate(sentence): 
            # Record the event. 
            token_count += 1 
            tag_prob.inc(tag)
            context = self.context(tokens, index, tags[:index])
            if context is None: continue 
            fd[context].inc(tag) 
            # If the backoff got it wrong, this context is useful: 
            if (self.backoff is None or 
                tag != self.backoff.tag_one(tokens, index, tags[:index])): 
                useful_contexts.add(context) 
    # Build the context_to_tag table -- for each context,
    # calculate the entropy.  Only include contexts whose
    # entropy is lower than `cutoff`.
    total_tags = float(sum(tag_prob.values()))
    tags_probs = [(t,tag_prob[t]/total_tags) for t in tag_prob.keys()]
    useful_contexts_after_filter = useful_contexts.copy()
    most_high = FreqDist()
    for context in useful_contexts:
        dd = fd[context]
#        total_tags = float(sum(dd.values()))
#        tags_probs = [(t,dd[t]/total_tags) for t in dd.keys()]
        h = self.H(dd.keys(),tags_probs)
        if h > cutoff:
            useful_contexts_after_filter.remove(context)
            continue
        most_high[context] = h
    print most_high.keys()
    # Build the context_to_tag table -- for each context, figure
    # out what the most likely tag is.  
    for context in useful_contexts_after_filter:
        best_tag = fd[context].max()
        hits = fd[context][best_tag]
        self._context_to_tag[context] = best_tag
        hit_count += hits
    # Display some stats, if requested. 
    if verbose: 
        size = len(self._context_to_tag) 
        backoff = 100 - (hit_count * 100.0)/ token_count 
        pruning = 100 - (size * 100.0) / len(fd.conditions()) 
        print "[Trained Unigram tagger:", 
        print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (size, backoff, pruning)
Example #17
def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=0):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for label, sentences in labelled_words:
        for sent in sentences:
            words = preProcess(sent)
            for word in words:
                word_fd[word] += 1
                label_word_fd[label][word] += 1

    n_xx = label_word_fd.N()
    high_info_words = set()

    labelScore = []
    for label in sorted(label_word_fd.conditions()):
        # if label == 0:
        #     min_score = 1.0
        # elif label == 1:
        #     min_score = 1.0
        # elif label == 2:
        #     min_score = 1.0
        # elif label == 3:
        #     min_score = 1.0
        # elif label == 4:
        #     min_score = 1.0

        n_xi = label_word_fd[label].N()
        word_scores = collections.defaultdict(int)

        for word, n_ii in label_word_fd[label].items():
            n_ix = word_fd[word]
            score = score_fn(n_ii, (n_ix, n_xi), n_xx)
            word_scores[word] = score

        bestwords = [word for word, score in word_scores.items() if score >= min_score]
        high_info_words |= set(bestwords)
        labelScore.append(word_scores)

    which = 0
    for x in labelScore:
        sorted_x = sorted(x.items(), key=operator.itemgetter(1), reverse=True)
        labelCSV = pd.DataFrame(sorted_x)
        fileName = "wang2226_%d.csv" % which
        labelCSV.to_csv(fileName, index=False, sep=',')
        which += 1

    return high_info_words
Example #18
def validate_pcfg_generate(grammar):
    pd = makeLhrProbDict(grammar)
    productions = []
    cfd = ConditionalFreqDist()

    for i in np.arange(1000):
        tree = pcfg_generate(grammar)
        productions += tree.productions()

    for p in productions:
        cfd[p.lhs()].inc(p.rhs())

    for c in cfd.conditions():
        p = MLEProbDist(cfd[c])
        q = pd[c]
        div = KL_Divergence(p, q)
        print "KL_Divergence for %s = %f" % (c, div)
Example #19
def validate_pcfg_generate(grammar):
    pd = makeLhrProbDict(grammar)
    productions = []
    cfd = ConditionalFreqDist()
    
    for i in np.arange(1000):
        tree = pcfg_generate(grammar)
        productions += tree.productions()    

    for p in productions:
        cfd[p.lhs()].inc(p.rhs())
        
    for c in cfd.conditions():
        p = MLEProbDist(cfd[c])
        q = pd[c]
        div = KL_Divergence(p,q)
        print "KL_Divergence for %s = %f" %(c , div)
Example #20
    def _train(self, tagged_corpus, cutoff=0, verbose=False):
        """
        """
        token_count = hit_count = 0

        # A context is considered 'useful' if it's not already tagged
        # perfectly by the backoff tagger.
        useful_contexts = set()

        # Count how many times each tag occurs in each context.
        fd = ConditionalFreqDist()
        for sentence in tagged_corpus:
            tokens, tags = zip(*sentence)
            tags = [",".join(sorted(x.split(","))) for x in tags]
            for index, (token, tag) in enumerate(sentence):
                # Record the event.
                token_count += 1
                context = self.context(tokens, index, tags[:index])
                if context is None: continue
                fd[context][tag] += 1
                # If the backoff got it wrong, this context is useful:
                if (self.backoff is None or
                    tag != self.backoff.tag_one(tokens, index, tags[:index])):
                    useful_contexts.add(context)

        # Build the context_to_tag table -- for each context, figure
        # out what the most likely tag is.  Only include contexts that
        # we've seen at least `cutoff` times.
        for context in useful_contexts:
            #best_tag = fd[context].max()
            for (tag, hits) in fd[context].items():
                if hits > cutoff:
                    self._contexts_to_tags[context] = self._contexts_to_tags.get(context, {})
                    self._contexts_to_tags[context][tag] = hits
                    hit_count += hits

        # Display some stats, if requested.
        if verbose:
            size = len(self._contexts_to_tags)
            backoff = 100 - (hit_count * 100.0)/ token_count
            pruning = 100 - (size * 100.0) / len(fd.conditions())
            print("[Trained Unigram tagger:")
            print("size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
                  size, backoff, pruning))
Example #21
    def _train(self, tagged_corpus, cutoff=0, verbose=False):
        """
        """
        token_count = hit_count = 0

        # A context is considered 'useful' if it's not already tagged
        # perfectly by the backoff tagger.
        useful_contexts = set()

        # Count how many times each tag occurs in each context.
        fd = ConditionalFreqDist()
        for sentence in tagged_corpus:
            tokens, tags = zip(*sentence)
            for index, (token, tag) in enumerate(sentence):
                # Record the event.
                token_count += 1
                context = self.context(tokens, index, tags[:index])
                if context is None: continue
                fd[context][tag] += 1
                # If the backoff got it wrong, this context is useful:
                if (self.backoff is None or
                    tag != self.backoff.tag_one(tokens, index, tags[:index])):
                    useful_contexts.add(context)

        # Build the context_to_tag table -- for each context, figure
        # out what the most likely tag is.  Only include contexts that
        # we've seen at least `cutoff` times.
        for context in useful_contexts:
            #best_tag = fd[context].max()
            for (tag, hits) in fd[context].items():
                if hits > cutoff:
                    self._contexts_to_tags[context] = self._contexts_to_tags.get(context, {})
                    self._contexts_to_tags[context][tag] = hits
                    hit_count += hits

        # Display some stats, if requested.
        if verbose:
            size = len(self._contexts_to_tags)
            backoff = 100 - (hit_count * 100.0)/ token_count
            pruning = 100 - (size * 100.0) / len(fd.conditions())
            print "[Trained Unigram tagger:",
            print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
                size, backoff, pruning)
Example #22
def high_information_words(labelled_words, score_fn=BigramAssocMeasures.raw_freq, min_score=5):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for label, words in labelled_words:
        for word in words:
            word_fd.inc(word)
            label_word_fd[label].inc(word)
    n_xx = label_word_fd.N()
    high_info_words = set()
    for label in label_word_fd.conditions():
        n_xi = label_word_fd[label].N()
        word_scores = collections.defaultdict(int)
        for word, n_ii in label_word_fd[label].iteritems():
            n_ix = word_fd[word]
            score = score_fn(n_ii, (n_ix, n_xi), n_xx)
            word_scores[word] = score
        bestwords = [word for word, score in word_scores.iteritems() if score >= min_score]
        high_info_words |= set(bestwords)
    return high_info_words
Example #23
def sum_category_word_scores(categorized_words, score_fn):
	word_fd = FreqDist()
	category_word_fd = ConditionalFreqDist()
	
	for category, words in categorized_words:
		for word in words:
			word_fd.inc(word)
			category_word_fd[category].inc(word)
	
	scores = collections.defaultdict(int)
	n_xx = category_word_fd.N()
	
	for category in category_word_fd.conditions():
		n_xi = category_word_fd[category].N()
		
		for word, n_ii in iteritems(category_word_fd[category]):
			n_ix = word_fd[word]
			scores[word] += score_fn(n_ii, (n_ix, n_xi), n_xx)
	
	return scores
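A toy call for the scorer above; note the function relies on the older FreqDist.inc() API, so this sketch assumes an NLTK 2-era environment, and the categorized data is invented.

# Toy call for sum_category_word_scores (assumes the NLTK 2-era inc() API used above):
categorized = [('pos', ['good', 'fun', 'plot']),
               ('neg', ['bad', 'dull', 'plot'])]
scores = sum_category_word_scores(categorized, BigramAssocMeasures.chi_sq)
print(sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:3])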
Example #24
def sum_category_word_scores(categorized_words, score_fn):
    word_fd = FreqDist()
    category_word_fd = ConditionalFreqDist()

    for category, words in categorized_words:
        for word in words:
            word_fd.inc(word)
            category_word_fd[category].inc(word)

    scores = collections.defaultdict(int)
    n_xx = category_word_fd.N()

    for category in category_word_fd.conditions():
        n_xi = category_word_fd[category].N()

        for word, n_ii in category_word_fd[category].iteritems():
            n_ix = word_fd[word]
            scores[word] += score_fn(n_ii, (n_ix, n_xi), n_xx)

    return scores
def significantWords(untagged_docs, min_chisq=5, ratio=0.75):
    """ 
    Use chisq test of bigram contingency table to measure 
    the association of token with its sentiment

    Parameters
    ----------
    untagged_docs: list of tuples (words, tag)
    min_chisq: lower bound of the chi-square statistic for a word to count as significant
    ratio: pos/neg ratio, used to determine the sentiment of a word

    Returns
    -------
    significant_words: a 3-key-dict of words set

    """ 
    significant_words = collections.defaultdict(set)
    freq_dist = FreqDist()
    label_freq_dist = ConditionalFreqDist()
    stopping_words = set(nltk.corpus.stopwords.words('english'))
    for tokens, label in untagged_docs:
        for token in tokens:
            if token.isalpha() and not (token in stopping_words):
                freq_dist.inc(token)
                label_freq_dist[label].inc(token)
    n_xx = label_freq_dist.N()
    #pdb.set_trace()
    for label in label_freq_dist.conditions():
        for word, n_ii in label_freq_dist[label].iteritems():
            n_xi = label_freq_dist[label].N()
            n_ix = freq_dist[word]
            n_oi = n_xi-n_ii
            n_io = n_ix-n_ii
            n_oo = n_xx-n_oi-n_io-n_ii
            chisq = float(n_xx*(n_ii*n_oo - n_io*n_oi)**2)\
                    /((n_ii+n_io)*(n_ii+n_oi)*(n_oo+n_io)*(n_oo+n_oi))
            if chisq > min_chisq and n_ii>10:
                significant_words['total'] |= set([word])
                if float(n_ii)/n_ix > ratio and (n_ix-n_ii) > 1:
                    significant_words[label] |= set([word])
    return significant_words
def sum_category_word_scores(categorized_words, score_fn):
    # get word freq
    word_fd = FreqDist()
    # get conditional freq Dist
    category_word_fd = ConditionalFreqDist()
    # count words under each category
    for category, words in categorized_words:
        for word in words:
            word_fd.inc(word)
            category_word_fd[category].inc(word)

    scores = collections.defaultdict(int)
    n_xx = category_word_fd.N()

    for category in category_word_fd.conditions():
        n_xi = category_word_fd[category].N()

        for word, n_ii in category_word_fd[category].iteritems():
            n_ix = word_fd[word]
            scores[word] += score_fn(n_ii, (n_ix, n_xi), n_xx)
            # return the scores
    return scores
def high_information_words(labelled_words,
                           score_fn=BigramAssocMeasures.chi_sq,
                           min_score=5):
    """ returns a set of words with the highest information  """
    """
    n_ii : frequency for the word for the label
    n_ix : total freq for the word across all labels
    n_xi : total freq of all words that occured for the label
    n_xx : total freq for all words in all labels

    """

    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for label, words in labelled_words:
        for word in words:
            word_fd[word] += 1
            label_word_fd[label][word] += 1

    n_xx = label_word_fd.N()
    high_info_words = set()

    for label in label_word_fd.conditions():
        n_xi = label_word_fd[label].N()
        word_scores = collections.defaultdict(int)

        for word, n_ii in label_word_fd[label].items():
            n_ix = word_fd[word]
            score = score_fn(n_ii, (n_ix, n_xi), n_xx)
            word_scores[word] = score

        bestwords = [
            word for word, score in word_scores.items() if score >= min_score
        ]
        high_info_words |= set(bestwords)

    return high_info_words
def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    # gathers the most informative words for each label to improve classification
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for label, words in labelled_words:
        for word in words:
            word_fd[word] += 1
            label_word_fd[label][word] += 1
            
    n_xx = label_word_fd.N()
    high_info_words = set()
    
    for label in label_word_fd.conditions():
        n_xi = label_word_fd[label].N()
        word_scores = collections.defaultdict(int)

        for word, n_ii in label_word_fd[label].items():
            n_ix = word_fd[word]
            score = score_fn(n_ii, (n_ix, n_xi), n_xx)
            word_scores[word] = score

        bestwords = [word for word, score in word_scores.items() if score >= min_score]
        high_info_words |= set(bestwords)
    return high_info_words
# In this example, we use a ConditionalFreqDist to examine
# how the distribution of a word's length is affected by the word's
# initial letter.

from nltk.token import *
from nltk.tokenizer import WhitespaceTokenizer
from nltk.probability import ConditionalFreqDist
from nltk.draw.plot import Plot
corpus = Token(TEXT=open('dados/may2001_pdf.torto').read())
WhitespaceTokenizer().tokenize(corpus)
cfdist = ConditionalFreqDist()

# Indexing expression that represents the condition of the distribution
cfdist['a']

# How does initial letter affect word length?
# Store the lengths of the words that begin with the letter 'a'
for token in corpus['SUBTOKENS']:
	if token['TEXT'][0] in ('A','a'):
		print token['TEXT'][0]
		cfdist['a'].inc(len(token['TEXT']))

# Of the words that begin with 'a', how many have 3 characters?
cfdist['a'].freq(3)

# List the existing conditions
cfdist.conditions()
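The snippet above uses the long-gone NLTK 1.x Token API; a rough modern (NLTK 3) sketch of the same idea, assuming the same input file, might look like this.

# Rough NLTK 3 equivalent of the snippet above (same input file assumed):
from nltk.probability import ConditionalFreqDist

words = open('dados/may2001_pdf.torto').read().split()
cfdist = ConditionalFreqDist(('a', len(w)) for w in words if w[:1] in ('A', 'a'))
print(cfdist['a'].freq(3))      # fraction of a-words that have 3 characters
print(cfdist.conditions())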
Example #30
    def __init__(self,
                 n,
                 train,
                 pad_left=True,
                 pad_right=False,
                 estimator=None,
                 *estimator_args,
                 **estimator_kwargs):
        """
        Create an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

            >>> from nltk.corpus import brown
            >>> from nltk.probability import LidstoneProbDist
            >>> est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
            >>> lm = NgramModel(3, brown.words(categories='news'), estimator=est)
            >>> lm
            <NgramModel with 91603 3-grams>
            >>> lm._backoff
            <NgramModel with 62888 2-grams>
            >>> lm.entropy(['The', 'Fulton', 'County', 'Grand', 'Jury', 'said',
            ... 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent',
            ... 'primary', 'election', 'produced', '``', 'no', 'evidence',
            ... "''", 'that', 'any', 'irregularities', 'took', 'place', '.'])
            ... # doctest: +ELLIPSIS
            0.5776...

        :param n: the order of the language model (ngram size)
        :type n: int
        :param train: the training text
        :type train: list(str) or list(list(str))
        :param pad_left: whether to pad the left of each sentence with an (n-1)-gram of empty strings
        :type pad_left: bool
        :param pad_right: whether to pad the right of each sentence with an (n-1)-gram of empty strings
        :type pad_right: bool
        :param estimator: a function for generating a probability distribution
        :type estimator: a function that takes a ConditionalFreqDist and
            returns a ConditionalProbDist
        :param estimator_args: Extra arguments for estimator.
            These arguments are usually used to specify extra
            properties for the probability distributions of individual
            conditions, such as the number of bins they contain.
            Note: For backward-compatibility, if no arguments are specified, the
            number of bins in the underlying ConditionalFreqDist are passed to
            the estimator as an argument.
        :type estimator_args: (any)
        :param estimator_kwargs: Extra keyword arguments for the estimator
        :type estimator_kwargs: (any)
        """

        # protection from cryptic behavior for calling programs
        # that use the pre-2.0.2 interface
        assert (isinstance(pad_left, bool))
        assert (isinstance(pad_right, bool))

        # make sure n is greater than zero, otherwise print it
        assert (n > 0), n

        # For explicitness save the check whether this is a unigram model
        self.is_unigram_model = (n == 1)
        # save the ngram order number
        self._n = n
        # save left and right padding
        self._lpad = ('', ) * (n - 1) if pad_left else ()
        self._rpad = ('', ) * (n - 1) if pad_right else ()

        if estimator is None:
            estimator = _estimator

        cfd = ConditionalFreqDist()

        # set read-only ngrams set (see property declaration below to reconfigure)
        self._ngrams = set()

        # If given a list of strings instead of a list of lists, create enclosing list
        if (train is not None) and isinstance(train[0], compat.string_types):
            train = [train]

        for sent in train:
            raw_ngrams = ngrams(sent, n, pad_left, pad_right)
            for ngram in raw_ngrams:
                self._ngrams.add(ngram)
                context = tuple(ngram[:-1])
                token = ngram[-1]
                cfd[context][token] += 1

        self._probdist = estimator(cfd, *estimator_args, **estimator_kwargs)

        # recursively construct the lower-order models
        if not self.is_unigram_model:
            self._backoff = NgramModel(n - 1, train, pad_left, pad_right,
                                       estimator, *estimator_args,
                                       **estimator_kwargs)

            self._backoff_alphas = dict()
            # For each condition (or context)
            for ctxt in cfd.conditions():
                backoff_ctxt = ctxt[1:]
                backoff_total_pr = 0.0
                total_observed_pr = 0.0

                # this is the subset of words that we OBSERVED following
                # this context.
                # i.e. Count(word | context) > 0
                for word in self._words_following(ctxt, cfd):
                    total_observed_pr += self.prob(word, ctxt)
                    # we also need the total (n-1)-gram probability of
                    # words observed in this n-gram context
                    backoff_total_pr += self._backoff.prob(word, backoff_ctxt)

                assert (0 <= total_observed_pr <= 1), total_observed_pr
                # beta is the remaining probability weight after we factor out
                # the probability of observed words.
                # As a sanity check, both total_observed_pr and backoff_total_pr
                # must be GE 0, since probabilities are never negative
                beta = 1.0 - total_observed_pr

                # backoff total has to be less than one, otherwise we get
                # an error when we try subtracting it from 1 in the denominator
                assert (0 <= backoff_total_pr < 1), backoff_total_pr
                alpha_ctxt = beta / (1.0 - backoff_total_pr)

                self._backoff_alphas[ctxt] = alpha_ctxt
Example #31
class MorphProbModel():
    UNK_PROB = -99

    def __init__(self,
                 beam=1000,
                 max_guess=20,
                 rare_treshold=10,
                 capitalization=True):
        self._uni = FreqDist()
        self._bi = ConditionalFreqDist()
        self._tri = ConditionalFreqDist()

        self._wd = ConditionalFreqDist()

        self._l1 = 0.0
        self._l2 = 0.0
        self._l3 = 0.0

        self._beam_size = beam
        self._use_capitalization = capitalization
        self._max_guess = max_guess
        self._treshold = rare_treshold

        self._unk = Guesser(10)
        self._analyzer = None
        self.cache = {}

    def set_analyzer(self, obj):
        self._analyzer = obj

    def train(self, data):
        C = False
        for sent in data:
            history = [('BOS', False), ('BOS', False)]
            for w, l, t in sent:
                # Took this out because it eats 4 GB of memory if it stays in
                # t = encode((w, l, t))
                if self._use_capitalization and w[0].isupper():
                    C = True

                self._wd[w].inc(t)
                self._uni.inc((t, C))
                self._bi[history[1]].inc((t, C))
                self._tri[tuple(history)].inc((t, C))

                history.append((t, C))
                history.pop(0)

                C = False

        for word, fd in self._wd.iteritems():
            for tag, count in  fd.iteritems():
                if count < self._treshold:
                    self._unk.add_word(word.lower(), tag, count)
        self._unk.finalize()

        self._compute_lambda()

    def _compute_lambda(self):
        tl1 = 0.0
        tl2 = 0.0
        tl3 = 0.0

        for history in self._tri.conditions():
            (h1, h2) = history

            for tag in self._tri[history].samples():

                if self._uni[tag] == 1:
                    continue

                c3 = self._safe_div((self._tri[history][tag] - 1),
                                    (self._tri[history].N() - 1))
                c2 = self._safe_div((self._bi[h2][tag] - 1),
                                    (self._bi[h2].N() - 1))
                c1 = self._safe_div((self._uni[tag] - 1), (self._uni.N() - 1))

                if (c1 > c3) and (c1 > c2):
                    tl1 += self._tri[history][tag]

                elif (c2 > c3) and (c2 > c1):
                    tl2 += self._tri[history][tag]

                elif (c3 > c2) and (c3 > c1):
                    tl3 += self._tri[history][tag]

                elif (c3 == c2) and (c3 > c1):
                    tl2 += float(self._tri[history][tag]) / 2.0
                    tl3 += float(self._tri[history][tag]) / 2.0

                elif (c2 == c1) and (c1 > c3):
                    tl1 += float(self._tri[history][tag]) / 2.0
                    tl2 += float(self._tri[history][tag]) / 2.0

                else:
                    pass

        self._l1 = tl1 / (tl1 + tl2 + tl3)
        self._l2 = tl2 / (tl1 + tl2 + tl3)
        self._l3 = tl3 / (tl1 + tl2 + tl3)

    def _safe_div(self, v1, v2):
        if v2 == 0:
            return -1
        else:
            return float(v1) / float(v2)

    def _transition_prob(self, t, C, history):
        p_uni = self._uni.freq((t, C))
        p_bi = self._bi[history[-1]].freq((t, C))
        p_tri = self._tri[tuple(history[-2:])].freq((t, C))
        p = self._l1 * p_uni + self._l2 * p_bi + self._l3 * p_tri
        if p == 0.0:
            return self.UNK_PROB
        return log(p, 2)

    def _known_lexical_prob(self, word, t, C):
        p = float(self._wd[word][t]) / float(self._uni[(t, C)])
        return log(p, 2)

    def _analyze(self, word):
        tag_candidates = []
        if word in self._wd.conditions():
            tag_candidates = set(self._wd[word].samples())
        else:
            analyses = map(itemgetter(1), self._analyzer.analyze(word))
            guesses = self._unk.get_probs(word.lower())
            guesses = map(itemgetter(0),
                          sorted(guesses.iteritems(), reverse=True,
                     key=itemgetter(1))[:self._max_guess])
            tag_candidates = set(guesses)
            if analyses:
                tag_candidates &= set(analyses)
            if not tag_candidates:
                tag_candidates = set(guesses)
        return tag_candidates

    def _lexical_prob(self, word, t, C):
        if word in self._wd.conditions():
            return self._known_lexical_prob(word, t, C)
        else:
            return self._unk.get_prob(word, t)

    def tag(self, sent, n=5):
        current_state = [(['BOS', 'BOS'], 0.0)]
        out = self._tagword(sent, current_state, n)
        return out

    def _tagword(self, sent, current_states, n=5):
        # With the cache this is fast enough.  Not worth fiddling with it further.
        if sent == []:
            # yield ...
            return [(map(itemgetter(0), tag_seq[0][2:]),
                          tag_seq[1]) for tag_seq in current_states[:n]]

        word = sent[0]
        sent = sent[1:]
        new_states = []

        # Cache lookup
        sent_str = word + str(current_states)
        if sent_str in self.cache:
            return self._tagword(sent, self.cache[sent_str], n)

        C = False
        if self._use_capitalization and word[0].isupper():
            C = True

        analyses = self._analyze(word)

        for (history, curr_sent_logprob) in current_states:
            logprobs = []

            for t in analyses:

                p_t = self._transition_prob(t, C, history)
                p_l = self._lexical_prob(word, t, C)

                p = p_t + p_l

                logprobs.append(((t, C), p))

            for (tag, logprob) in logprobs:
                new_states.append((history + [tag],
                                   curr_sent_logprob + logprob))

        new_states.sort(reverse=True, key=itemgetter(1))

        if len(new_states) > self._beam_size:
            new_states = new_states[:self._beam_size]

        # Cache store
        self.cache[sent_str] = new_states

        # yield new_states
        # self._tagword(sent, new_states, n)
        return self._tagword(sent, new_states, n)
Example #32
        n_word = get_positional_n(n, context)
        if n_word != '' and n > 0:
            condition = str(n) + "_ahead_" + re.sub(r'\_', '', n_word)
            cfd[condition][sense] += 1
        elif n_word != '' and n < 0:
            condition = str(n) + "_behind_" + re.sub(r'\_', '', n_word) 
            cfd[condition][sense] += 1
    return cfd
    
def get_positional_n(n, corpus):
    root_index = corpus.index(target) #position of line
    n_word_index = root_index + n #position of target word
    if len(corpus) > n_word_index and n_word_index >= 0:
        return corpus[n_word_index]
    else:
        return ""

cfd = ConditionalFreqDist()
Window = range(-5,5)
for i in Window:
    if i != 0:
        cfd = add_condition(cfd, training_list, i)    
        

cpd = ConditionalProbDist(cfd,ELEProbDist,10)

for cond in cfd.conditions():
    


        
Example #33
class ngram(object):
    def __init__(self, load_from_disk=True):
        self._corpus = reuters.words()

        self._unigram_fd = FreqDist()
        self._bigram_cfd = ConditionalFreqDist()
        self._trigram_cfd = ConditionalFreqDist()
        self._quadgram_cfd = ConditionalFreqDist()

        self._unigram_pd = None
        self._bigram_cpd = None
        self._trigram_cpd = None
        self._quadgram_cpd = None

        if load_from_disk:
            self._load_models()
        else:
            self._train()

    def _train(self):
        print 'Training models...'
        start_time = time.time()

        prev_word = None
        prev_2_word = None
        prev_3_word = None
        for word in self._corpus:
            if word.isalpha():
                self._unigram_fd[word] += 1
                self._bigram_cfd[prev_word][word] += 1
                self._trigram_cfd[tuple([prev_2_word, prev_word])][word] += 1
                self._quadgram_cfd[tuple([prev_3_word, prev_2_word, prev_word])][word] += 1
                prev_3_word = prev_2_word
                prev_2_word = prev_word
                prev_word = word

        self._unigram_pd = LaplaceProbDist(self._unigram_fd, bins=self._unigram_fd.N())
        self._bigram_cpd = ConditionalProbDist(self._bigram_cfd, LaplaceProbDist, bins=len(self._bigram_cfd.conditions()))
        self._trigram_cpd = ConditionalProbDist(self._trigram_cfd, LaplaceProbDist, bins=len(self._trigram_cfd.conditions()))
        self._quadgram_cpd = ConditionalProbDist(self._quadgram_cfd, LaplaceProbDist, bins=len(self._quadgram_cfd.conditions()))
        
        print 'Models trained, took %s seconds' % (time.time() - start_time)
        
        self._save_models()

    def _save_models(self):
        print 'Saving Models to disk...'
        start_time = time.time()

        pickle.dump(self._unigram_pd, open('./unigram_pd.p', 'w'))
        pickle.dump(self._bigram_cpd , open('./bigram_cpd.p', 'w'))
        pickle.dump(self._trigram_cpd, open('./trigram_cpd.p', 'w'))
        pickle.dump(self._quadgram_cpd, open('./quadgram_cpd.p', 'w'))

        print 'Models saved, took %s seconds' % (time.time() - start_time)

    def _load_models(self):
        
        if not (isfile('./unigram_pd.p') and isfile('./bigram_cpd.p') and isfile('./trigram_cpd.p') and isfile('./quadgram_cpd.p')):
            self._train()
            return

        print 'Loading Models from disk...'
        start_time = time.time()

        self._unigram_pd = pickle.load(open('./unigram_pd.p', 'r'))
        self._bigram_cpd = pickle.load(open('./bigram_cpd.p', 'r'))
        self._trigram_cpd = pickle.load(open('./trigram_cpd.p', 'r'))
        self._quadgram_cpd = pickle.load(open('./quadgram_cpd.p', 'r'))

        print 'Models loaded, took %s seconds' % (time.time() - start_time)

    def next_word(self, context):
        context = word_tokenize(context)
        word = self._quadgram_cpd[tuple(context[-3:])].max()
        return word
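A hypothetical driver for the class above: it trains from scratch on the Reuters corpus, which is slow, and assumes the imports used by the class (reuters, word_tokenize, pickle, the *ProbDist classes) are in scope; the pickle paths are those hard-coded above.

# Hypothetical driver for the ngram class above:
model = ngram(load_from_disk=False)
word = model.next_word('the price of crude')   # predicted next word from the quadgram model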
Example #34
class NgramModel(ModelI):
    """
    A processing interface for assigning a probability to the next word.
    """

    # add cutoff
    def __init__(self,
                 n,
                 docs=(),
                 pad_left=False,
                 pad_right=False,
                 estimator=_estimator,
                 cache_training=None,
                 **estimator_kwargs):
        """
        Create an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training. See model.doctest for more detailed testing

            >>> from nltk.corpus import brown
            >>> lm = NgramModel(3, brown.words(categories='news'))
            >>> lm
            <NgramModel with 91603 3-grams>
            >>> lm._backoff
            <NgramModel with 62888 2-grams>
            >>> lm.entropy(brown.words(categories='humor'))
            ... # doctest: +ELLIPSIS
            12.0399...

        NB: If a ``bins`` parameter is given in the ``estimator_kwargs``
        it will be ignored.  The number of bins to use is the number of
        outcomes (tokens) encountered at each level of the backoff recursion
        and as such, the number must change each time.

        :param n: the order of the language model (ngram size)
        :type n: int
        :param docs: the training text.  This needs to be a list, tuple, generator,
            or an iterable that yields such.
        :type docs: list(str) or list(list(str))
        :param pad_left: whether to pad the left of each sentence with an (n-1)-gram of empty strings
        :type pad_left: bool
        :param pad_right: whether to pad the right of each sentence with an (n-1)-gram of empty strings
        :type pad_right: bool
        :param estimator: a function for generating a probability distribution
        :type estimator: a function that takes a ConditionalFreqDist and
            returns a ConditionalProbDist
        :param estimator_kwargs: Extra keyword arguments for the estimator
        :type estimator_kwargs: (any)
        """
        super(NgramModel, self).__init__()

        # protection from cryptic behavior for calling programs
        # that use the pre-2.0.2 interface
        assert(isinstance(pad_left, bool))
        assert(isinstance(pad_right, bool))


        # Check for bins argument
        if 'bins' in estimator_kwargs:
            warnings.warn('A value was provided for the `bins` parameter of '
                          '`estimator_kwargs`.  This value will be overridden. '
                          'If you think you have a better idea, write your own '
                          'darn model.')
            # Clear out the bins so we don't throw recursive warnings
            estimator_kwargs.pop('bins', None)

        # TODO: I never understood why this used an ngram to do the actual padding
        self._lpad = ('',) * (n - 1) if pad_left else ()
        self._rpad = ('',) * (n - 1) if pad_right else ()
        self._pad_left = pad_left
        self._pad_right = pad_right

        # make sure n is greater than zero, otherwise print it
        assert (n > 0), n
        self._unigram_model = (n == 1)
        self._n = n

        # Declare all other fields
        self._backoff = None
        if not self._unigram_model:
            # FIXME: estimator_kwargs
            self._backoff = NgramModel(n - 1,
                                       [],
                                       pad_left, pad_right,
                                       estimator,
                                       **estimator_kwargs)
        self._backoff_alphas = None
        self._model = None

        # Process training
        self._ngrams = set()
        self.outcomes = set()
        self._cfd = ConditionalFreqDist()


        # ===================
        # Check Docs
        # ===================
        # I think it's important that the model be able to train on a one-use generator
        # so that it can train on corpora that don't fit in RAM. This requires some robust
        # type-checking though.  What's below could use some improvement, but seems to work
        # for now.

        # TODO: test with CorpusView
        # Docs needs to be able to be a list, tuple, generator, or an iterable that yields
        # such (CorpusView?).

        # If given a list of strings instead of a list of lists, create enclosing list
        # NB: The Iterator type won't catch lists, or strings, but it will catch things
        # returned by functions in itertools
        if isinstance(docs, GeneratorType) or isinstance(docs, Iterator):
            nxt = docs.next()
            # Either it's a string or a list of string
            if isinstance(nxt, (basestring, int)):
                docs = [itertools.chain([nxt], docs)]
            elif isinstance(nxt, Sequence):
                # It should be a list of string...
                # FIXME: Handle generator here as well
                if isinstance(nxt[0], (basestring, int)):
                    # So docs is a generator that yields sequences of str
                    docs = itertools.chain([nxt], docs)
            else:
                raise TypeError("Training documents given to NgramModel are a generator "
                                "that yields something other than a string or a list of "
                                "string.  %s" % docs)
        # could also just be a string
        elif isinstance(docs, basestring):
            raise TypeError("Training documents given to NgramModel must be either a list "
                            "of string or a list of lists of string.  Or a generator that "
                            "acts in the same way as one of the above.  A string was found "
                            "instead: %s" % docs)
        elif isinstance(docs, Sequence):
            # It's some kind of iterable with a __getitem__, not a generator
            # If it's empty, assume training will happen later
            if len(docs) == 0:
                pass
            elif isinstance(docs[0], (basestring, int)):
                # Make it into a list of lists
                docs = [docs]
            elif isinstance(docs[0], Sequence):
                # Check inner to make sure it's a string
                if not isinstance(docs[0][0], (basestring, int)):
                    raise TypeError("Training documents given to NgramModel were neither a "
                                    "list of string nor a list of list of string: %s" % docs)
                # If it is a string everything is fine, nothing to worry about
        else:
            raise TypeError("Unsupported type supplied to NgramModel for training documents: %s" %
                            docs)


        # Train the model
        for sent in docs:
            self.train(sent)

        # Build model and set the backoff parameters
        if len(self.outcomes) > 0:
            self._build_model(estimator, estimator_kwargs)



    # ===================
    # TRAINING
    # ===================

    # At every stage, in the backoff/recursion the number of bins for
    # the estimator should be equal to the total number of outcomes
    # (tokens) encountered while training.  This means that it needs
    # to be recalculated at each level of the recursion.
    # NB: For the unigram case, this would be the actual vocabulary size
    def train(self, sent):
        """
        Train this model and the backoff model on the given sentence.

        :param sent: A list of items to train on
        :type sent: list
        :return: None
        """
        # FIXME: This may use extra memory, but because python 2.7 doesn't
        # support deepcopy for generators, I'm not sure what else to do...
        if isinstance(sent, GeneratorType) or isinstance(sent, Iterator):
            s1, s2 = itertools.tee(sent, 2)
            self._train_one(s1)
            if self._backoff is not None:
                self._backoff.train(s2)
        else:
            self._train_one(sent)
            if self._backoff is not None:
                self._backoff.train(sent)


    # FIXME: Discard cfd after training?
    # Should check if the probdist keeps a reference to it
    def _train_one(self, sent):
        """Train the model on a sequence"""

        # print "TRAINING!"

        for ngram in ngrams(sent, self._n,
                            self._pad_left,
                            self._pad_right,
                            left_pad_symbol=self._lpad,
                            right_pad_symbol=self._rpad):
            self._ngrams.add(ngram)
            context = tuple(ngram[:-1])
            token = ngram[-1]
            self._cfd[context][token] += 1
            self.outcomes.add(token)


    # ===================
    # CREATE MODEL
    # ===================
    # NB: Even if the number of bins is explicitly passed, we should use the number
    # of word types encountered during training as the bins value.
    # If right padding is on, this includes the padding symbol.
    #
    # There is a good reason for this!  If the number of bins isn't set the
    # ConditionalProbDist will choose from a different total number of possible
    # outcomes for each condition and the NgramModel won't give probability
    # estimates that sum to 1.
    def _build_model(self, estimator, estimator_kwargs):
        """
        Construct the ``ConditionalProbDist`` used to estimate probabilities.

        This should only be called after the model has been trained. If
        additional training is performed, this should be called again.

        :param estimator: A callable that returns something that extends
            ProbDistI. Used to map the frequency of a condition to its
            probability. The only estimator that currently work are
            ``LidstoneProbDist`` and ``LaplaceProbDist``.
        :param estimator_kwargs: Additional arguments to pass to the estimator.
            If ``bins`` is in here it will be overridden.
        :return: None
        """
        n_outcomes = len(self.outcomes)
        if n_outcomes <= 0:
            raise RuntimeError("NgramModel can't build a model without training input!")

        estimator_kwargs['bins'] = n_outcomes

        # Create the probability model
        self._model = ConditionalProbDist(self._cfd, estimator, **estimator_kwargs)

        # Clear out the bins so we don't throw recursive warnings
        estimator_kwargs.pop('bins', None)

        # Build backoff model and get backoff parameters
        if not self._unigram_model:
            self._backoff._build_model(estimator, estimator_kwargs)
            self._set_backoff_params()


    # ===================
    # SET BACKOFF PARAMS
    # ===================
    def _set_backoff_params(self):
        """
        Sets the alphas for the backoff models used to calculate
        the probability for unseen ngrams.

        :return: None
        """
        # Construct the backoff alpha parameter for each observed context
        if not self._unigram_model:
            self._backoff_alphas = dict()
            # For each condition (or context)
            for ctxt in self._cfd.conditions():
                prdist = self._model[ctxt]  # prob dist for this context

                backoff_ctxt = ctxt[1:]
                backoff_total_pr = 0.0
                total_observed_pr = 0.0
                for word in self._cfd[ctxt]:
                    # This is the subset of words that we OBSERVED
                    # following this context
                    total_observed_pr += prdist.prob(word)
                    # We normalize it by the total (n-1)-gram probability of
                    # words that were observed in this n-gram context
                    backoff_total_pr += self._backoff.prob(word, backoff_ctxt)

                assert (0 < total_observed_pr <= 1), total_observed_pr
                # beta is the remaining probability weight after we factor out
                # the probability of observed words
                beta = 1.0 - total_observed_pr

                # backoff total has to be less than one, otherwise we get
                # ZeroDivision error when we try subtracting it from 1 below
                assert (0 < backoff_total_pr < 1), backoff_total_pr
                alpha_ctxt = beta / (1.0 - backoff_total_pr)

                self._backoff_alphas[ctxt] = alpha_ctxt


    # ==================
    # API Methods
    # ==================

    # This is a new method (not in original nltk model)
    def prob_seq(self, seq):
        """
        Evaluate the probability of a sequence (list of tokens).

        The probability of a sequence is the product of the probabilities
        of all of the ngrams in that sequence.  Because these probabilities
        can be very small, underflow errors are common with long sequences.
        In order to avoid underflows, a common approach is to do all of
        the calculations in (negative) log-space and take advantage of the
        properties of logs to transform the calculation into a sum. That
        is the approach used here.

        :param seq: A list of tokens representing a document/sentence/etc.
        :type seq: list(str)
        :return: The negative log probability of the given sequence
        :rtype: float
        """
        prob = 0.0
        for ngram in ngrams(seq, self._n, self._pad_left, self._pad_right,
                            left_pad_symbol=self._lpad,
                            right_pad_symbol=self._rpad):
            context = tuple(ngram[:-1])
            token = ngram[-1]
            prob += self.logprob(token, context)
        return prob

    def prob(self, word, context=()):
        """
        Evaluate the probability of this word in this context using Katz Backoff.

        :param word: the word to get the probability of
        :type word: str
        :param context: the context the word is in
        :type context: list(str)
        """
        context = tuple(context)
        if (context + (word,) in self._ngrams) or self._unigram_model:
            return self[context].prob(word)
        else:
            return self._alpha(context) * self._backoff.prob(word, context[1:])

    # Updated _alpha function, discarded the _beta function
    def _alpha(self, context):
        """Get the backoff alpha value for the given context
        """
        error_message = "Alphas and backoff are not defined for unigram models"
        assert not self._unigram_model, error_message

        if context in self._backoff_alphas:
            return self._backoff_alphas[context]
        else:
            return 1

    def logprob(self, word, context=()):
        """
        Evaluate the (negative) log probability of this word in this context.

        :param word: the word to get the probability of
        :type word: str
        :param context: the context the word is in
        :type context: list(str)
        """
        return -math.log(self.prob(word, context), 2)

    def choose_random_word(self, context):
        """
        Randomly select a word that is likely to appear in this context.

        :param context: the context the word is in
        :type context: list(str)|tuple(str)
        """
        return self.generate(1, context)[-1]

    # NB, this will always start with same word if the model
    # was trained on a single text
    def generate(self, num_words, context=()):
        """
        Generate random text based on the language model.

        :param num_words: number of words to generate
        :type num_words: int
        :param context: initial words in generated string
        :type context: list(str)|tuple(str)
        """
        text = list(context)
        for i in range(num_words):
            text.append(self._generate_one(text))
        return text

    def _generate_one(self, context):
        context = (self._lpad + tuple(context))[-self._n + 1:]

        if context in self:
            return self[context].generate()
        elif self._n > 1:
            return self._backoff._generate_one(context[1:])
        else:
            return '.'

    def entropy(self, text):
        """
        Calculate the approximate cross-entropy of the n-gram model for a
        given evaluation text.
        This is the average log probability of each word in the text.

        :param text: words to use for evaluation
        :type text: list(str)
        """
        H = 0.0     # entropy is conventionally denoted by "H"
        text = list(self._lpad) + text + list(self._rpad)
        for i in xrange(self._n - 1, len(text)):
            context = tuple(text[(i - self._n + 1):i])
            token = text[i]
            H += self.logprob(token, context)
        return H / float(len(text) - (self._n - 1))

    def perplexity(self, text):
        """
        Calculates the perplexity of the given text.
        This is simply 2 ** cross-entropy for the text.

        :param text: words to calculate perplexity of
        :type text: list(str)
        """
        return math.pow(2.0, self.entropy(text))

    def __contains__(self, item):
        if not isinstance(item, tuple):
            item = (item,)
        return item in self._model

    def __getitem__(self, item):
        if not isinstance(item, tuple):
            item = (item,)
        return self._model[item]

    def __repr__(self):
        return '<NgramModel with %d %d-grams>' % (len(self._ngrams), self._n)
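
# A minimal, self-contained sketch of the backoff-alpha computation performed
# in _set_backoff_params above.  The probabilities below are made up purely
# for illustration: alpha rescales the leftover mass (beta) so it can be
# spread over the backoff distribution of unseen continuations.
def _katz_alpha_sketch(observed_probs, backoff_probs_of_observed):
    """observed_probs: P(w | context) for words seen after a context.
    backoff_probs_of_observed: P(w | shorter context) for those same words."""
    beta = 1.0 - sum(observed_probs)              # mass left for unseen words
    backoff_total = sum(backoff_probs_of_observed)
    return beta / (1.0 - backoff_total)

# Two words observed after some context with smoothed probabilities 0.5 and
# 0.2, whose backoff ((n-1)-gram) probabilities are 0.3 and 0.1:
print(_katz_alpha_sketch([0.5, 0.2], [0.3, 0.1]))  # -> 0.5, up to float rounding
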
class MyNgramModel(NgramModel):
    """
    A processing interface for assigning a probability to the next word.
    """
    
    def __init__(self, n, train, pad_left=True, pad_right=False,estimator=None, *estimator_args, **estimator_kwargs):
        super(MyNgramModel,self).__init__(n,train,pad_left,pad_right,estimator,*estimator_args, **estimator_kwargs)
        assert(isinstance(pad_left, bool))
        assert(isinstance(pad_right, bool))
        
        self._n = n
        self._lpad = ('',) * (n - 1) if pad_left else ()
        self._rpad = ('',) * (n - 1) if pad_right else ()

        if estimator is None:
            estimator = _estimator

        self._cfd = ConditionalFreqDist()
        self._ngrams = set()
        
            
        # If given a list of strings instead of a list of lists, create enclosing list
        if (train is not None) and isinstance(train[0], basestring):
            train = [train]

        for sent in train:
            for ngram in ingrams(chain(self._lpad, sent, self._rpad), n):
                self._ngrams.add(ngram)
                context = tuple(ngram[:-1])
                token = ngram[-1]
                self._cfd[context].inc(token)

        if not estimator_args and not estimator_kwargs:
            self._model = ConditionalProbDist(self._cfd, estimator, len(self._cfd))
        else:
            self._model = ConditionalProbDist(self._cfd, estimator, *estimator_args, **estimator_kwargs)

        # recursively construct the lower-order models
        self._backoff = None
        if n > 1:
            self._backoff = MyNgramModel(n-1, train, pad_left, pad_right, estimator, *estimator_args, **estimator_kwargs)
        
            if self._backoff is not None:
                self._backoff_alphas = dict()
    
            # For each condition (or context)
                for ctxt in self._cfd.conditions():
                    pd = self._model[ctxt] # prob dist for this context
                    backoff_ctxt = ctxt[1:]
                    backoff_total_pr = 0
                    total_observed_pr = 0
                    for word in self._cfd[ctxt].keys(): # this is the subset of words that we OBSERVED                    
                        backoff_total_pr += self._backoff.prob(word,backoff_ctxt) 
                        total_observed_pr += pd.prob(word)        
                    assert total_observed_pr <= 1 and total_observed_pr > 0
                    assert backoff_total_pr <= 1 and backoff_total_pr > 0
                    alpha_ctxt = (1.0-total_observed_pr) / (1.0-backoff_total_pr)
        
                    self._backoff_alphas[ctxt] = alpha_ctxt
                   
    # Updated _alpha function, discarded the _beta function
    def _alpha(self, tokens):
    
        if tokens in self._backoff_alphas:
            return self._backoff_alphas[tokens]
        else:
            return 1
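
# A quick, self-contained illustration of the log-space trick that prob_seq in
# the NgramModel above relies on: multiplying many small probabilities
# underflows to 0.0, while summing negative base-2 logs stays well-behaved.
# The probabilities here are invented for the sake of the example.
import math

_ngram_probs = [0.01] * 400        # 400 ngrams, each with probability 0.01

_product = 1.0
for _p in _ngram_probs:
    _product *= _p                 # underflows to 0.0 in float arithmetic

_neg_logprob = sum(-math.log(p, 2) for p in _ngram_probs)

print(_product)                    # 0.0
print(_neg_logprob)                # ~2657.5, still perfectly representable
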
Exemple #36
0
### get the (token,tag) pair for each tagged sentence
from nltk.corpus import brown
from nltk.probability import FreqDist, ConditionalFreqDist

fd = FreqDist()                      # tag frequencies
cfd = ConditionalFreqDist()          # tag frequencies conditioned on the word
i = 1

for sentence in brown.tagged_sents():
	for (token, tag) in sentence:
		if i < 6:		
			print(token, tag)
		fd.inc(tag)
		cfd[token].inc(tag)
		i += 1

### the most frequent tag:
print fd.max()

wordbins = []
for token in cfd.conditions():
	wordbins.append((cfd[token].B(), token))


### sort tuples by number of unique tags
wordbins.sort(reverse=True)
print wordbins[0:3]


### masculine pronouns
male = ['he', 'his', 'him', 'himself']
female = ['she', 'hers', 'her', 'herself']
n_male, n_female = 0, 0 

for m in male: 
	n_male += cfd[m].N()
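
# Note: the counting loop above uses the pre-3.0 NLTK API (FreqDist.inc and
# ConditionalFreqDist[...].inc were removed in NLTK 3).  A hedged sketch of
# the same tallying with the current API, under fresh names so it does not
# disturb the fd/cfd objects built above:
from nltk.corpus import brown
from nltk.probability import FreqDist, ConditionalFreqDist

fd3 = FreqDist()
cfd3 = ConditionalFreqDist()
for sentence in brown.tagged_sents():
    for (token, tag) in sentence:
        fd3[tag] += 1          # replaces fd.inc(tag)
        cfd3[token][tag] += 1  # replaces cfd[token].inc(tag)

print(fd3.max())                                     # most frequent tag
print(sum(cfd3[m].N() for m in ['he', 'his', 'him', 'himself']))
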
Exemple #37
0
    def __init__(self,
                 n,
                 train,
                 pad_left=True,
                 pad_right=False,
                 estimator=None,
                 **estimator_kwargs):
        """
        Create an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training. See model.doctest for more detailed testing

            >>> from nltk.corpus import brown
            >>> lm = NgramModel(3, brown.words(categories='news'))
            >>> lm
            <NgramModel with 91603 3-grams>
            >>> lm._backoff
            <NgramModel with 62888 2-grams>
            >>> lm.entropy(brown.words(categories='humor'))
            ... # doctest: +ELLIPSIS
            12.0399...

        :param n: the order of the language model (ngram size)
        :type n: int
        :param train: the training text
        :type train: list(str) or list(list(str))
        :param pad_left: whether to pad the left of each sentence with an (n-1)-gram of empty strings
        :type pad_left: bool
        :param pad_right: whether to pad the right of each sentence with an (n-1)-gram of empty strings
        :type pad_right: bool
        :param estimator: a function for generating a probability distribution
        :type estimator: a function that takes a ConditionalFreqDist and
            returns a ConditionalProbDist
        :param estimator_kwargs: Extra keyword arguments for the estimator
        :type estimator_kwargs: (any)
        """

        # protection from cryptic behavior for calling programs
        # that use the pre-2.0.2 interface
        assert (isinstance(pad_left, bool))
        assert (isinstance(pad_right, bool))

        self._lpad = ('', ) * (n - 1) if pad_left else ()
        self._rpad = ('', ) * (n - 1) if pad_right else ()

        # make sure n is greater than zero; the assertion message reports the bad value
        assert (n > 0), n

        # For explicitness save the check whether this is a unigram model
        self.is_unigram_model = (n == 1)
        # save the ngram order number
        self._n = n
        # save left and right padding
        self._lpad = ('', ) * (n - 1) if pad_left else ()
        self._rpad = ('', ) * (n - 1) if pad_right else ()

        if estimator is None:
            estimator = _estimator

        cfd = ConditionalFreqDist()

        # set read-only ngrams set (see property declaration below to reconfigure)
        self._ngrams = set()

        # If given a list of strings instead of a list of lists, create enclosing list
        if (train is not None) and isinstance(train[0], compat.string_types):
            train = [train]

        # we need to keep track of the number of word types we encounter
        vocabulary = set()
        for sent in train:
            raw_ngrams = ngrams(sent, n, pad_left, pad_right, pad_symbol='')
            for ngram in raw_ngrams:
                self._ngrams.add(ngram)
                context = tuple(ngram[:-1])
                token = ngram[-1]
                cfd[context][token] += 1
                vocabulary.add(token)

        # Unless number of bins is explicitly passed, we should use the number
        # of word types encountered during training as the bins value.
        # If right padding is on, this includes the padding symbol.
        if 'bins' not in estimator_kwargs:
            estimator_kwargs['bins'] = len(vocabulary)

        self._model = ConditionalProbDist(cfd, estimator, **estimator_kwargs)

        # recursively construct the lower-order models
        if not self.is_unigram_model:
            self._backoff = NgramModel(n - 1, train, pad_left, pad_right,
                                       estimator, **estimator_kwargs)

            self._backoff_alphas = dict()
            # For each condition (or context)
            for ctxt in cfd.conditions():
                backoff_ctxt = ctxt[1:]
                backoff_total_pr = 0.0
                total_observed_pr = 0.0

                # this is the subset of words that we OBSERVED following
                # this context.
                # i.e. Count(word | context) > 0
                for words in self._words_following(ctxt, cfd):

                    # _words_following (as fixed) now yields whole lists of
                    # words, so iterate over each list
                    for word in words:

                        total_observed_pr += self.prob(word, ctxt)
                        # we also need the total (n-1)-gram probability of
                        # words observed in this n-gram context
                        backoff_total_pr += self._backoff.prob(
                            word, backoff_ctxt)

                assert (0 <= total_observed_pr <= 1), total_observed_pr
                # beta is the remaining probability weight after we factor out
                # the probability of observed words.
                # As a sanity check, both total_observed_pr and backoff_total_pr
                # must be GE 0, since probabilities are never negative
                beta = 1.0 - total_observed_pr

                # backoff total has to be less than one, otherwise we get
                # an error when we try subtracting it from 1 in the denominator
                assert (0 <= backoff_total_pr < 1), backoff_total_pr
                alpha_ctxt = beta / (1.0 - backoff_total_pr)

                self._backoff_alphas[ctxt] = alpha_ctxt
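
# Why the `bins` value matters (a tiny, hedged sketch, separate from the model
# above): with Lidstone smoothing, each conditional distribution only sums to
# one over the number of outcomes it assumes.  Passing the training vocabulary
# size as `bins` for every context keeps the distributions consistent.
from nltk.probability import (ConditionalFreqDist, ConditionalProbDist,
                              LidstoneProbDist)

_cfd = ConditionalFreqDist()
for _context, _word in [(('the',), 'cat'), (('the',), 'dog'), (('a',), 'cat')]:
    _cfd[_context][_word] += 1

_vocab = set(w for ctx in _cfd.conditions() for w in _cfd[ctx])  # {'cat', 'dog'}
_cpd = ConditionalProbDist(_cfd, LidstoneProbDist, 0.01, bins=len(_vocab))

# Each conditional distribution now sums to 1.0 (up to float rounding):
print(sum(_cpd[('the',)].prob(w) for w in _vocab))
print(sum(_cpd[('a',)].prob(w) for w in _vocab))
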
Exemple #38
0
    def __init__(self, n, train, k=5, v=None,
                 liveDangerously=False, quiet=False):
        """
        Creates a Katz-thresholded Ngram language model to capture
        patterns in n consecutive words of training text.
        Uses the KGoodTuringProbDist to estimate the conditional and unigram probabilities,
        to provide coverage of Ngrams not seen during training.

        @param n: the order of the language model (ngram size)
        @type n: C{int}
        @param train: the training text
        @type train: C{list} of C{string}
        @param k: The threshold above which counts are assumed
                  to be reliable.  Defaults to 5.
        @type  k: C{Int}
        @param v: The number of unseens of degree 1.  Defaults to the
                  number of types in the training set
        @type  v: C{Int}
        @param liveDangerously: If False, for each model check that
                                the total probability mass after all
                                adjustments is close to 1.  Defaults
                                to False.
        @type  liveDangerously: C{Boolean}
        @param quiet: Various information will be printed during model
                       construction unless this is True.  Defaults to False.
        @type  quiet: C{Boolean}
        """
        self._n = n
        self._N = 1 + len(train) - n
        fd = FreqDist(train)
        if v is None:
            v = fd.B()
        print(('v', v))
        if n == 1:
            # Treat this case specially
            self._model = KGoodTuringProbDist(fd, k, v, liveDangerously, ())
            if not quiet:
                print("%s entries for %s tokens at degree 1, %s" % (len(fd),
                                                                    fd.N(),
                                                                    self._model.status))
        else:
            def estimator(fdist, ctxt): return KGoodTuringProbDist(fdist, k, v,
                                                                   liveDangerously,
                                                                   ctxt)

            cfd = ConditionalFreqDist()

            for ngram in ingrams(train, n):
                # self._ngrams.add(ngram)
                context = tuple(ngram[:-1])
                token = ngram[-1]
                cfd[context].inc(token)

            self._model = ConditionalProbDist(cfd, estimator, True)
            if not quiet:
                statuses = {'normal': 0, 'bigSkewed': 0,
                            'weak': 0, LowHacked: 0}
                for ctx in cfd.conditions():
                    statuses[self[ctx].status] += 1
                print("%s conditions at degree %s" %
                      (len(cfd.conditions()), n))
                for s in list(statuses.keys()):
                    print(" %s %6d" % (s, statuses[s]))

            # recursively construct the lower-order models
            self._backoff = KBNgramModel(n - 1, train, k, v, liveDangerously)
    def __init__(self, n, train, pad_left=True, pad_right=False,
                 estimator=None, **estimator_kwargs):
        """
        Create an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training. See model.doctest for more detailed testing

            >>> from nltk.corpus import brown
            >>> lm = NgramModel(3, brown.words(categories='news'))
            >>> lm
            <NgramModel with 91603 3-grams>
            >>> lm._backoff
            <NgramModel with 62888 2-grams>
            >>> lm.entropy(brown.words(categories='humor'))
            ... # doctest: +ELLIPSIS
            12.0399...

        :param n: the order of the language model (ngram size)
        :type n: int
        :param train: the training text
        :type train: list(str) or list(list(str))
        :param pad_left: whether to pad the left of each sentence with an (n-1)-gram of empty strings
        :type pad_left: bool
        :param pad_right: whether to pad the right of each sentence with an (n-1)-gram of empty strings
        :type pad_right: bool
        :param estimator: a function for generating a probability distribution
        :type estimator: a function that takes a ConditionalFreqDist and
            returns a ConditionalProbDist
        :param estimator_kwargs: Extra keyword arguments for the estimator
        :type estimator_kwargs: (any)
        """

        # protection from cryptic behavior for calling programs
        # that use the pre-2.0.2 interface
        assert(isinstance(pad_left, bool))
        assert(isinstance(pad_right, bool))

        self._lpad = ('',) * (n - 1) if pad_left else ()
        self._rpad = ('',) * (n - 1) if pad_right else ()

        # make sure n is greater than zero; the assertion message reports the bad value
        assert (n > 0), n
        self._unigram_model = (n == 1)
        self._n = n

        if estimator is None:
            estimator = _estimator

        cfd = ConditionalFreqDist()

        self._ngrams = set()

        # If given a list of strings instead of a list of lists, create enclosing list
        if (train is not None) and isinstance(train[0], compat.string_types):
            train = [train]

        # we need to keep track of the number of word types we encounter
        vocabulary = set()
        for sent in train:
            for ngram in ngrams(sent, n, pad_left, pad_right, pad_symbol=''):
                self._ngrams.add(ngram)
                context = tuple(ngram[:-1])
                token = ngram[-1]
                cfd[context][token] += 1
                vocabulary.add(token)

        # Unless number of bins is explicitly passed, we should use the number
        # of word types encountered during training as the bins value.
        # If right padding is on, this includes the padding symbol.
        if 'bins' not in estimator_kwargs:
            estimator_kwargs['bins'] = len(vocabulary)

        self._model = ConditionalProbDist(cfd, estimator, **estimator_kwargs)

        # recursively construct the lower-order models
        if not self._unigram_model:
            self._backoff = NgramModel(n-1, train,
                                        pad_left, pad_right,
                                        estimator,
                                        **estimator_kwargs)

            self._backoff_alphas = dict()
            # For each condition (or context)
            for ctxt in cfd.conditions():
                prdist = self._model[ctxt] # prob dist for this context

                backoff_ctxt = ctxt[1:]
                backoff_total_pr = 0.0
                total_observed_pr = 0.0
                for word in cfd[ctxt]:
                    # this is the subset of words that we OBSERVED
                    # following this context
                    total_observed_pr += prdist.prob(word)
                    # we normalize it by the total (n-1)-gram probability of
                    # words that were observed in this n-gram context
                    backoff_total_pr += self._backoff.prob(word, backoff_ctxt)

                assert (0.0 < total_observed_pr < 1.1), total_observed_pr
                # beta is the remaining probability weight after we factor out
                # the probability of observed words
                beta = 1.0 - total_observed_pr

                # backoff total has to be less than one, otherwise we get
                # ZeroDivision error when we try subtracting it from 1 below
                if backoff_total_pr >= 1.0:
                  backoff_total_pr = 0.99
                assert (0 < backoff_total_pr < 1.0), backoff_total_pr
                alpha_ctxt = beta / (1.0 - backoff_total_pr)

                self._backoff_alphas[ctxt] = alpha_ctxt
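
# KGoodTuringProbDist, used near the top of this example, is a project-specific
# estimator and is not shipped with NLTK.  As a rough, hedged stand-in, NLTK's
# SimpleGoodTuringProbDist also discounts low counts and reserves probability
# mass for unseen events:
from nltk.corpus import brown
from nltk.probability import FreqDist, SimpleGoodTuringProbDist

_fd = FreqDist(w.lower() for w in brown.words(categories='news'))
_sgt = SimpleGoodTuringProbDist(_fd)

print(_sgt.prob('the'))        # a frequent word keeps most of its mass
print(_sgt.prob('zyzzyva'))    # an unseen word still receives a little mass
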
Exemple #40
0
class TnT(TaggerI):
    '''
    TnT - Statistical POS tagger

    IMPORTANT NOTES:

    * DOES NOT AUTOMATICALLY DEAL WITH UNSEEN WORDS

      - It is possible to provide an untrained POS tagger to
        create tags for unknown words, see __init__ function

    * SHOULD BE USED WITH SENTENCE-DELIMITED INPUT

      - Due to the nature of this tagger, it works best when
        trained over sentence delimited input.
      - However it still produces good results if the training
        data and testing data are separated on all punctuation eg: [,.?!]
      - Input for training is expected to be a list of sentences
        where each sentence is a list of (word, tag) tuples
      - Input for tag function is a single sentence
        Input for tagdata function is a list of sentences
        Output is of a similar form

    * Function provided to process text that is unsegmented

      - Please see basic_sent_chop()


    TnT uses a second order Markov model to produce tags for
    a sequence of input, specifically:

      argmax [Proj(P(t_i|t_i-1,t_i-2)P(w_i|t_i))] P(t_T+1 | t_T)

    IE: the maximum projection of a set of probabilities

    The set of possible tags for a given word is derived
    from the training data. It is the set of all tags
    that exact word has been assigned.

    To speed up and improve precision, we can use log addition
    instead of multiplication, specifically:

      argmax [Sigma(log(P(t_i|t_i-1,t_i-2))+log(P(w_i|t_i)))] +
             log(P(t_T+1|t_T))

    The probability of a tag for a given word is the linear
    interpolation of 3 markov models; a zero-order, first-order,
    and a second order model.

      P(t_i| t_i-1, t_i-2) = l1*P(t_i) + l2*P(t_i| t_i-1) +
                             l3*P(t_i| t_i-1, t_i-2)

    A beam search is used to limit the memory usage of the algorithm.
    The degree of the beam can be changed using N in the initialization.
    N represents the maximum number of possible solutions to maintain
    while tagging.

    It is possible to differentiate the tags which are assigned to
    capitalized words. However this does not result in a significant
    gain in the accuracy of the results.
    '''

    def __init__(self, unk=None, Trained=False, N=1000, C=False):
        '''
        Construct a TnT statistical tagger. Tagger must be trained
        before being used to tag input.

        :param unk: instance of a POS tagger, conforms to TaggerI
        :type  unk:(TaggerI)
        :param Trained: Indication that the POS tagger is trained or not
        :type  Trained: boolean
        :param N: Beam search degree (see above)
        :type  N:(int)
        :param C: Capitalization flag
        :type  C: boolean

        Initializer, creates frequency distributions to be used
        for tagging

        _lx values represent the portion of the tri/bi/uni taggers
        to be used to calculate the probability

        N value is the number of possible solutions to maintain
        while tagging. A good value for this is 1000

        C is a boolean value which specifies to use or
        not use the Capitalization of the word as additional
        information for tagging.
        NOTE: using capitalization may not increase the accuracy
        of the tagger
        '''

        self._uni  = FreqDist()
        self._bi   = ConditionalFreqDist()
        self._tri  = ConditionalFreqDist()
        self._wd   = ConditionalFreqDist()
        self._eos  = ConditionalFreqDist()
        self._l1   = 0.0
        self._l2   = 0.0
        self._l3   = 0.0
        self._N    = N
        self._C    = C
        self._T    = Trained

        self._unk = unk

        # statistical tools (ignore or delete me)
        self.unknown = 0
        self.known = 0

    def train(self, data):
        '''
        Uses a set of tagged data to train the tagger.
        If an unknown word tagger is specified,
        it is trained on the same data.

        :param data: List of lists of (word, tag) tuples
        :type data: list(list(tuple(str, str)))
        '''

        # Ensure that local C flag is initialized before use
        C = False

        if self._unk is not None and self._T == False:
            self._unk.train(data)

        for sent in data:
            history = [('BOS',False), ('BOS',False)]
            for w, t in sent:

                # if capitalization is requested,
                # and the word begins with a capital
                # set local flag C to True
                if self._C and w[0].isupper(): C=True

                self._wd[w].inc(t)
                self._uni.inc((t,C))
                self._bi[history[1]].inc((t,C))
                self._tri[tuple(history)].inc((t,C))

                history.append((t,C))
                history.pop(0)

                # set local flag C to false for the next word
                C = False

            self._eos[t].inc('EOS')


        # compute lambda values from the trained frequency distributions
        self._compute_lambda()

        #(debugging -- ignore or delete me)
        #print "lambdas"
        #print i, self._l1, i, self._l2, i, self._l3


    def _compute_lambda(self):
        '''
        creates lambda values based upon training data

        NOTE: no need to explicitly reference C,
        it is contained within the tag variable :: tag == (tag,C)

        for each tag trigram (t1, t2, t3)
        depending on the maximum value of
        - f(t1,t2,t3)-1 / f(t1,t2)-1
        - f(t2,t3)-1 / f(t2)-1
        - f(t3)-1 / N-1

        increment l3,l2, or l1 by f(t1,t2,t3)

        ISSUES -- Resolutions:
        if 2 values are equal, increment both lambda values
        by (f(t1,t2,t3) / 2)
        '''

        # temporary lambda variables
        tl1 = 0.0
        tl2 = 0.0
        tl3 = 0.0

        # for each t1,t2 in system
        for history in self._tri.conditions():
            (h1, h2) = history

            # for each t3 given t1,t2 in system
            # (NOTE: tag actually represents (tag,C))
            # However no effect within this function
            for tag in self._tri[history].samples():

                # if there has only been 1 occurrence of this tag in the data
                # then ignore this trigram.
                if self._uni[tag] == 1:
                    continue

                # safe_div provides a safe floating point division
                # it returns -1 if the denominator is 0
                c3 = self._safe_div((self._tri[history][tag]-1), (self._tri[history].N()-1))
                c2 = self._safe_div((self._bi[h2][tag]-1), (self._bi[h2].N()-1))
                c1 = self._safe_div((self._uni[tag]-1), (self._uni.N()-1))


                # if c1 is the maximum value:
                if (c1 > c3) and (c1 > c2):
                    tl1 += self._tri[history][tag]

                # if c2 is the maximum value
                elif (c2 > c3) and (c2 > c1):
                    tl2 += self._tri[history][tag]

                # if c3 is the maximum value
                elif (c3 > c2) and (c3 > c1):
                    tl3 += self._tri[history][tag]

                # if c3, and c2 are equal and larger than c1
                elif (c3 == c2) and (c3 > c1):
                    tl2 += float(self._tri[history][tag]) /2.0
                    tl3 += float(self._tri[history][tag]) /2.0

                # if c1, and c2 are equal and larger than c3
                # this might be a dumb thing to do....(not sure yet)
                elif (c2 == c1) and (c1 > c3):
                    tl1 += float(self._tri[history][tag]) /2.0
                    tl2 += float(self._tri[history][tag]) /2.0

                # otherwise there might be a problem
                # eg: all values = 0
                else:
                    #print "Problem", c1, c2 ,c3
                    pass

        # Lambda normalisation:
        # ensures that l1+l2+l3 = 1
        self._l1 = tl1 / (tl1+tl2+tl3)
        self._l2 = tl2 / (tl1+tl2+tl3)
        self._l3 = tl3 / (tl1+tl2+tl3)



    def _safe_div(self, v1, v2):
        '''
        Safe floating point division function, does not allow division by 0
        returns -1 if the denominator is 0
        '''
        if v2 == 0:
            return -1
        else:
            return float(v1) / float(v2)

    def tagdata(self, data):
        '''
        Tags each sentence in a list of sentences

        :param data:list of list of words
        :type data: [[string,],]
        :return: list of list of (word, tag) tuples

        Invokes tag(sent) function for each sentence
        compiles the results into a list of tagged sentences
        each tagged sentence is a list of (word, tag) tuples
        '''
        res = []
        for sent in data:
            res1 = self.tag(sent)
            res.append(res1)
        return res


    def tag(self, data):
        '''
        Tags a single sentence

        :param data: list of words
        :type data: [string,]

        :return: [(word, tag),]

        Calls recursive function '_tagword'
        to produce a list of tags

        Associates the sequence of returned tags
        with the correct words in the input sequence

        returns a list of (word, tag) tuples
        '''

        current_state = [(['BOS', 'BOS'], 0.0)]

        sent = list(data)

        tags = self._tagword(sent, current_state)

        res = []
        for i in range(len(sent)):
            # unpack and discard the C flags
            (t,C) = tags[i+2]
            res.append((sent[i], t))

        return res


    def _tagword(self, sent, current_states):
        '''
        :param sent : List of words remaining in the sentence
        :type sent  : [word,]
        :param current_states : List of possible tag combinations for
                                the sentence so far, and the log probability
                                associated with each tag combination
        :type current_states  : [([tag, ], logprob), ]

        Tags the first word in the sentence and
        recursively tags the remainder of the sentence

        Uses formula specified above to calculate the probability
        of a particular tag
        '''

        # if this word marks the end of the sentence,
        # return the most probable tag
        if sent == []:
            (h, logp) = current_states[0]
            return h

        # otherwise there are more words to be tagged
        word = sent[0]
        sent = sent[1:]
        new_states = []

        # if the Capitalisation is requested,
        # initialise the flag for this word
        C = False
        if self._C and word[0].isupper(): C=True

        # if word is known
        # compute the set of possible tags
        # and their associated log probabilities
        if word in self._wd.conditions():
            self.known += 1

            for (history, curr_sent_logprob) in current_states:
                logprobs = []

                for t in self._wd[word].samples():
                    p_uni = self._uni.freq((t,C))
                    p_bi = self._bi[history[-1]].freq((t,C))
                    p_tri = self._tri[tuple(history[-2:])].freq((t,C))
                    p_wd = float(self._wd[word][t])/float(self._uni[(t,C)])
                    p = self._l1 *p_uni + self._l2 *p_bi + self._l3 *p_tri
                    p2 = log(p, 2) + log(p_wd, 2)

                    logprobs.append(((t,C), p2))


                # compute the result of appending each tag to this history
                for (tag, logprob) in logprobs:
                    new_states.append((history + [tag],
                                       curr_sent_logprob + logprob))




        # otherwise a new word, set of possible tags is unknown
        else:
            self.unknown += 1

            # since most classifiers cannot return the set of possible tags
            # along with a probability for each specific tag,
            # treat any unknown word as tagged with certainty
            p = 1

            # if no unknown word tagger has been specified
            # then use the tag 'Unk'
            if self._unk is None:
                tag = ('Unk',C)

            # otherwise apply the unknown word tagger
            else :
                [(_w, t)] = list(self._unk.tag([word]))
                tag = (t,C)

            for (history, logprob) in current_states:
                history.append(tag)

            new_states = current_states



        # now have computed a set of possible new_states

        # sort states by log prob
        # set is now ordered greatest to least log probability
        new_states.sort(reverse=True, key=itemgetter(1))

        # del everything after N (threshold)
        # this is the beam search cut
        if len(new_states) > self._N:
            new_states = new_states[:self._N]


        # compute the tags for the rest of the sentence
        # return the best list of tags for the sentence
        return self._tagword(sent, new_states)
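
# A short usage sketch for a TnT tagger like the class above, using the
# version shipped with NLTK (nltk.tag.tnt.TnT) and a small slice of the Brown
# corpus.  Without an `unk` tagger, words unseen in training come back as 'Unk'.
from nltk.corpus import brown
from nltk.tag import tnt

_train_sents = list(brown.tagged_sents(categories='news'))[:500]
_tnt_tagger = tnt.TnT(N=1000)
_tnt_tagger.train(_train_sents)

print(_tnt_tagger.tag(['The', 'jury', 'said', 'it', 'did', '.']))
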
Exemple #41
0
    def __init__(self, n, train, pad_left=True, pad_right=False,
                 estimator=None, **estimator_kwargs):
        """
        Create an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training. See model.doctest for more detailed testing
            >>> from nltk.corpus import brown
            >>> lm = NgramModel(3, brown.words(categories='news'))
            >>> lm
            <NgramModel with 91603 3-grams>
            >>> lm._backoff
            <NgramModel with 62888 2-grams>
            >>> lm.entropy(brown.words(categories='humor'))
            ... # doctest: +ELLIPSIS
            12.0399...
        :param n: the order of the language model (ngram size)
        :type n: int
        :param train: the training text
        :type train: list(str) or list(list(str))
        :param pad_left: whether to pad the left of each sentence with an (n-1)-gram of empty strings
        :type pad_left: bool
        :param pad_right: whether to pad the right of each sentence with an (n-1)-gram of empty strings
        :type pad_right: bool
        :param estimator: a function for generating a probability distribution
        :type estimator: a function that takes a ConditionalFreqDist and
            returns a ConditionalProbDist
        :param estimator_kwargs: Extra keyword arguments for the estimator
        :type estimator_kwargs: (any)
        """

        # protection from cryptic behavior for calling programs
        # that use the pre-2.0.2 interface
        assert(isinstance(pad_left, bool))
        assert(isinstance(pad_right, bool))

        self._lpad = ('',) * (n - 1) if pad_left else ()
        self._rpad = ('',) * (n - 1) if pad_right else ()

        # make sure n is greater than zero; the assertion message reports the bad value
        assert (n > 0), n

        # For explicitness save the check whether this is a unigram model
        self.is_unigram_model = (n == 1)
        # save the ngram order number
        self._n = n
        # save left and right padding
        self._lpad = ('',) * (n - 1) if pad_left else ()
        self._rpad = ('',) * (n - 1) if pad_right else ()

        if estimator is None:
            estimator = _estimator

        cfd = ConditionalFreqDist()

        # set read-only ngrams set (see property declaration below to reconfigure)
        self._ngrams = set()
        '''
        # If given a list of strings instead of a list of lists, create enclosing list
        if (train is not None) and isinstance(train[0], compat.string_types):
            train = [train]
        '''
        # we need to keep track of the number of word types we encounter
        vocabulary = set()
        count = 0
        #for review in train:
        for review in read_reviews(train):

            count += 1
            if count % 10000 == 0:
                print str(count) + ' reviews processed'

            #for testing with small training set
            #if count > 100000:
            #    break

            #newly added, each element is dict of each review
            review_text = review['text']

            #separate into tokens, lowercase
            tokens = word_tokenize(review_text)
            tokens = [w.lower() for w in tokens]

            #updated for new nltk api
            raw_ngrams = ngrams(tokens, n, pad_left, pad_right, left_pad_symbol='', right_pad_symbol='...EOR...')
            for ngram in raw_ngrams:
                self._ngrams.add(ngram)
                context = tuple(ngram[:-1])
                token = ngram[-1]
                cfd[context][token] += 1
                vocabulary.add(token)

        # Unless number of bins is explicitly passed, we should use the number
        # of word types encountered during training as the bins value.
        # If right padding is on, this includes the padding symbol.
        
        if 'bins' not in estimator_kwargs:
            estimator_kwargs['bins'] = len(vocabulary) * 2

        self._model = ConditionalProbDist(cfd, estimator, **estimator_kwargs)
        self._probdist = self._model
        
        
        # recursively construct the lower-order models
        if not self.is_unigram_model:
            self._backoff = NgramModel(n-1, train,
                                        pad_left, pad_right,
                                        estimator,
                                        **estimator_kwargs)

            self._backoff_alphas = dict()
            # For each condition (or context)
            for ctxt in cfd.conditions():
                backoff_ctxt = ctxt[1:]
                backoff_total_pr = 0.0
                total_observed_pr = 0.0

                # this is the subset of words that we OBSERVED following
                # this context.
                # i.e. Count(word | context) > 0
                for word in self._words_following(ctxt, cfd):
                    total_observed_pr += self.prob(word, ctxt)
                    # we also need the total (n-1)-gram probability of
                    # words observed in this n-gram context
                    backoff_total_pr += self._backoff.prob(word, backoff_ctxt)

                assert (0 <= total_observed_pr <= 1), total_observed_pr
                # beta is the remaining probability weight after we factor out
                # the probability of observed words.
                # As a sanity check, both total_observed_pr and backoff_total_pr
                # must be GE 0, since probabilities are never negative
                beta = 1.0 - total_observed_pr

                # backoff total has to be less than one, otherwise we get
                # an error when we try subtracting it from 1 in the denominator
                assert (0 <= backoff_total_pr < 1), backoff_total_pr
                alpha_ctxt = beta / (1.0 - backoff_total_pr)

                self._backoff_alphas[ctxt] = alpha_ctxt
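
# A standalone illustration of the padded-ngram extraction in the training
# loop above (NLTK 3 API).  The review text is a made-up example string, and
# '...EOR...' is simply the end-of-review pad symbol chosen by the snippet.
from nltk import word_tokenize
from nltk.util import ngrams

_review_text = "Great food and friendly staff."
_tokens = [w.lower() for w in word_tokenize(_review_text)]

for _trigram in ngrams(_tokens, 3, pad_left=True, pad_right=True,
                       left_pad_symbol='', right_pad_symbol='...EOR...'):
    _context, _token = tuple(_trigram[:-1]), _trigram[-1]
    print(_context, '->', _token)
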
from nltk.corpus import movie_reviews
from nltk.probability import FreqDist
from nltk.probability import ConditionalFreqDist
word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()

testNegWords = movie_reviews.words(categories=['neg'])
testPosWords = movie_reviews.words(categories=['pos'])

for word in testNegWords:
    word_fd[word.lower()]+=1
    label_word_fd['neg'][word.lower()]+=1
for word in testPosWords:
    word_fd[word.lower()]+=1
    label_word_fd['pos'][word.lower()]+=1
print(word_fd.N(),word_fd.B(),word_fd.most_common(20))
print(label_word_fd.N(),label_word_fd.conditions(),label_word_fd.items())
print(label_word_fd['pos'].N(),label_word_fd['neg'].N())


# In[ ]:

# n_ii = label_word_fd[label][word]
# n_ix = word_fd[word]
# n_xi = label_word_fd[label].N()
# n_xx = label_word_fd.N()
#         w1    ~w1
#      ------ ------
#  w2 | n_ii | n_oi | = n_xi
#      ------ ------
# ~w2 | n_io | n_oo |
#     ------ ------
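
# Using the contingency counts sketched above to score one word/label pair with
# the chi-square measure (a hedged sketch; word_fd and label_word_fd are the
# distributions built from movie_reviews a few lines earlier):
from nltk.metrics import BigramAssocMeasures

_word, _label = 'bad', 'neg'
n_ii = label_word_fd[_label][_word]   # count of this word under this label
n_ix = word_fd[_word]                 # count of this word across all labels
n_xi = label_word_fd[_label].N()      # total word count under this label
n_xx = label_word_fd.N()              # total word count overall

print(_word, _label, BigramAssocMeasures.chi_sq(n_ii, (n_ix, n_xi), n_xx))
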
Exemple #43
0
class TnT(TaggerI):
    """
    TnT - Statistical POS tagger

    IMPORTANT NOTES:

    * DOES NOT AUTOMATICALLY DEAL WITH UNSEEN WORDS

      - It is possible to provide an untrained POS tagger to
        create tags for unknown words, see __init__ function

    * SHOULD BE USED WITH SENTENCE-DELIMITED INPUT

      - Due to the nature of this tagger, it works best when
        trained over sentence delimited input.
      - However it still produces good results if the training
        data and testing data are separated on all punctuation eg: [,.?!]
      - Input for training is expected to be a list of sentences
        where each sentence is a list of (word, tag) tuples
      - Input for tag function is a single sentence
        Input for tagdata function is a list of sentences
        Output is of a similar form

    * Function provided to process text that is unsegmented

      - Please see basic_sent_chop()


    TnT uses a second order Markov model to produce tags for
    a sequence of input, specifically:

      argmax [Proj(P(t_i|t_i-1,t_i-2)P(w_i|t_i))] P(t_T+1 | t_T)

    IE: the maximum projection of a set of probabilities

    The set of possible tags for a given word is derived
    from the training data. It is the set of all tags
    that exact word has been assigned.

    To speed up and improve precision, we can use log addition
    instead of multiplication, specifically:

      argmax [Sigma(log(P(t_i|t_i-1,t_i-2))+log(P(w_i|t_i)))] +
             log(P(t_T+1|t_T))

    The probability of a tag for a given word is the linear
    interpolation of 3 markov models; a zero-order, first-order,
    and a second order model.

      P(t_i| t_i-1, t_i-2) = l1*P(t_i) + l2*P(t_i| t_i-1) +
                             l3*P(t_i| t_i-1, t_i-2)

    A beam search is used to limit the memory usage of the algorithm.
    The degree of the beam can be changed using N in the initialization.
    N represents the maximum number of possible solutions to maintain
    while tagging.

    It is possible to differentiate the tags which are assigned to
    capitalized words. However this does not result in a significant
    gain in the accuracy of the results.
    """
    def __init__(self, unk=None, Trained=False, N=1000, C=False):
        """
        Construct a TnT statistical tagger. Tagger must be trained
        before being used to tag input.

        :param unk: instance of a POS tagger, conforms to TaggerI
        :type  unk: TaggerI
        :param Trained: Indication that the POS tagger is trained or not
        :type  Trained: bool
        :param N: Beam search degree (see above)
        :type  N: int
        :param C: Capitalization flag
        :type  C: bool

        Initializer, creates frequency distributions to be used
        for tagging

        _lx values represent the portion of the tri/bi/uni taggers
        to be used to calculate the probability

        N value is the number of possible solutions to maintain
        while tagging. A good value for this is 1000

        C is a boolean value which specifies to use or
        not use the Capitalization of the word as additional
        information for tagging.
        NOTE: using capitalization may not increase the accuracy
        of the tagger
        """

        self._uni = FreqDist()
        self._bi = ConditionalFreqDist()
        self._tri = ConditionalFreqDist()
        self._wd = ConditionalFreqDist()
        self._eos = ConditionalFreqDist()
        self._l1 = 0.0
        self._l2 = 0.0
        self._l3 = 0.0
        self._N = N
        self._C = C
        self._T = Trained

        self._unk = unk

        # statistical tools (ignore or delete me)
        self.unknown = 0
        self.known = 0

    def train(self, data):
        """
        Uses a set of tagged data to train the tagger.
        If an unknown word tagger is specified,
        it is trained on the same data.

        :param data: List of lists of (word, tag) tuples
        :type data: list(list(tuple(str, str)))
        """

        # Ensure that local C flag is initialized before use
        C = False

        if self._unk is not None and self._T == False:
            self._unk.train(data)

        for sent in data:
            history = [("BOS", False), ("BOS", False)]
            for w, t in sent:

                # if capitalization is requested,
                # and the word begins with a capital
                # set local flag C to True
                if self._C and w[0].isupper():
                    C = True

                self._wd[w][t] += 1
                self._uni[(t, C)] += 1
                self._bi[history[1]][(t, C)] += 1
                self._tri[tuple(history)][(t, C)] += 1

                history.append((t, C))
                history.pop(0)

                # set local flag C to false for the next word
                C = False

            self._eos[t]["EOS"] += 1

        # compute lambda values from the trained frequency distributions
        self._compute_lambda()

    def _compute_lambda(self):
        """
        creates lambda values based upon training data

        NOTE: no need to explicitly reference C,
        it is contained within the tag variable :: tag == (tag,C)

        for each tag trigram (t1, t2, t3)
        depending on the maximum value of
        - f(t1,t2,t3)-1 / f(t1,t2)-1
        - f(t2,t3)-1 / f(t2)-1
        - f(t3)-1 / N-1

        increment l3,l2, or l1 by f(t1,t2,t3)

        ISSUES -- Resolutions:
        if 2 values are equal, increment both lambda values
        by (f(t1,t2,t3) / 2)
        """

        # temporary lambda variables
        tl1 = 0.0
        tl2 = 0.0
        tl3 = 0.0

        # for each t1,t2 in system
        for history in self._tri.conditions():
            (h1, h2) = history

            # for each t3 given t1,t2 in system
            # (NOTE: tag actually represents (tag,C))
            # However no effect within this function
            for tag in self._tri[history].keys():

                # if there has only been 1 occurrence of this tag in the data
                # then ignore this trigram.
                if self._uni[tag] == 1:
                    continue

                # safe_div provides a safe floating point division
                # it returns -1 if the denominator is 0
                c3 = self._safe_div((self._tri[history][tag] - 1),
                                    (self._tri[history].N() - 1))
                c2 = self._safe_div((self._bi[h2][tag] - 1),
                                    (self._bi[h2].N() - 1))
                c1 = self._safe_div((self._uni[tag] - 1), (self._uni.N() - 1))

                # if c1 is the maximum value:
                if (c1 > c3) and (c1 > c2):
                    tl1 += self._tri[history][tag]

                # if c2 is the maximum value
                elif (c2 > c3) and (c2 > c1):
                    tl2 += self._tri[history][tag]

                # if c3 is the maximum value
                elif (c3 > c2) and (c3 > c1):
                    tl3 += self._tri[history][tag]

                # if c3, and c2 are equal and larger than c1
                elif (c3 == c2) and (c3 > c1):
                    tl2 += self._tri[history][tag] / 2.0
                    tl3 += self._tri[history][tag] / 2.0

                # if c1, and c2 are equal and larger than c3
                # this might be a dumb thing to do....(not sure yet)
                elif (c2 == c1) and (c1 > c3):
                    tl1 += self._tri[history][tag] / 2.0
                    tl2 += self._tri[history][tag] / 2.0

                # otherwise there might be a problem
                # eg: all values = 0
                else:
                    pass

        # Lambda normalisation:
        # ensures that l1+l2+l3 = 1
        self._l1 = tl1 / (tl1 + tl2 + tl3)
        self._l2 = tl2 / (tl1 + tl2 + tl3)
        self._l3 = tl3 / (tl1 + tl2 + tl3)

    def _safe_div(self, v1, v2):
        """
        Safe floating point division function, does not allow division by 0
        returns -1 if the denominator is 0
        """
        if v2 == 0:
            return -1
        else:
            return v1 / v2

    def tagdata(self, data):
        """
        Tags each sentence in a list of sentences

        :param data: list of lists of words
        :type data: [[string,],]
        :return: list of lists of (word, tag) tuples

        Invokes the tag(sent) function for each sentence and compiles
        the results into a list of tagged sentences; each tagged
        sentence is a list of (word, tag) tuples.
        """
        res = []
        for sent in data:
            res1 = self.tag(sent)
            res.append(res1)
        return res

    def tag(self, data):
        """
        Tags a single sentence

        :param data: list of words
        :type data: [string,]

        :return: [(word, tag),]

        Calls recursive function '_tagword'
        to produce a list of tags

        Associates the sequence of returned tags
        with the correct words in the input sequence

        returns a list of (word, tag) tuples
        """

        current_state = [(["BOS", "BOS"], 0.0)]

        sent = list(data)

        tags = self._tagword(sent, current_state)

        res = []
        for i in range(len(sent)):
            # unpack and discard the C flags
            (t, C) = tags[i + 2]
            res.append((sent[i], t))

        return res

    def _tagword(self, sent, current_states):
        """
        :param sent : List of words remaining in the sentence
        :type sent  : [word,]
        :param current_states : List of possible tag combinations for
                                the sentence so far, and the log probability
                                associated with each tag combination
        :type current_states  : [([tag, ], logprob), ]

        Tags the first word in the sentence and
        recursively tags the remainder of the sentence.

        Uses formula specified above to calculate the probability
        of a particular tag
        """

        # if there are no more words, the sentence is finished:
        # return the most probable tag sequence
        if sent == []:
            (h, logp) = current_states[0]
            return h

        # otherwise there are more words to be tagged
        word = sent[0]
        sent = sent[1:]
        new_states = []

        # if capitalisation handling is requested,
        # initialise the flag for this word
        C = False
        if self._C and word[0].isupper():
            C = True

        # if word is known
        # compute the set of possible tags
        # and their associated log probabilities
        if word in self._wd:
            self.known += 1

            for (history, curr_sent_logprob) in current_states:
                logprobs = []

                for t in self._wd[word].keys():
                    tC = (t, C)
                    p_uni = self._uni.freq(tC)
                    p_bi = self._bi[history[-1]].freq(tC)
                    p_tri = self._tri[tuple(history[-2:])].freq(tC)
                    p_wd = self._wd[word][t] / self._uni[tC]
                    p = self._l1 * p_uni + self._l2 * p_bi + self._l3 * p_tri
                    p2 = log(p, 2) + log(p_wd, 2)

                    # compute the result of appending each tag to this history
                    new_states.append((history + [tC], curr_sent_logprob + p2))

        # otherwise the word is unseen, so the set of possible tags is unknown
        else:
            self.unknown += 1

            # since most unknown-word taggers cannot return a set of
            # possible tags together with a probability for each one,
            # treat the tag assigned to an unknown word as certain
            p = 1

            # if no unknown word tagger has been specified
            # then use the tag 'Unk'
            if self._unk is None:
                tag = ("Unk", C)

            # otherwise apply the unknown word tagger
            else:
                [(_w, t)] = list(self._unk.tag([word]))
                tag = (t, C)

            for (history, logprob) in current_states:
                history.append(tag)

            new_states = current_states

        # now have computed a set of possible new_states

        # sort states by log prob
        # set is now ordered greatest to least log probability
        new_states.sort(reverse=True, key=itemgetter(1))

        # discard everything after the first N states (the beam width);
        # this is the beam search cut
        if len(new_states) > self._N:
            new_states = new_states[:self._N]

        # compute the tags for the rest of the sentence
        # return the best list of tags for the sentence
        return self._tagword(sent, new_states)
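The class above closely mirrors NLTK's stock nltk.tag.tnt.TnT tagger, so its train/tag flow can be exercised directly against that class. A minimal usage sketch follows; the treebank corpus and the 3000-sentence training split are illustrative assumptions, not taken from the example above.

# Minimal usage sketch (assumes NLTK and its treebank sample are installed).
from nltk.corpus import treebank
from nltk.tag import tnt

train_sents = treebank.tagged_sents()[:3000]             # assumed train split
test_sents = [[w for (w, _t) in sent]                     # strip the gold tags
              for sent in treebank.tagged_sents()[3000:3010]]

tagger = tnt.TnT(N=1000, C=False)    # beam width N, capitalisation flag C
tagger.train(train_sents)            # builds the n-gram counts and the lambdas

print(tagger.tag("the cat sat on the mat".split()))   # unknown words get 'Unk'
print(tagger.tagdata(test_sents)[0])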
Exemple #44
0
# imports needed by this snippet (NLTK 3 API)
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist

word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()

# count how often each feature fires overall and per label
for (feats, label) in fvecs:
    for key in feats:
        if feats[key]:
            word_fd[key] += 1
            label_word_fd[label][key] += 1

print(label_word_fd.conditions())
cls_set = label_word_fd.conditions()

pos_word_count = label_word_fd['positive'].N()
print("positive word count: " + str(pos_word_count))
neg_word_count = label_word_fd['negative'].N()
print("negative word count: " + str(neg_word_count))
total_word_count = pos_word_count + neg_word_count
print("total word count: " + str(total_word_count))

feature_scores = {}

for feature, freq in word_fd.items():
    pos_score = BigramAssocMeasures.chi_sq(label_word_fd['positive'][feature],
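Example #44 is cut off in the middle of the chi_sq call. Assuming feature_scores is eventually filled with one combined chi-square score per feature, a typical follow-up step (the cut-off of 1000 and the variable names below are illustrative assumptions) keeps only the highest-scoring features:

# Sketch only: select the top-scoring features once feature_scores is populated.
best = sorted(feature_scores.items(), key=lambda kv: kv[1], reverse=True)[:1000]
best_features = set(feature for (feature, score) in best)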
Exemple #45
0
				word = str(word)
				cfd[prev_word][word] += 1
				global_fd[word] += 1
				prev_word = word
	except Exception:
		print("a link failed...")

print("finished the probability distributions")

print("going to build the lists in the SQLite format")
global_frequencies = []
for word in sorted(global_fd.keys()):
	global_frequencies.append((word, global_fd[word]))

conditional_frequencies = []
for condition in sorted(cfd.conditions()):
	for word in sorted(cfd[condition].keys()):
		if condition:
			conditional_frequencies.append((condition, word, cfd[condition][word]))


print("going to start the database part")
con = lite.connect("words-pt.db")

with con:
	cur = con.cursor()
	cur.execute("DROP TABLE IF EXISTS _1_gram")
	cur.execute("CREATE TABLE _1_gram(word TEXT, count INT)")
	cur.executemany("INSERT INTO _1_gram VALUES(?, ?)", tuple(global_frequencies))

	cur.execute("DROP TABLE IF EXISTS _2_gram")
Exemple #46
0
    def __init__(self, n, train, pad_left=False, pad_right=False,
                 estimator=None, *estimator_args, **estimator_kwargs):
        """
        Creates an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

        :param n: the order of the language model (ngram size)
        :type n: C{int}
        :param train: the training text
        :type train: C{iterable} of C{string} or C{iterable} of C{iterable} of C{string} 
        :param estimator: a function for generating a probability distribution---defaults to MLEProbDist
        :type estimator: a function that takes a C{ConditionalFreqDist} and
              returns a C{ConditionalProbDist}
        :param pad_left: whether to pad the left of each sentence with an (n-1)-gram of <s>
        :type pad_left: bool
        :param pad_right: whether to pad the right of each sentence with </s>
        :type pad_right: bool
        :param estimator_args: Extra arguments for estimator.
            These arguments are usually used to specify extra
            properties for the probability distributions of individual
            conditions, such as the number of bins they contain.
            Note: For backward-compatibility, if no arguments are specified, the
            number of bins in the underlying ConditionalFreqDist are passed to
            the estimator as an argument.
        :type estimator_args: (any)
        :param estimator_kwargs: Extra keyword arguments for the estimator
        :type estimator_kwargs: (any)
        """

        # protection from cryptic behavior for calling programs
        # that use the pre-2.0.2 interface
        assert(isinstance(pad_left, bool))
        assert(isinstance(pad_right, bool))

        # make sure n is greater than zero; the assert message shows the bad value
        assert (n > 0), n

        # For explicitness save the check whether this is a unigram model
        self.is_unigram_model = (n == 1)
        # save the ngram order number
        self._n = n
        # save left and right padding
        self._lpad = ('<s>',) * (n - 1) if pad_left else ()
        # Need _rpad even for unigrams or padded entropy will give
        #  wrong answer because '</s>' will be treated as unseen...
        self._rpad = ('</s>',) if pad_right else ()
        self._padLen = len(self._lpad)+len(self._rpad)

        self._N=0
        delta = 1+self._padLen-n        # len(sent)+delta == ngrams in sent

        if estimator is None:
            assert (estimator_args == ()) and (estimator_kwargs == {}),\
                   "estimator_args (%s) or estimator_kwargs (%s) supplied, but no estimator"%(estimator_args, estimator_kwargs)
            estimator = lambda fdist, bins: MLEProbDist(fdist)

        # The training data is re-read when building the backoff models,
        # so a one-shot generator isn't acceptable
        if not isinstance(train, collections.abc.Sequence):
            train = list(train)
        self._W = len(train)
        # Coerce to a list of lists -- note that this means training
        #  character n-grams requires exploding the words ahead of time
        if train is not None:
            if isinstance(train[0], compat.string_types):
                train = [train]
                self._W=1
            elif not isinstance(train[0],collections.abc.Sequence):
                # if you mix strings and generators, you have only yourself
                #  to blame!
                for i in range(len(train)):
                    train[i]=list(train[i])

        if n == 1:
            if pad_right:
                sents=(chain(s,self._rpad) for s in train)
            else:
                sents=train
            fd=FreqDist()
            for s in sents:
                fd.update(s)
            if not estimator_args and not estimator_kwargs:
                self._model = estimator(fd,fd.B())
            else:
                self._model = estimator(fd,fd.B(),
                                        *estimator_args, **estimator_kwargs)
            self._N=fd.N()
        else:
            cfd = ConditionalFreqDist()
            self._ngrams = set()

            for sent in train:
                self._N+=len(sent)+delta
                for ngram in ingrams(chain(self._lpad, sent, self._rpad), n):
                    self._ngrams.add(ngram)
                    context = tuple(ngram[:-1])
                    token = ngram[-1]
                    cfd[context][token]+=1
            if not estimator_args and not estimator_kwargs:
                self._model = ConditionalProbDist(cfd, estimator, len(cfd))
            else:
                self._model = ConditionalProbDist(cfd, estimator, *estimator_args, **estimator_kwargs)

        # recursively construct the lower-order models
        if not self.is_unigram_model:
            self._backoff = NgramModel(n-1, train,
                                        pad_left, pad_right,
                                        estimator,
                                        *estimator_args,
                                        **estimator_kwargs)

            # Code below here in this method, and the _words_following and _alpha method, are from
            # http://www.nltk.org/_modules/nltk/model/ngram.html "Last updated on Feb 26, 2015"
            self._backoff_alphas = dict()
            # For each condition (or context)
            for ctxt in cfd.conditions():
                backoff_ctxt = ctxt[1:]
                backoff_total_pr = 0.0
                total_observed_pr = 0.0

                # this is the subset of words that we OBSERVED following
                # this context.
                # i.e. Count(word | context) > 0
                for word in self._words_following(ctxt, cfd):
                    total_observed_pr += self.prob(word, ctxt)
                    # we also need the total (n-1)-gram probability of
                    # words observed in this n-gram context
                    backoff_total_pr += self._backoff.prob(word, backoff_ctxt)
                if isclose(total_observed_pr,1.0):
                    total_observed_pr=1.0
                else:
                    assert 0.0 <= total_observed_pr <= 1.0,\
                           "sum of probs for %s out of bounds: %.10g"%(ctxt,total_observed_pr)
                # beta is the remaining probability weight after we factor out
                # the probability of observed words.
                # As a sanity check, both total_observed_pr and backoff_total_pr
                # must be GE 0, since probabilities are never negative
                beta = 1.0 - total_observed_pr

                if beta!=0.0:
                    assert (0.0 <= backoff_total_pr < 1.0), \
                           "sum of backoff probs for %s out of bounds: %s"%(ctxt,backoff_total_pr)
                    alpha_ctxt = beta / (1.0 - backoff_total_pr)
                else:
                    assert ((0.0 <= backoff_total_pr < 1.0) or
                            isclose(1.0,backoff_total_pr)), \
                           "sum of backoff probs for %s out of bounds: %s"%(ctxt,backoff_total_pr)
                    alpha_ctxt = 0.0

                self._backoff_alphas[ctxt] = alpha_ctxt
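The loop above computes the standard Katz-style backoff weight, alpha(ctxt) = (1 - total_observed_pr) / (1 - backoff_total_pr), with alpha forced to 0.0 when the observed words already absorb all of the probability mass. A self-contained sketch of that calculation with toy probabilities (the numbers are assumed purely for illustration):

# Toy illustration of the backoff weight computed above (probabilities assumed).
def backoff_alpha(observed_probs, backoff_probs):
    """observed_probs: P(w | ctxt) for each word w actually seen after ctxt.
    backoff_probs: P(w | backoff_ctxt) for those same words."""
    beta = 1.0 - sum(observed_probs)    # mass left over for unseen words
    if beta == 0.0:                     # observed words already cover everything
        return 0.0
    return beta / (1.0 - sum(backoff_probs))

# three words cover 0.7 of the context's mass, and the backoff model
# gives those same three words 0.5 in total:
print(backoff_alpha([0.4, 0.2, 0.1], [0.3, 0.15, 0.05]))   # -> 0.3 / 0.5 = 0.6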