Code Example #1
    def cDist(self, params):
        """return conditional freq distribution (based on part of speech) using filtered_words from loadData"""

        president = params["president"]
        speech = params["speech"]

        if president == "All presidents":
            pipeline = [{"$match": {"type": speech}}, {"$project": {"tags": "$filtered_speech_tags"}}]
        else:
            pipeline = [
                {"$match": {"name": president, "type": speech}},
                {"$project": {"tags": "$filtered_speech_tags"}},
            ]

        tags = []
        for i in self.col.aggregate(pipeline):
            tags.extend(i["tags"])

        cfdist = ConditionalFreqDist()  # conditioned on pos_tag
        for word, tag in tags:
            condition = tag  # specify condition to group frequencies by
            cfdist[condition][word] += 1

        VB = MLEProbDist(cfdist.get("VBP"))
        NN = MLEProbDist(cfdist.get("NN"))
        JJ = MLEProbDist(cfdist.get("JJ"))

        return VB, NN, JJ  # return verbs, nouns, adjectives
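A minimal, self-contained sketch of the same pattern used above (a ConditionalFreqDist conditioned on POS tags, with one condition wrapped in MLEProbDist); the tagged words here are invented for illustration:

from nltk.probability import ConditionalFreqDist, MLEProbDist

tagged = [("dogs", "NNS"), ("bark", "VBP"), ("loud", "JJ"), ("dogs", "NNS")]
cfdist = ConditionalFreqDist((tag, word) for word, tag in tagged)  # condition on the tag
NN = MLEProbDist(cfdist["NNS"])  # maximum-likelihood distribution over nouns
print(NN.prob("dogs"))  # 1.0, since "dogs" is the only NNS sample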
Code Example #2
File: agreement.py  Project: DevilDante88/MyCogs
 def Ae_kappa(self, cA, cB):
     Ae = 0.0
     nitems = float(len(self.I))
     label_freqs = ConditionalFreqDist((x['labels'], x['coder']) for x in self.data)
     for k in label_freqs.conditions():
         Ae += (label_freqs[k][cA] / nitems) * (label_freqs[k][cB] / nitems)
     return Ae
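For context, Ae here is the chance-agreement term of Cohen's kappa, kappa = (Ao - Ae) / (1 - Ae). A rough standalone sketch of the same computation, assuming a small hand-built data list in the {'coder', 'item', 'labels'} layout that nltk.metrics.agreement uses:

from nltk.probability import ConditionalFreqDist

data = [
    {'coder': 'c1', 'item': 'i1', 'labels': 'pos'},
    {'coder': 'c2', 'item': 'i1', 'labels': 'pos'},
    {'coder': 'c1', 'item': 'i2', 'labels': 'neg'},
    {'coder': 'c2', 'item': 'i2', 'labels': 'pos'},
]
nitems = 2.0  # two items, i1 and i2
label_freqs = ConditionalFreqDist((x['labels'], x['coder']) for x in data)
Ae = sum((label_freqs[k]['c1'] / nitems) * (label_freqs[k]['c2'] / nitems)
         for k in label_freqs.conditions())
print(Ae)  # 0.5 for this toy data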
Code Example #3
   def _setSelectedPOSTags(self):

      buff = self._loadData('selective_pos.bin')

      if buff:
         self.selective_pos = buff
         return

      #First get all (word, tag) in corpora
      sentences = brown.tagged_sents(simplify_tags=True)
      self.selected_tags = ["ADJ","ADV", "CNJ"]
      self.selective_pos = ConditionalFreqDist()
      temp_dist = ConditionalFreqDist()
      for sentence in sentences:
         for (word, tag) in sentence:
            if tag in self.selected_tags:
               temp_dist[tag].inc(str(word).lower())

      #Now, keep the words with frequency > 4
      for category in temp_dist.conditions():
         fredist = temp_dist[category]
         for key in fredist.keys():
            if fredist[key] > 4:
               self.selective_pos[category].inc(key)

      self._saveData('selective_pos.bin',self.selective_pos)
Code Example #4
File: Sys_Params.py  Project: Saher-/SATC
def high_information_words(labeled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    """
    Eliminate low-information feature words from a set of words, for efficiency.
    :param labeled_words: list of 2-tuples [(label, words)]
                          label -> a classification label (pos / neg)
                          words -> a list of words that occur under that label
    :param score_fn: a scoring function that measures how informative a word is
    :param min_score: the minimum score for a word to be counted as highly informative
    :return: a set of highly informative words
    """
    print "Counting Word Frequencies"
    word_fq = FreqDist()
    labeled_word_fq = ConditionalFreqDist()

    for label, words in labeled_words:
        for word in words:
            word_fq[word] += 1
            labeled_word_fq[label][word] += 1
    n_xx = labeled_word_fq.N()
    high_info_words = set()

    for label in labeled_word_fq.conditions():
        n_xi = labeled_word_fq[label].N()
        word_scores = collections.defaultdict(int)

        for word, n_ii in labeled_word_fq[label].iteritems():
            n_ix = word_fq[word]
            score = score_fn(n_ii, (n_ix, n_xi), n_xx)
            word_scores[word] = score

        bestwords = [word for word, score in word_scores.iteritems() if score >= min_score]
        high_info_words |= set(bestwords)

    return high_info_words
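A hypothetical call to the function above, with a toy labeled_words list and a lowered min_score so that something survives on such a small sample (the exact result depends on the chi-square scores):

labeled_words = [
    ('pos', ['good', 'great', 'fine', 'good']),
    ('neg', ['bad', 'awful', 'fine', 'bad']),
]
informative = high_information_words(labeled_words, min_score=1)
# informative is a set such as {'good', 'great', 'bad', 'awful'};
# 'fine' scores zero because it occurs equally often under both labels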
Code Example #5
def readFormatedData(formatedData):
    #unigramFd = FreqDist()
    #bigramFd = FreqDist()
    cBigramFd1 = ConditionalFreqDist()
    cBigramFd2 = ConditionalFreqDist()
    #dict1 = Set([])
    #dict2 = Set([])
    for tuple in formatedData:
        words = tuple[0].split(' ')
        count = int(tuple[1])
        #unigramFd.inc(words[0])
        #unigramFd.inc(words[1])
        #bigramFd.inc((words[0], words[1]), count)
        word2 = words[1]
        if count < 5:
            word2 = "unknown"
        cBigramFd1[words[0]].inc(word2, count)
        #if words[0] not in dict1:
        #    dict1.add(words[0])
        #if words[1] not in dict2:
        #    dict2.add(words[1])
    for w1 in cBigramFd1.conditions():
        bigram_w1 = cBigramFd1[w1]
        for w2 in bigram_w1.samples():
            cBigramFd2[w2].inc(w1, bigram_w1[w2])
    return cBigramFd1, cBigramFd2#, dict1, dict2
Code Example #6
File: text_utils.py  Project: fruser/review-analyzer
def get_high_information_words(lwords, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    labels = lwords.keys()
    labelled_words = [(l, lwords[l]) for l in labels]
    word_freq_dist = FreqDist()
    label_word_freq_dist = ConditionalFreqDist()

    for label, dwords in labelled_words:
        for words in dwords:
            for word in words:
                word_freq_dist[word] += 1
                label_word_freq_dist[label][word] += 1

    n_words_total = label_word_freq_dist.N()
    high_info_words = set()

    for label in label_word_freq_dist.conditions():
        n_words_label = label_word_freq_dist[label].N()
        word_scores = defaultdict(int)

        for word, word_freq_label in label_word_freq_dist[label].items():
            word_freq = word_freq_dist[word]
            score = score_fn(word_freq_label, (word_freq, n_words_label), n_words_total)
            word_scores[word] = score

        bestwords = [word for word, score in word_scores.items() if score >= min_score]
        high_info_words |= set(bestwords)
    return high_info_words
Code Example #7
def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
	word_fd = FreqDist()
	label_word_fd = ConditionalFreqDist()
	
	for label, words in labelled_words:
		for word in words:
			word_fd.inc(word)
			label_word_fd[label].inc(word)
	
	n_xx = label_word_fd.N()
	high_info_words = set()
	
	for label in label_word_fd.conditions():
		n_xi = label_word_fd[label].N()
		word_scores = collections.defaultdict(int)
		
		for word, n_ii in label_word_fd[label].iteritems():
			n_ix = word_fd[word]
			score = score_fn(n_ii, (n_ix, n_xi), n_xx)
			word_scores[word] = score
		
		bestwords = [word for word, score in word_scores.iteritems() if score >= min_score]
		high_info_words |= set(bestwords)
	
	return high_info_words
Code Example #8
    def _train(self, tagged_corpus, cutoff=0, verbose=False):
        """
        Initialize this ContextTagger's ``_context_to_tag`` table
        based on the given training data.  In particular, for each
        context ``c`` in the training data, set
        ``_context_to_tag[c]`` to the most frequent tag for that
        context.  However, exclude any contexts that are already
        tagged perfectly by the backoff tagger(s).

        The old value of ``self._context_to_tag`` (if any) is discarded.

        :param tagged_corpus: A tagged corpus.  Each item should be
            a list of (word, tag) tuples.
        :param cutoff: If the most likely tag for a context occurs
            fewer than cutoff times, then exclude it from the
            context-to-tag table for the new tagger.
        """

        token_count = hit_count = 0

        # A context is considered 'useful' if it's not already tagged
        # perfectly by the backoff tagger.
        useful_contexts = set()

        # Count how many times each tag occurs in each context.
        fd = ConditionalFreqDist()
        for sentence in tagged_corpus:
            tokens, tags = zip(*sentence)
            for index, (token, tag) in enumerate(sentence):
                # Record the event.
                token_count += 1
                context = self.context(tokens, index, tags[:index])
                if context is None:
                    continue
                fd[context][tag] += 1
                # If the backoff got it wrong, this context is useful:
                if (self.backoff is None or
                        tag != self.backoff.tag_one(
                        tokens, index, tags[:index])):
                    useful_contexts.add(context)

        # Build the context_to_tag table -- for each context, figure
        # out what the most likely tag is.  Only include contexts that
        # we've seen at least `cutoff` times.
        for context in useful_contexts:
            best_tag = fd[context].max()
            hits = fd[context][best_tag]
            if hits > cutoff:
                self._context_to_tag[context] = best_tag
                hit_count += hits

        # Display some stats, if requested.
        if verbose:
            size = len(self._context_to_tag)
            backoff = 100 - (hit_count * 100.0) / token_count
            pruning = 100 - (size * 100.0) / len(fd.conditions())
            print("[Trained Unigram tagger:", end=' ')
            print("size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
                size, backoff, pruning))
Code Example #9
	def __init__(self, r, name, cond_samples=None):
		self._r = r
		self._name = name
		ConditionalFreqDist.__init__(self, cond_samples)
		# initialize self._fdists for all matching keys
		for key in self._r.keys(encode_key('%s:*' % name)):
			condition = key.split(':')[1]
			self[condition] # calls self.__getitem__(condition)
Code Example #10
File: redisprob.py  Project: ShunyuanZ/nltk3-cookbook
	def __init__(self, r, name, cond_samples=None):
		self._r = r
		self._name = name
		ConditionalFreqDist.__init__(self, cond_samples)
		
		for key in self._r.keys(encode_key('%s:*' % name)):
			condition = key.split(b':')[1].decode()
			self[condition] # calls self.__getitem__(condition)
Code Example #11
File: languageModel.py  Project: slee17/NLP
def words_by_followers(category):
    """Given a category from the brown corpus, lowercases everything,
    and returns a frequency distribution where the keys are words
    and the counts are the number of different contexts that each word can appear in."""
    bigrams = brown_bigrams(category)
    cfdist = ConditionalFreqDist((bigram[1], bigram[0]) for bigram in bigrams)
    fdist = FreqDist()
    for context in cfdist.keys():
        fdist[context] = len(cfdist[context])
    return fdist
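The (bigram[1], bigram[0]) ordering conditions on the second word of each bigram, so len(cfdist[word]) counts how many distinct left-contexts precede that word. A toy sketch of the same idea without the project's brown_bigrams helper:

from nltk import bigrams
from nltk.probability import ConditionalFreqDist, FreqDist

tokens = ['the', 'cat', 'and', 'the', 'dog', 'and', 'a', 'cat']
cfdist = ConditionalFreqDist((b, a) for a, b in bigrams(tokens))
fdist = FreqDist()
for word in cfdist.conditions():
    fdist[word] = len(cfdist[word])  # number of distinct preceding words
print(fdist['cat'])  # 2: "cat" is preceded by "the" and by "a"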
Code Example #12
File: q2.py  Project: atiassa/recommend-2011
def _train(self, tagged_corpus, cutoff=0, verbose=False): 
    token_count = hit_count = 0 
    useful_contexts = set() 
    fd = ConditionalFreqDist() 
    tag_prob = FreqDist()
    for sentence in tagged_corpus: 
        tokens, tags = zip(*sentence) 
        for index, (token, tag) in enumerate(sentence): 
            # Record the event. 
            token_count += 1 
            tag_prob.inc(tag)
            context = self.context(tokens, index, tags[:index])
            if context is None: continue 
            fd[context].inc(tag) 
            # If the backoff got it wrong, this context is useful: 
            if (self.backoff is None or 
                tag != self.backoff.tag_one(tokens, index, tags[:index])): 
                useful_contexts.add(context) 
    # Build the context_to_tag table -- for each context,  
    # calculate the entropy.  Only include contexts whose
    # entropy is lower than `cutoff`.
    total_tags = float(sum(tag_prob.values()))
    tags_probs = [(t,tag_prob[t]/total_tags) for t in tag_prob.keys()]
    useful_contexts_after_filter = useful_contexts.copy()
    most_high = FreqDist()
    for context in useful_contexts:
        dd = fd[context]
#        total_tags = float(sum(dd.values()))
#        tags_probs = [(t,dd[t]/total_tags) for t in dd.keys()]
        h = self.H(dd.keys(),tags_probs)
        if h > cutoff:
            useful_contexts_after_filter.remove(context)
            continue
        most_high[context] = h
    print most_high.keys()
    # Build the context_to_tag table -- for each context, figure
    # out what the most likely tag is.  
    for context in useful_contexts_after_filter:
        best_tag = fd[context].max()
        hits = fd[context][best_tag]
        self._context_to_tag[context] = best_tag
        hit_count += hits
    # Display some stats, if requested. 
    if verbose: 
        size = len(self._context_to_tag) 
        backoff = 100 - (hit_count * 100.0)/ token_count 
        pruning = 100 - (size * 100.0) / len(fd.conditions()) 
        print "[Trained Unigram tagger:", 
        print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (size, backoff, pruning)
Code Example #13
File: tnt.py  Project: Arttii/TextBlob
    def __init__(self, unk=None, Trained=False, N=1000, C=False):
        '''
        Construct a TnT statistical tagger. Tagger must be trained
        before being used to tag input.

        :param unk: instance of a POS tagger, conforms to TaggerI
        :type  unk:(TaggerI)
        :param Trained: Indication that the POS tagger is trained or not
        :type  Trained: boolean
        :param N: Beam search degree (see above)
        :type  N:(int)
        :param C: Capitalization flag
        :type  C: boolean

        Initializer, creates frequency distributions to be used
        for tagging

        _lx values represent the portion of the tri/bi/uni taggers
        to be used to calculate the probability

        N value is the number of possible solutions to maintain
        while tagging. A good value for this is 1000

        C is a boolean value which specifies to use or
        not use the Capitalization of the word as additional
        information for tagging.
        NOTE: using capitalization may not increase the accuracy
        of the tagger
        '''

        self._uni  = FreqDist()
        self._bi   = ConditionalFreqDist()
        self._tri  = ConditionalFreqDist()
        self._wd   = ConditionalFreqDist()
        self._eos  = ConditionalFreqDist()
        self._l1   = 0.0
        self._l2   = 0.0
        self._l3   = 0.0
        self._N    = N
        self._C    = C
        self._T    = Trained

        self._unk = unk

        # statistical tools (ignore or delete me)
        self.unknown = 0
        self.known = 0
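A short usage sketch for this class as shipped in NLTK (nltk.tag.tnt.TnT); the treebank split size is arbitrary, and words unseen in training come back tagged 'Unk' unless an unk tagger is supplied:

from nltk.corpus import treebank
from nltk.tag import tnt

train_sents = treebank.tagged_sents()[:3000]
tagger = tnt.TnT(N=1000, C=False)
tagger.train(train_sents)
print(tagger.tag(['The', 'board', 'approved', 'the', 'merger', '.']))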
Code Example #14
File: predictor.py  Project: drewatk/textPredictor
    def __init__(self, load_from_disk=True):
        self._corpus = reuters.words()

        self._unigram_fd = FreqDist()
        self._bigram_cfd = ConditionalFreqDist()
        self._trigram_cfd = ConditionalFreqDist()
        self._quadgram_cfd = ConditionalFreqDist()

        self._unigram_pd = None
        self._bigram_cpd = None
        self._trigram_cpd = None
        self._quadgram_cpd = None

        if load_from_disk:
            self._load_models()
        else:
            self._train()
Code Example #15
File: q2_1.py  Project: haozhuoran1991/recommend-2011
def validate_pcfg_generate(grammar):
    pd = makeLhrProbDict(grammar)
    productions = []
    cfd = ConditionalFreqDist()

    for i in np.arange(1000):
        tree = pcfg_generate(grammar)
        productions += tree.productions()

    for p in productions:
        cfd[p.lhs()].inc(p.rhs())

    for c in cfd.conditions():
        p = MLEProbDist(cfd[c])
        q = pd[c]
        div = KL_Divergence(p, q)
        print "KL_Divergence for %s = %f" % (c, div)
Code Example #16
File: tagger.py  Project: 0623forbidden/nltk4russian
    def _train(self, tagged_corpus, cutoff=0, verbose=False):
        """
        """
        token_count = hit_count = 0

        # A context is considered 'useful' if it's not already tagged
        # perfectly by the backoff tagger.
        useful_contexts = set()

        # Count how many times each tag occurs in each context.
        fd = ConditionalFreqDist()
        for sentence in tagged_corpus:
            tokens, tags = zip(*sentence)
            for index, (token, tag) in enumerate(sentence):
                # Record the event.
                token_count += 1
                context = self.context(tokens, index, tags[:index])
                if context is None: continue
                fd[context][tag] += 1
                # If the backoff got it wrong, this context is useful:
                if (self.backoff is None or
                    tag != self.backoff.tag_one(tokens, index, tags[:index])):
                    useful_contexts.add(context)

        # Build the context_to_tag table -- for each context, figure
        # out what the most likely tag is.  Only include contexts that
        # we've seen at least `cutoff` times.
        for context in useful_contexts:
            #best_tag = fd[context].max()
            for (tag, hits) in fd[context].items():
                if hits > cutoff:
                    self._contexts_to_tags[context] = self._contexts_to_tags.get(context, {})
                    self._contexts_to_tags[context][tag] = hits
                    hit_count += hits

        # Display some stats, if requested.
        if verbose:
            size = len(self._contexts_to_tags)
            backoff = 100 - (hit_count * 100.0)/ token_count
            pruning = 100 - (size * 100.0) / len(fd.conditions())
            print "[Trained Unigram tagger:",
            print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
                size, backoff, pruning)
Code Example #17
def jieba_words_feature(num=2400):  # num is the feature dimensionality
    # print('jieba word segmentation')
    pos_words = []
    neg_words = []
    # Store the segmented words into per-class lists
    for words in read_text('zl_pos.txt'):
        for word in words:
            pos_words.append(word)
    for words in read_text('zl_neg.txt'):
        for word in words:
            neg_words.append(word)

    # Use FreqDist for overall word frequencies; the ConditionalFreqDist is conditioned on the class label
    word_f = FreqDist()  # FreqDist maps each word to its frequency, ordered from most to least frequent
    both_word_f = ConditionalFreqDist()
    for word in pos_words:
        word_f[word] += 1
        both_word_f['pos'][word] += 1
        # print('pos:', word_f[word])
    # print(both_word_f.N())
    for word in neg_words:
        word_f[word] += 1
        both_word_f['neg'][word] += 1
        # print('neg:', word_f[word])
    # print(word_f.items())
    # print(both_word_f.N())

    pos_words_num = both_word_f['pos'].N()
    neg_words_num = both_word_f['neg'].N()
    words_num = pos_words_num + neg_words_num

    # Score each word with BigramAssocMeasures.chi_sq (chi-square), then sort by score and collect the best into a set
    word_scores = {}
    for word, freq in word_f.items():
        pos_score = BigramAssocMeasures.chi_sq(both_word_f['pos'][word],
                                               (freq, pos_words_num),
                                               words_num)
        # print('pos:', pos_score)
        neg_score = BigramAssocMeasures.chi_sq(both_word_f['neg'][word],
                                               (freq, neg_words_num),
                                               words_num)
        word_scores[word] = pos_score + neg_score  # total information content of this word

    best_vals = sorted(word_scores.items(),
                       key=lambda item: item[1],
                       reverse=True)[:num]  # sorted in descending order
    best_words = set([w for w, s in best_vals])
    print(best_words)

    h = open('zl_best_words.txt', 'w+', encoding='utf-8')
    h.write(str(best_words))
    h.close()

    # print(dict([(word, True) for word in best_words]))
    return dict([(word, True) for word in best_words])
Code Example #18
    def __init__(self,
                 n,
                 train,
                 estimator=None,
                 *estimator_args,
                 **estimator_kw_args):
        """
        Creates an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

        @param n: the order of the language model (ngram size)
        @type n: C{int}
        @param train: the training text
        @type train: C{list} of C{string}
        @param estimator: a function for generating a probability distribution
        @type estimator: a function that takes a C{ConditionalFreqDist} and
              returns a C{ConditionalProbDist}
        @param estimator_args: Extra arguments for C{estimator}.
            These arguments are usually used to specify extra
            properties for the probability distributions of individual
            conditions, such as the number of bins they contain.
            Note: For backward-compatibility, if no arguments are specified, the
            number of bins in the underlying C{ConditionalFreqDist} are passed to
            the estimator as an argument.
        @type estimator_args: (any)
        @param estimator_kw_args: Extra keyword arguments for C{estimator}.
        @type estimator_kw_args: (any)
        """
        self._n = n

        if estimator is None:
            estimator = _estimator

        cfd = ConditionalFreqDist()
        self._ngrams = set()
        self._prefix = ('', ) * (n - 1)

        for ngram in ingrams(chain(self._prefix, train), n):
            self._ngrams.add(ngram)
            context = tuple(ngram[:-1])
            token = ngram[-1]
            cfd[context].inc(token)

        if (not estimator_args) and (not estimator_kw_args):
            self._model = ConditionalProbDist(cfd, estimator, len(cfd))
        else:
            self._model = ConditionalProbDist(cfd, estimator, *estimator_args,
                                              **estimator_kw_args)

        # recursively construct the lower-order models
        if n > 1:
            self._backoff = NgramModel(n - 1, train, estimator,
                                       *estimator_args, **estimator_kw_args)
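NgramModel has since been dropped from NLTK, but the core of the constructor above is just a ConditionalFreqDist turned into a ConditionalProbDist. A minimal bigram-only sketch of that idea on a toy token list:

from nltk import bigrams
from nltk.probability import ConditionalFreqDist, ConditionalProbDist, MLEProbDist

train = ['the', 'cat', 'sat', 'on', 'the', 'mat']
cfd = ConditionalFreqDist()
for w1, w2 in bigrams(train):
    cfd[(w1,)][w2] += 1  # context is a 1-word tuple, matching the code above

model = ConditionalProbDist(cfd, MLEProbDist)
print(model[('the',)].prob('cat'))  # 0.5: "the" is followed once each by "cat" and "mat"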
Code Example #19
    def bigramAnalysis(self):

        label_word_fd = ConditionalFreqDist()
        word_fd = FreqDist()

        datafiles = [
            {
                'emo': "Sad",
                'name': "/negative.csv"
            }, {
                'emo': "Happy",
                'name': "/positive.csv"
            }
            # , {'emo': 'Happy', 'name': "/trust.csv"}, {'emo': 'Sad', 'name': "/anger.csv"}
        ]

        for value in datafiles:
            emo = value['emo']
            name = value['name']
            read = self.readFile(name)
            normalized_sentences = [s.lower() for s in read['tweets']]

            for statement in normalized_sentences:
                for word in statement.split():
                    wor = word.lower()
                    if word not in stopset:
                        word_fd[word] += 1
                        label_word_fd[emo][word] += 1
                        # word_fd.inc(word.lower())

        word_scores = {}
        pos_word_count = label_word_fd['Happy'].N()
        neg_word_count = label_word_fd['Sad'].N()

        total_word_count = word_fd.N()

        for word, freq in word_fd.iteritems():
            pos_score = BigramAssocMeasures.chi_sq(
                label_word_fd['Happy'][word], (freq, pos_word_count),
                total_word_count)
            neg_score = BigramAssocMeasures.chi_sq(label_word_fd['Sad'][word],
                                                   (freq, neg_word_count),
                                                   total_word_count)
            word_scores[word] = pos_score + neg_score

        best = sorted(word_scores.iteritems(),
                      key=lambda (w, s): s,
                      reverse=True)[:500]
        self.bestwords = set([w for w, s in best])

        print("\n\nevaluating best word features")
        self.unigramAnalysis(self.best_word_feats)

        print("\n\nBigram + bigram chi_sq word ")
        self.unigramAnalysis(self.best_bigram_word_feats)
Code Example #20
File: decipher.py  Project: emulhall/COMP550
def train_transitions(labelled_sequences,
                      additional_transitions,
                      estimator=None):
    # default to the MLE estimate
    if estimator is None:
        estimator = lambda fdist, bins: MLEProbDist(fdist)

    # count occurrences of starting states, transitions out of each state
    # and output symbols observed in each state
    known_symbols = []
    known_states = []

    starting = FreqDist()
    transitions = ConditionalFreqDist()
    outputs = ConditionalFreqDist()
    for sequence in labelled_sequences:
        lasts = None
        for token in sequence:
            state = token[0]
            symbol = token[1]
            if lasts is None:
                starting[state] += 1
            else:
                transitions[lasts][state] += 1
            outputs[state][symbol] += 1
            lasts = state

            # update the state and symbol lists
            if state not in known_states:
                known_states.append(state)

            if symbol not in known_symbols:
                known_symbols.append(symbol)

    # create probability distributions (with smoothing)
    N = len(known_states)
    pi = estimator(starting, N)
    A = ConditionalProbDist(
        ConditionalFreqDist.__add__(transitions, additional_transitions),
        estimator, N)
    B = ConditionalProbDist(outputs, estimator, len(known_symbols))
    return hmm.HiddenMarkovModelTagger(known_states, known_symbols, A, B, pi)
Code Example #21
File: decipher.py  Project: emulhall/COMP550
def transitionProb(sentences):
    transitions = ConditionalFreqDist()
    for sent in sentences:
        lasts = None
        for token in sent:
            if lasts is None:
                pass
            else:
                transitions[lasts][token] += 1
            lasts = token
    return transitions
Code Example #22
File: my_ngrams.py  Project: namdinh95/ngrams-model
def makeTrigram(corpus):
    '''For trigram'''
    corpus = startEndTag(corpus)
    trigram = ConditionalFreqDist()
    context = END_LINE + '$%' + START_LINE
    for sentence in corpus:
        for word in sentence:
            if word != START_LINE:
                trigram[context][word] += 1
            context = context[context.find('$%') + 2:] + '$%' + word
    return trigram
Code Example #23
    def __init__(self, n, train, pad_left=True, pad_right=False,estimator=None, *estimator_args, **estimator_kwargs):
        super(MyNgramModel,self).__init__(n,train,pad_left,pad_right,estimator,*estimator_args, **estimator_kwargs)
        assert(isinstance(pad_left, bool))
        assert(isinstance(pad_right, bool))
        
        self._n = n
        self._lpad = ('',) * (n - 1) if pad_left else ()
        self._rpad = ('',) * (n - 1) if pad_right else ()

        if estimator is None:
            estimator = _estimator

        self._cfd = ConditionalFreqDist()
        self._ngrams = set()
        
            
        # If given a list of strings instead of a list of lists, create enclosing list
        if (train is not None) and isinstance(train[0], basestring):
            train = [train]

        for sent in train:
            for ngram in ingrams(chain(self._lpad, sent, self._rpad), n):
                self._ngrams.add(ngram)
                context = tuple(ngram[:-1])
                token = ngram[-1]
                self._cfd[context].inc(token)

        if not estimator_args and not estimator_kwargs:
            self._model = ConditionalProbDist(self._cfd, estimator, len(self._cfd))
        else:
            self._model = ConditionalProbDist(self._cfd, estimator, *estimator_args, **estimator_kwargs)

        # recursively construct the lower-order models
        self._backoff = None
        if n > 1:
            self._backoff = MyNgramModel(n-1, train, pad_left, pad_right, estimator, *estimator_args, **estimator_kwargs)
        
            if self._backoff is not None:
                self._backoff_alphas = dict()
    
            # For each condition (or context)
                for ctxt in self._cfd.conditions():
                    pd = self._model[ctxt] # prob dist for this context
                    backoff_ctxt = ctxt[1:]
                    backoff_total_pr = 0
                    total_observed_pr = 0
                    for word in self._cfd[ctxt].keys(): # this is the subset of words that we OBSERVED                    
                        backoff_total_pr += self._backoff.prob(word,backoff_ctxt) 
                        total_observed_pr += pd.prob(word)        
                    assert total_observed_pr <= 1 and total_observed_pr > 0
                    assert backoff_total_pr <= 1 and backoff_total_pr > 0
                    alpha_ctxt = (1.0-total_observed_pr) / (1.0-backoff_total_pr)
        
                    self._backoff_alphas[ctxt] = alpha_ctxt
Code Example #24
def laplace_stuff():
    sent = "am ate ate apple am x."
    sent_tokenized = word_tokenize(sent)
    freq_dist = FreqDist(word.lower() for word in word_tokenize(sent))
    print(freq_dist.items())
    lap = probability.LaplaceProbDist(freq_dist)
    print(lap.generate())
    print(lap.prob("am"))
    print("Finished freq dist, Starting Cond dist")
    # Cond Probabilty
    cond_dist = ConditionalFreqDist()
    context = None
    tokens = sent_tokenized
    # condition on the preceding word
    for token in tokens:
        outcome = token
        cond_dist[context][outcome] += 1  # count this token under the preceding-word condition
        context = token
    print(cond_dist["am"])
    print(cond_dist.items())
Code Example #25
File: serialization.py  Project: xhendyagsx/indosum
def _dump_cpdist(cpdist: ConditionalProbDist) -> dict:
    cfdist = ConditionalFreqDist()
    for cond in cpdist.conditions():
        for k, v in cpdist[cond].freqdist().items():
            cfdist[cond][k] += v

    return {
        'cfdist': cfdist,
        'factory_args': cpdist._factory_args,
        'factory_kw_args': cpdist._factory_kw_args,
    }
Code Example #26
    def train(
        cls,
        docs: Collection[Document],
        gamma_word: float = 0.1,
        gamma_init: float = 0.1,
        gamma_trans: float = 0.1,
        tf_table: Optional[Mapping[Word, float]] = None,
    ) -> 'HMMSummarizer':
        """Train the model on a collection of documents.

        Args:
            docs (Collection[Document]): The collection of documents to train on.
            gamma_word (float): Smoothing value for the "word probability in a document"
                feature.
            gamma_init (float): Smoothing value for the initial probability.
            gamma_trans (float): Smoothing value for the transition probability.
            tf_table (Mapping[Word, float]): A precomputed term-frequency table that is already
                normalized.

        Returns:
            HMM: The trained model.
        """
        init_fdist = FreqDist()
        trans_fdist = ConditionalFreqDist()
        tagged_vecs: list = []
        states = set()

        for doc in docs:
            tags = cls._get_tags(doc.sentences)
            if not tags:
                continue

            init_fdist[tags[0]] += 1
            for prev, tag in zip(tags, tags[1:]):
                trans_fdist[prev][tag] += 1
            vecs = cls._get_feature_vectors(doc, gamma_word, tf=tf_table)
            tagged_vecs.extend(zip(vecs, tags))
            states.update(tags)

        # Initial probability
        init_pdist = LidstoneProbDist(init_fdist, gamma_init, bins=len(states))
        # Transition probability
        trans_pdist = ConditionalProbDist(trans_fdist,
                                          LidstoneProbDist,
                                          gamma_trans,
                                          bins=len(states))
        # Emission probability
        emit_pdist = _GaussianEmission.train(tagged_vecs)
        return cls(init_pdist,
                   trans_pdist,
                   emit_pdist,
                   list(states),
                   gamma=gamma_word,
                   tf_table=tf_table)
Code Example #27
File: features_bigram.py  Project: bromjiri/Presto
    def create_bestbigrams(self):
        word_fd = FreqDist()
        label_word_fd = ConditionalFreqDist()
        score_fn = BigramAssocMeasures.chi_sq

        cut = int((self.total / 2) * 3 / 4)

        for unigrams in self.unigrams_pos[:cut]:
            bigram_finder = BigramCollocationFinder.from_words(unigrams)
            try:
                bigrams = bigram_finder.nbest(score_fn, self.bigram_count)
            except:
                continue
            for word in bigrams:
                word_fd[word] += 1
                label_word_fd['pos'][word] += 1

        for unigrams in self.unigrams_neg[:cut]:
            bigram_finder = BigramCollocationFinder.from_words(unigrams)
            try:
                bigrams = bigram_finder.nbest(score_fn, self.bigram_count)
            except:
                continue
            for word in bigrams:
                word_fd[word] += 1
                label_word_fd['neg'][word] += 1

        pos_word_count = label_word_fd['pos'].N()
        neg_word_count = label_word_fd['neg'].N()
        total_word_count = pos_word_count + neg_word_count
        word_scores = {}

        for word, freq in word_fd.items():
            pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                                   (freq, pos_word_count),
                                                   total_word_count)
            neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                                   (freq, neg_word_count),
                                                   total_word_count)
            word_scores[word] = pos_score + neg_score

        inf_limit = round(len(word_scores.items()) * self.inf_count)
        # print("inf_count:" + str(self.inf_count))
        # print("total: " + str(len(word_scores.items())))
        # print("limit: " + str(inf_limit))

        best = sorted(word_scores.items(),
                      key=lambda tup: tup[1],
                      reverse=True)[:1000]
        print(best)
        bestwords = set([w for w, s in best])
        print(bestwords)
        print(len(bestwords))
        self.bestbigrams = bestwords
Code Example #28
 def __init__(self, tokens, context_func=None, filter=None, key=lambda x:x):
     self._key = key
     self._tokens = tokens
     if not context_func:
         self._context_func = self._default_context
     if filter:
         tokens = [t for t in tokens if filter(t)]
     self._word_to_contexts = CFD((self._key(w), self._context_func(tokens, i))
                                  for i, w in enumerate(tokens))
     self._context_to_words = CFD((self._context_func(tokens, i), self._key(w))
                                  for i, w in enumerate(tokens))
Code Example #29
 def __init__(self, n, words, start_symbol="<$>", end_symbol="</$>", pad_left=True, pad_right=False, estimator=ml_estimator):
     assert (n > 0)
     self._n=n
     self._words=words
     self._counter=ConditionalFreqDist()
     self._start_symbol=start_symbol
     self._end_symbol=end_symbol
     self._pad_left=pad_left
     self._pad_right=pad_right
     self._train()
     super().__init__(self._counter, estimator)
Code Example #30
def create_word_scores():
    # creates lists of all positive and negative words
    posWords = []
    negWords = []
    conWords = []
    with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
        for i in posSentences:
            posWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            posWords.append(posWord)
    with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
        for i in negSentences:
            negWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negWords.append(negWord)
    with open(RT_POLARITY_CON_FILE, 'r') as conSentences:
        for i in conSentences:
            conWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            conWords.append(conWord)
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))
    conWords = list(itertools.chain(*conWords))

    # build frequency distribution of all words and then frequency distributions of words within positive and negative labels
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word.lower()] += 1
        cond_word_fd['pos'][word.lower()] += 1
    for word in negWords:
        word_fd[word.lower()] += 1
        cond_word_fd['neg'][word.lower()] += 1
    for word in conWords:
        word_fd[word.lower()] += 1
        cond_word_fd['con'][word.lower()] += 1
    # finds the number of positive and negative words, as well as the total number of words
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    con_word_count = cond_word_fd['con'].N()
    total_word_count = pos_word_count + neg_word_count + con_word_count

    # builds dictionary of word scores based on chi-squared test
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        con_score = BigramAssocMeasures.chi_sq(cond_word_fd['con'][word],
                                               (freq, con_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score + con_score

    return word_scores
Code Example #31
File: hmm_with_mini_relmin.py  Project: finsqm/MInf
    def doesnt_work(self, y):
        """
		Code adapted from NLTK implementation of supervised training in HMMs
		"""

        estimator = lambda fdist, bins: MLEProbDist(fdist)

        transitions = ConditionalFreqDist()
        outputs = ConditionalFreqDist()
        for sequence in y:
            lasts = None
            for state in sequence:
                if lasts is not None:
                    transitions[lasts][state] += 1
                lasts = state

        N = self.number_of_states + 2
        model = ConditionalProbDist(transitions, estimator, N)

        return model
Code Example #32
def create_word_scores():
    # creates lists of all positive and negative words
    posWords = []
    negWords = []
      
    sentences = read_in_tweets(twitter_data)
    random.shuffle(sentences)
    sentences = sentences[:100000]
    
    posSentences = []
    negSentences = []
    for tup in sentences:
        if tup[0]=='0':
            negSentences.append(tup[1])
        if tup[0]=='4':
            posSentences.append(tup[1])
    
   
    for i in posSentences:
        posWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
        posWords.append(posWord)

    for i in negSentences:
        negWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
        negWords.append(negWord)
        
    
    
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))

    # build frequency distribution of all words and then frequency distributions of words within positive and negative labels
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd.inc(word.lower())
        cond_word_fd['pos'].inc(word.lower())
    for word in negWords:
        word_fd.inc(word.lower())
        cond_word_fd['neg'].inc(word.lower())

    # finds the number of positive and negative words, as well as the total number of words
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    # builds dictionary of word scores based on chi-squared test
    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Code Example #33
File: cfd.py  Project: tocubed/imitare
    def commit(self):
        cfd = self._journal
        self._journal = ConditionalFreqDist()

        for cond in cfd.keys():
            olddist = self[cond]
            cfd[cond] += olddist

        self._sql.executemany('insert or replace into cfd(k, v) values (?, ?)',
                              ((cond, freqdist)
                               for cond, freqdist in cfd.items()))
Code Example #34
def get_repk_count(ngram_count):
    counts = ConditionalFreqDist()
    total_counts = defaultdict(int)
    for ngram, count in ngram_count.items():
        context = len(ngram) - 1
        total_counts[context] += count
        if len(ngram) == 1:
            continue
        if ngram[0] == ngram[-1]:
            counts[context][ngram[-1]] += count
    return counts, total_counts
Code Example #35
File: scoring.py  Project: Herka/nltk-trainer
def sum_category_word_scores(categorized_words, score_fn):
	word_fd = FreqDist()
	category_word_fd = ConditionalFreqDist()
	
	for category, words in categorized_words:
		for word in words:
			word_fd.inc(word)
			category_word_fd[category].inc(word)
	
	scores = collections.defaultdict(int)
	n_xx = category_word_fd.N()
	
	for category in category_word_fd.conditions():
		n_xi = category_word_fd[category].N()
		
		for word, n_ii in iteritems(category_word_fd[category]):
			n_ix = word_fd[word]
			scores[word] += score_fn(n_ii, (n_ix, n_xi), n_xx)
	
	return scores
Code Example #36
File: scoring.py  Project: nikicc/slovene-nltk-tagger
def sum_category_word_scores(categorized_words, score_fn):
    word_fd = FreqDist()
    category_word_fd = ConditionalFreqDist()

    for category, words in categorized_words:
        for word in words:
            word_fd.inc(word)
            category_word_fd[category].inc(word)

    scores = collections.defaultdict(int)
    n_xx = category_word_fd.N()

    for category in category_word_fd.conditions():
        n_xi = category_word_fd[category].N()

        for word, n_ii in category_word_fd[category].iteritems():
            n_ix = word_fd[word]
            scores[word] += score_fn(n_ii, (n_ix, n_xi), n_xx)

    return scores
Code Example #37
class hmm:
    def __init__(self, name=0, tag=0):
        self.name = name
        self.tag = tag

        self.wsj = nltk.corpus.brown.tagged_words()
        self.sentences = nltk.corpus.brown.sents()
        #self.cfdTagAll = ConditionalFreqDist(tag, word for (word, tag) in self.wsj)

    def findTags(self, mostCommon=5):
        if self.tag != 0:
            self.tag_prefix = self.tag
            self.cfdTag = ConditionalFreqDist(
                (tag, word) for (word, tag) in self.wsj
                if tag.startswith(self.tag_prefix))
            return dict((tag, self.cfdTag[tag].most_common(mostCommon))
                        for tag in self.cfdTag.conditions())
        else:
            print("invalid method")

    def findAllTags(self, mostCommon=5):
        self.cfdTagAll = ConditionalFreqDist(
            (tag, word) for (word, tag) in self.wsj)
        for tag in sorted(self.cfdTagAll):
            print(tag, self.cfdTagAll[tag].most_common())
            #print(self.cfdTagAll)
        return dict(self.cfdTagAll)

    def findBigrams(self):
        self.bigram = bigrams([tag for word, tag in self.wsj])
        return self.bigram

    def biFrekvens(self, mostCommon=5):
        self.cfdBigram = ConditionalFreqDist(self.bigram)
        return dict((tag, self.cfdBigram[tag].most_common(mostCommon))
                    for tag in self.cfdBigram)

    def findName(self, mostCommon=5):
        if self.name != 0:
            self.cfdName = ConditionalFreqDist(
                (word.lower(), tag) for (word, tag) in self.wsj)
            return [self.name, self.cfdName[self.name].most_common(mostCommon)]
        else:
            print("invalid method")

    def findCPD(self, typecfd=None):
        if (typecfd == None):
            self.cpdTag = nltk.ConditionalProbDist(self.cfdTag,
                                                   nltk.MLEProbDist)
            return self.cpdTag
        elif (typecfd == "bi"):
            return ConditionalProbDist(self.cfdBigram, nltk.MLEProbDist)
        else:
            print("invalid method")
Code Example #38
def significantWords(untagged_docs, min_chisq=5, ratio=0.75):
    """ 
    Use chisq test of bigram contingency table to measure 
    the association of token with its sentiment

    Parameters
    ----------
    untagged_docs: list of tuples (words, tag)
    min_chisq: lower bound on the chi-square statistic for significance
    ratio: pos/neg ratio, used to determine the sentiment of a word

    Returns
    -------
    significant_words: a dict mapping 'total' and each label to a set of words

    """ 
    significant_words = collections.defaultdict(set)
    freq_dist = FreqDist()
    label_freq_dist = ConditionalFreqDist()
    stopping_words = set(nltk.corpus.stopwords.words('english'))
    for tokens, label in untagged_docs:
        for token in tokens:
            if token.isalpha() and not (token in stopping_words):
                freq_dist.inc(token)
                label_freq_dist[label].inc(token)
    n_xx = label_freq_dist.N()
    #pdb.set_trace()
    for label in label_freq_dist.conditions():
        for word, n_ii in label_freq_dist[label].iteritems():
            n_xi = label_freq_dist[label].N()
            n_ix = freq_dist[word]
            n_oi = n_xi-n_ii
            n_io = n_ix-n_ii
            n_oo = n_xx-n_oi-n_io-n_ii
            chisq = float(n_xx*(n_ii*n_oo - n_io*n_oi)**2)\
                    /((n_ii+n_io)*(n_ii+n_oi)*(n_oo+n_io)*(n_oo+n_oi))
            if chisq > min_chisq and n_ii>10:
                significant_words['total'] |= set([word])
                if float(n_ii)/n_ix > ratio and (n_ix-n_ii) > 1:
                    significant_words[label] |= set([word])
    return significant_words
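A small numeric check of the 2x2 chi-square computed in the loop above; the counts are invented, and the value should agree with BigramAssocMeasures.chi_sq on the same marginals:

from nltk.metrics import BigramAssocMeasures

n_ii, n_ix, n_xi, n_xx = 30, 50, 200, 1000  # word-in-label, word total, label total, grand total
n_oi = n_xi - n_ii
n_io = n_ix - n_ii
n_oo = n_xx - n_oi - n_io - n_ii
chisq = float(n_xx * (n_ii * n_oo - n_io * n_oi) ** 2) \
        / ((n_ii + n_io) * (n_ii + n_oi) * (n_oo + n_io) * (n_oo + n_oi))
print(chisq)
print(BigramAssocMeasures.chi_sq(n_ii, (n_ix, n_xi), n_xx))  # same value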
Code Example #39
File: decompose.py  Project: sindhuula/MT_2016
def dependencybigram(n, lms, wds, trs):
    estimator = lidstone_estimator
    cfd = ConditionalFreqDist()
    for lm, wd, tr in izip(lms, wds, trs):
        for bgram in dep_bigram(n, lm, wd, tr):
            _DPNGRAMS.add(bgram)
            context = bgram[:-1]
            token = bgram[-1]
            cfd[context][token] += 1
    _DPMODEL = ConditionalProbDist(cfd, estimator, len(cfd))
    if n > 1:
        _DPBACKOFF = dependencybigram(n - 1, lms, wds, trs)
Code Example #40
def save_MEMM(duilians, v_size):
    bigram = []
    for duilian in duilians:
        shanglian = duilian[0]
        xialian = duilian[1]
        bigram += [((shang_duiying, xia_qian), xia_hou)
                   for shang_duiying, xia_qian, xia_hou in zip(
                       shanglian[1:], xialian, xialian[1:])]
    ngram = ConditionalProbDist(ConditionalFreqDist(bigram), ELEProbDist,
                                v_size)
    with open(MEMM_save_dir + 'memm.pkl', 'wb') as f:
        pickle.dump(ngram, f)
Code Example #41
    def _make_models(self, tuples):
        self._word_ids = WordIdDictionary()

        # Extract sequence of words, lemmas, and tags
        words, lemmas, tags = tuple(
            map(
                lambda tokens: list(self._word_ids.add_words_transform(tokens)
                                    ), zip(*tuples)))
        self._tags = tags

        # Create models for words, lemmas, and tags
        self._words_ngram = NgramModel(words, self._n)
        self._lemmas_ngram = NgramModel(lemmas, self._n)
        self._tags_ngram = NgramModel(
            tags, 2 * self._n)  # Can afford to use 2 * n-gram size for grammar

        # Map tag and (tag, lemma) to valid lemmas and vocabulary, respectively
        # It's faster to use a list than predicate on unigrams during backoff search
        self._tag_lemmas = ConditionalFreqDist(zip(tags, lemmas))
        self._tag_lemma_words = ConditionalFreqDist(
            zip(zip(tags, lemmas), words))
Code Example #42
def LearnCondDist(Corpus, TrainSet, CFDist=None, saveParses=0, verbose=0):
    if not CFDist: CFDist = ConditionalFreqDist()
    for item in TrainSet:
        for sent in Corpus.readDirectionTree(item, saveParses, verbose):
            CountSubTree(CFDist, sent['TREE'])
    # Make 'S' the top of the tree


##    for cat in ('TRAVEL','TURN','DESC','NAME'):
##        for s in CFDist[cat].samples(): CFDist['S'].inc(s,CFDist[cat].count(s))
    print '\nRead in training tokens for Conditional Frequency:', CFDist
    return CFDist
Code Example #43
def process_bigrams(conn, polarity, total_word_count, best_words):
    cursor = conn.cursor()
    sql = Statements.GRAM_SQL % polarity
    cursor.execute(sql)

    rows = list(cursor.fetchall())
    l = [x[0] for x in rows]
    words_split = map(string.split, l)
    raw_words = [item for sublist in words_split for item in sublist]

    words = []
    for w in raw_words:
        if not (w.startswith("http://") or w.startswith("@")):
            words.append(w)

    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for word in words:
        word_fd.inc(word.lower())
        label_word_fd[polarity].inc(word.lower())

    pos_word_count = label_word_fd[polarity].N()

    word_scores = {}

    for word, freq in word_fd.iteritems():
        score = BigramAssocMeasures.chi_sq(label_word_fd[polarity][word],
                                           (freq, pos_word_count),
                                           total_word_count)
        word_scores[word] = score

    best_raw = sorted(word_scores.iteritems(),
                      key=lambda (w, s): s,
                      reverse=True)[:600]
    best = [x[0] for x in best_raw if x[0] not in STOPWORDS and len(x[0]) > 1]
    best_words.update(best)
    best_features = features(best, polarity)

    bigram_finder = BigramCollocationFinder.from_words(words)
    bigram_finder.apply_freq_filter(4)
    bigrams = bigram_finder.nbest(BigramAssocMeasures.pmi, 10)

    bigram_list = []
    for bt in bigrams:
        x = "%s %s" % (bt[0].lower(), bt[1].lower())
        bigram_list.append(x)

    bigram_features = features(bigram_list, polarity)
    best_features += bigram_features
    cursor.close()
    return best_features
Code Example #44
File: ensemble.py  Project: gymnosophist/pharr_format
    def _train(self, tagged_corpus: list, cutoff: int = 0, verbose: bool = False):
        """
        Initialize this ContextTagger's ``_context_to_tag`` table
        based on the given training data.  In particular, for each
        context ``c`` in the training data, set
        ``_context_to_tag[c]`` to the most frequent tag for that
        context.  However, exclude any contexts that are already
        tagged perfectly by the backoff tagger(s).

        The old value of ``self._context_to_tag`` (if any) is discarded.

        :param tagged_corpus: A tagged corpus.  Each item should be
            a list of (word, tag) tuples.
        :param cutoff: If the most likely tag for a context occurs
            fewer than cutoff times, then exclude it from the
            context-to-tag table for the new tagger.
        :param verbose: Not used
        """

        token_count = hit_count = 0

        # A context is considered 'useful' if it's not already tagged
        # perfectly by the backoff tagger.
        useful_contexts = set()

        # Count how many times each tag occurs in each context.
        fd = ConditionalFreqDist()
        for sentence in tagged_corpus:
            tokens, tags = zip(*sentence)
            for index, (token, tag) in enumerate(sentence):
                # Record the event.
                token_count += 1
                context = self.context(tokens, index, tags[:index])
                if context is None:
                    continue
                fd[context][tag] += 1
                # If the backoff got it wrong, this context is useful:
                if self.backoff is None or tag != self.backoff.tag_one(
                    tokens, index, tags[:index]
                ):
                    useful_contexts.add(context)

        # Build the context_to_tag table -- for each context, figure
        # out what the most likely tag is.  Only include contexts that
        # we've seen at least `cutoff` times.
        for context in useful_contexts:
            best_tag = fd[context].max() # Remove
            weighted_tags = [(k, v/sum(fd[context].values())) for k, v in fd[context].items()]
            hits = fd[context][best_tag] #INT
            if hits > cutoff:
                self._context_to_tag[context] = weighted_tags
                hit_count += hits
Code Example #45
File: lm.py  Project: rayruu/inf1820
    def __init__(self):
        self.bigrams = ConditionalFreqDist()
        self.unigrams = FreqDist()
        sentences = nltk.corpus.brown.sents(
            categories=nltk.corpus.brown.categories()[1:])

        for sent in sentences:
            # We extend the sentence with a None in front to mark the start of
            # the sentence, and a None after to mark the end of the sentence.
            sent = [None] + sent + [None]
            for prev, word in bigrams(sent):
                self.bigrams[prev][word] += 1
                self.unigrams[word] += 1

        self.bigrams = ConditionalProbDist(self.bigrams, LaplaceProbDist)
        self.unigrams = LaplaceProbDist(self.unigrams)

        ############################# modified lm ####################################
        # regular expression:
        self.patterns = [
            (r'.*ing$', 'VBG'),  # gerunds
            (r'.*ed$', 'VBD'),  # simple past
            (r'.*es$', 'VBZ'),  # 3rd singular present
            (r'.*ould$', 'MD'),  # modals
            (r'.*\'s$', 'NN$'),  # possessive nouns
            (r'.*s$', 'NNS'),  # plural nouns
            (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
            (r'.*', 'NN')  # nouns (default)
        ]
        # regular expression (modified):
        self.patternsModified = [
            (r'(.*able|.*ish|.*ible)$', 'JJ'),  # adjectives              # 1
            (r'(The|the|A|a|An|an)$', 'AT'),  # articles                # 2
            (r'(a|an|my|some|the)$', 'DT'),  # determiner              # 3
            (r'(our|its|his|their|my|your|her|out|thy|mine|thine)$',
             'PP$'),  # possessive determiner   # 4
            (r'(.*ily|.*ly)$', 'ADV'),  # adverb                  # 5
            (r'(at|in|of|over|with)$', 'PP'),  # preposition             # 6
            (r'(and|because|but|if|or)$',
             'CNJ'),  # conjunction             # 7
            (r'([\.?!;:]+)$', '.'),  # sentence terminator     # 8
            (r'(\,)$', ','),  # comma                   # 9                    
            (r'(\-)$', '-'),  # dash                    # 10
            (r'.*ing$', 'VBG'),  # gerunds
            (r'.*ed$', 'VBD'),  # simple past
            (r'.*es$', 'VBZ'),  # 3rd singular present
            (r'.*ould$', 'MD'),  # modals
            (r'.*\'s$', 'NN$'),  # possessive nouns
            (r'.*s$', 'NNS'),  # plural nouns
            (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
            (r'.*', 'NN')  # nouns (default)
        ]
Code Example #46
File: hmm3.py  Project: peterfeifanchen/PMP
 def __init__(self,
              trigram_freq,
              alpha1=0.85,
              alpha2=0.1,
              alpha3=0.05,
              bigram_freq=ConditionalFreqDist(),
              unigram_freq=FreqDist()):
     self.alpha1 = alpha1
     self.alpha2 = alpha2
     self.alpha3 = alpha3
     self.trifreqdist = trigram_freq
     self.bifreqdist = bigram_freq
     self.unifreqdist = unigram_freq
Code Example #47
File: my_ngrams.py  Project: namdinh95/ngrams-model
def makeBigram(corpus):
    ''' Use a conditional frequency distribution table
    to store bigram model
    @return: a bigram model '''
    corpus = startEndTag(corpus)
    bigram = ConditionalFreqDist()
    context = ''
    for sentence in corpus:
        for word in sentence:
            if word != START_LINE:
                bigram[context][word] += 1
            context = word
    return bigram
Code Example #48
def sum_category_word_scores(categorized_words, score_fn):
    # get word freq
    word_fd = FreqDist()
    # get conditional freq Dist
    category_word_fd = ConditionalFreqDist()
    # according to catagory
    for category, words in categorized_words:
        for word in words:
            word_fd.inc(word)
            category_word_fd[category].inc(word)

    scores = collections.defaultdict(int)
    n_xx = category_word_fd.N()

    for category in category_word_fd.conditions():
        n_xi = category_word_fd[category].N()

        for word, n_ii in category_word_fd[category].iteritems():
            n_ix = word_fd[word]
            scores[word] += score_fn(n_ii, (n_ix, n_xi), n_xx)
            # return the scores
    return scores
Code Example #49
 def __init__(self, tokens, context_func=None, filter=None, key=lambda x:x):
     self._key = key
     self._tokens = tokens
     if context_func:
         self._context_func = context_func
     else:
         self._context_func = self._default_context
     if filter:
         tokens = [t for t in tokens if filter(t)]
     self._word_to_contexts = CFD((self._key(w), self._context_func(tokens, i))
                                  for i, w in enumerate(tokens))
     self._context_to_words = CFD((self._context_func(tokens, i), self._key(w))
                                  for i, w in enumerate(tokens))
Code example #50
0
    def __init__(self, corpus, n, estimator=None):
        if estimator is None:
            estimator = lambda fdist, bins: MLEProbDist(fdist)
        bi = []
        self._l = []
        for tree in corpus[:n]:
            ts = tree.leaves()
            sent = ['START'] + ts
            bi += nltk.bigrams(sent)
            self._l.append(len(sent))

        cfd = ConditionalFreqDist(bi)
        self._model = ConditionalProbDist(cfd, estimator, len(cfd))
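
With the resulting ConditionalProbDist, a sentence can be scored as a product of bigram probabilities; the helper below is a sketch built on `self._model`, not part of the original class.

import math

def sentence_logprob(model, words):
    # Hypothetical helper: log2 probability of `words` under the bigram
    # ConditionalProbDist built above (pass in self._model).
    logp = 0.0
    prev = 'START'
    for w in words:
        p = model[prev].prob(w) if prev in model else 0.0
        if p == 0:
            return float('-inf')  # unseen bigram under the MLE estimator
        logp += math.log(p, 2)
        prev = w
    return logp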
Code example #51
0
File: assignment2.py Project: anonymous1025/AIT690
def generateModel(MergedText, ngramModel, numSentences):
    '''
    Discards sentences that contain fewer words than n,
    then builds the n-gram model and generates sentences from it.
    '''
    vocabulary = set(MergedText)

    MergedText = delete_short(
        MergedText, ngramModel
    )  # drop sentences that contain fewer than n words

    nngrams = boundaries(list(ngrams(
        MergedText, ngramModel)))  # generate the n-grams without crossing sentence boundaries

    cfd = ConditionalFreqDist()
    ngramSet = set()
    vocabularyOfWords = set()
    fdist = FreqDist()

    ProbDictionary = defaultdict(list)

    #Generate conditional frequency distribution
    for ngram in nngrams:
        ngramSet.add(ngram)
        initial_text = tuple(
            ngram[:-1])  #this is the initial_text from the ngram (n-1)
        last_word = ngram[-1]  #this is the last word from ngram
        cfd[initial_text][last_word] += 1

        #Laplace smoothing was considered but left disabled; plain relative frequencies are used below
        #laplace_prob = [1.0 * (1+cfd[initial_text][last_word]) / (len(vocabulary)+cfd[initial_text].N())]
        ProbDictionary[initial_text].append(
            last_word)  #collect every observed next word for this context
        vocabularyOfWords.add(last_word)

    proDic = {}  #normalized probability of each word following each (n-1)-gram context
    for key, value in ProbDictionary.items():
        words = set(value)
        proDic[key] = {}
        sum_freq = 0
        for word in words:
            fre_word = value.count(word) / float(
                len(value))  #relative frequency of this word within its context
            proDic[key][word] = fre_word
            sum_freq += fre_word
        for key_, value_ in proDic[key].items():
            proDic[key][
                key_] = value_ / sum_freq  #normalization of the probability
    #generate sentences
    generateSentences(nngrams, cfd, ngramModel, proDic, numSentences)
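
generateSentences itself is not included in this excerpt; a sampler over the normalized table could look roughly like the following (hypothetical sketch, assuming n >= 2 so the contexts are non-empty tuples).

import random

def sample_sentence(proDic, n, max_words=25):
    # Hypothetical sketch of a sentence sampler; the real generateSentences
    # called above is not part of this excerpt.
    context = random.choice(list(proDic.keys()))  # arbitrary starting (n-1)-gram
    sentence = list(context)
    for _ in range(max_words):
        if context not in proDic:
            break
        words, probs = zip(*proDic[context].items())
        sentence.append(random.choices(words, weights=probs, k=1)[0])
        context = tuple(sentence[-(n - 1):])
    return ' '.join(sentence)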
Code example #52
0
    def __init__(self,
                 beam=1000,
                 max_guess=20,
                 rare_treshold=10,
                 capitalization=True):
        self._uni = FreqDist()
        self._bi = ConditionalFreqDist()
        self._tri = ConditionalFreqDist()

        self._wd = ConditionalFreqDist()

        self._l1 = 0.0
        self._l2 = 0.0
        self._l3 = 0.0

        self._beam_size = beam
        self._use_capitalization = capitalization
        self._max_guess = max_guess
        self._treshold = rare_treshold

        self._unk = Guesser(10)
        self._analyzer = None
        self.cache = {}
Code example #53
0
import collections

from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist


def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    # gathers the most informative words for each label to improve classification
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for label, words in labelled_words:
        for word in words:
            word_fd[word] += 1
            label_word_fd[label][word] += 1

    n_xx = label_word_fd.N()
    high_info_words = set()

    for label in label_word_fd.conditions():
        n_xi = label_word_fd[label].N()
        word_scores = collections.defaultdict(int)

        # score every word observed under this label (kept inside the label loop)
        for word, n_ii in label_word_fd[label].items():
            n_ix = word_fd[word]
            word_scores[word] = score_fn(n_ii, (n_ix, n_xi), n_xx)

        bestwords = [word for word, score in word_scores.items() if score >= min_score]
        high_info_words |= set(bestwords)
    return high_info_words
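
Called, for example, on the movie_reviews corpus (illustrative only), it returns the reduced vocabulary worth keeping as features.

from nltk.corpus import movie_reviews

# illustrative usage of high_information_words
labelled = [(cat, movie_reviews.words(categories=[cat]))
            for cat in movie_reviews.categories()]
info_words = high_information_words(labelled, min_score=5)
print(len(info_words))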
Code example #54
0
File: tnt.py Project: Arttii/TextBlob
from math import log
from operator import itemgetter

from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.tag.api import TaggerI


class TnT(TaggerI):
    '''
    TnT - Statistical POS tagger

    IMPORTANT NOTES:

    * DOES NOT AUTOMATICALLY DEAL WITH UNSEEN WORDS

      - It is possible to provide an untrained POS tagger to
        create tags for unknown words, see __init__ function

    * SHOULD BE USED WITH SENTENCE-DELIMITED INPUT

      - Due to the nature of this tagger, it works best when
        trained over sentence delimited input.
      - However it still produces good results if the training
        data and testing data are separated on all punctuation eg: [,.?!]
      - Input for training is expected to be a list of sentences
        where each sentence is a list of (word, tag) tuples
      - Input for tag function is a single sentence
        Input for tagdata function is a list of sentences
        Output is of a similar form

    * Function provided to process text that is unsegmented

      - Please see basic_sent_chop()


    TnT uses a second order Markov model to produce tags for
    a sequence of input, specifically:

      argmax [Proj(P(t_i|t_i-1,t_i-2)P(w_i|t_i))] P(t_T+1 | t_T)

    IE: the maximum projection of a set of probabilities

    The set of possible tags for a given word is derived
    from the training data. It is the set of all tags
    that exact word has been assigned.

    To speed this up and gain precision, we use log addition
    instead of multiplication, specifically:

      argmax [Sigma(log(P(t_i|t_i-1,t_i-2))+log(P(w_i|t_i)))] +
             log(P(t_T+1|t_T))

    The probability of a tag for a given word is the linear
    interpolation of 3 markov models; a zero-order, first-order,
    and a second order model.

      P(t_i| t_i-1, t_i-2) = l1*P(t_i) + l2*P(t_i| t_i-1) +
                             l3*P(t_i| t_i-1, t_i-2)

    A beam search is used to limit the memory usage of the algorithm.
    The degree of the beam can be changed using N in the initialization.
    N represents the maximum number of possible solutions to maintain
    while tagging.

    It is possible to differentiate the tags which are assigned to
    capitalized words. However this does not result in a significant
    gain in the accuracy of the results.
    '''

    def __init__(self, unk=None, Trained=False, N=1000, C=False):
        '''
        Construct a TnT statistical tagger. Tagger must be trained
        before being used to tag input.

        :param unk: instance of a POS tagger, conforms to TaggerI
        :type  unk:(TaggerI)
        :param Trained: Indication that the POS tagger is trained or not
        :type  Trained: boolean
        :param N: Beam search degree (see above)
        :type  N:(int)
        :param C: Capitalization flag
        :type  C: boolean

        Initializer, creates frequency distributions to be used
        for tagging

        _lx values represent the portion of the tri/bi/uni taggers
        to be used to calculate the probability

        N value is the number of possible solutions to maintain
        while tagging. A good value for this is 1000

        C is a boolean value which specifies to use or
        not use the Capitalization of the word as additional
        information for tagging.
        NOTE: using capitalization may not increase the accuracy
        of the tagger
        '''

        self._uni  = FreqDist()
        self._bi   = ConditionalFreqDist()
        self._tri  = ConditionalFreqDist()
        self._wd   = ConditionalFreqDist()
        self._eos  = ConditionalFreqDist()
        self._l1   = 0.0
        self._l2   = 0.0
        self._l3   = 0.0
        self._N    = N
        self._C    = C
        self._T    = Trained

        self._unk = unk

        # statistical tools (ignore or delete me)
        self.unknown = 0
        self.known = 0

    def train(self, data):
        '''
        Uses a set of tagged data to train the tagger.
        If an unknown word tagger is specified,
        it is trained on the same data.

        :param data: List of lists of (word, tag) tuples
        :type data: tuple(str)
        '''

        # Ensure that local C flag is initialized before use
        C = False

        if self._unk is not None and self._T == False:
            self._unk.train(data)

        for sent in data:
            history = [('BOS',False), ('BOS',False)]
            for w, t in sent:

                # if capitalization is requested,
                # and the word begins with a capital
                # set local flag C to True
                if self._C and w[0].isupper(): C=True

                self._wd[w][t] += 1
                self._uni[(t, C)] += 1
                self._bi[history[1]][(t, C)] += 1
                self._tri[tuple(history)][(t, C)] += 1

                history.append((t,C))
                history.pop(0)

                # set local flag C to false for the next word
                C = False

            self._eos[t]['EOS'] += 1


        # compute lambda values from the trained frequency distributions
        self._compute_lambda()

        #(debugging -- ignore or delete me)
        #print "lambdas"
        #print i, self._l1, i, self._l2, i, self._l3


    def _compute_lambda(self):
        '''
        creates lambda values based upon training data

        NOTE: no need to explicitly reference C,
        it is contained within the tag variable :: tag == (tag,C)

        for each tag trigram (t1, t2, t3)
        depending on the maximum value of
        - f(t1,t2,t3)-1 / f(t1,t2)-1
        - f(t2,t3)-1 / f(t2)-1
        - f(t3)-1 / N-1

        increment l3,l2, or l1 by f(t1,t2,t3)

        ISSUES -- Resolutions:
        if 2 values are equal, increment both lambda values
        by (f(t1,t2,t3) / 2)
        '''

        # temporary lambda variables
        tl1 = 0.0
        tl2 = 0.0
        tl3 = 0.0

        # for each t1,t2 in system
        for history in self._tri.conditions():
            (h1, h2) = history

            # for each t3 given t1,t2 in system
            # (NOTE: tag actually represents (tag,C))
            # However no effect within this function
            for tag in self._tri[history]:

                # if there has only been 1 occurrence of this tag in the data
                # then ignore this trigram.
                if self._uni[tag] == 1:
                    continue

                # safe_div provides a safe floating point division
                # it returns -1 if the denominator is 0
                c3 = self._safe_div((self._tri[history][tag]-1), (self._tri[history].N()-1))
                c2 = self._safe_div((self._bi[h2][tag]-1), (self._bi[h2].N()-1))
                c1 = self._safe_div((self._uni[tag]-1), (self._uni.N()-1))


                # if c1 is the maximum value:
                if (c1 > c3) and (c1 > c2):
                    tl1 += self._tri[history][tag]

                # if c2 is the maximum value
                elif (c2 > c3) and (c2 > c1):
                    tl2 += self._tri[history][tag]

                # if c3 is the maximum value
                elif (c3 > c2) and (c3 > c1):
                    tl3 += self._tri[history][tag]

                # if c3, and c2 are equal and larger than c1
                elif (c3 == c2) and (c3 > c1):
                    tl2 += float(self._tri[history][tag]) /2.0
                    tl3 += float(self._tri[history][tag]) /2.0

                # if c1, and c2 are equal and larger than c3
                # this might be a dumb thing to do....(not sure yet)
                elif (c2 == c1) and (c1 > c3):
                    tl1 += float(self._tri[history][tag]) /2.0
                    tl2 += float(self._tri[history][tag]) /2.0

                # otherwise there might be a problem
                # eg: all values = 0
                else:
                    #print "Problem", c1, c2 ,c3
                    pass

        # Lambda normalisation:
        # ensures that l1+l2+l3 = 1
        self._l1 = tl1 / (tl1+tl2+tl3)
        self._l2 = tl2 / (tl1+tl2+tl3)
        self._l3 = tl3 / (tl1+tl2+tl3)



    def _safe_div(self, v1, v2):
        '''
        Safe floating point division function, does not allow division by 0
        returns -1 if the denominator is 0
        '''
        if v2 == 0:
            return -1
        else:
            return float(v1) / float(v2)

    def tagdata(self, data):
        '''
        Tags each sentence in a list of sentences

        :param data:list of list of words
        :type data: [[string,],]
        :return: list of list of (word, tag) tuples

        Invokes tag(sent) function for each sentence
        compiles the results into a list of tagged sentences
        each tagged sentence is a list of (word, tag) tuples
        '''
        res = []
        for sent in data:
            res1 = self.tag(sent)
            res.append(res1)
        return res


    def tag(self, data):
        '''
        Tags a single sentence

        :param data: list of words
        :type data: [string,]

        :return: [(word, tag),]

        Calls recursive function '_tagword'
        to produce a list of tags

        Associates the sequence of returned tags
        with the correct words in the input sequence

        returns a list of (word, tag) tuples
        '''

        current_state = [(['BOS', 'BOS'], 0.0)]

        sent = list(data)

        tags = self._tagword(sent, current_state)

        res = []
        for i in range(len(sent)):
            # unpack and discard the C flags
            (t,C) = tags[i+2]
            res.append((sent[i], t))

        return res


    def _tagword(self, sent, current_states):
        '''
        :param sent : List of words remaining in the sentence
        :type sent  : [word,]
        :param current_states : List of possible tag combinations for
                                the sentence so far, and the log probability
                                associated with each tag combination
        :type current_states  : [([tag, ], logprob), ]

        Tags the first word in the sentence and
        recursively tags the remainder of the sentence

        Uses formula specified above to calculate the probability
        of a particular tag
        '''

        # if this word marks the end of the sentence,
        # return the most probable tag
        if sent == []:
            (h, logp) = current_states[0]
            return h

        # otherwise there are more words to be tagged
        word = sent[0]
        sent = sent[1:]
        new_states = []

        # if the Capitalisation is requested,
        # initialise the flag for this word
        C = False
        if self._C and word[0].isupper(): C=True

        # if word is known
        # compute the set of possible tags
        # and their associated log probabilities
        if word in self._wd.conditions():
            self.known += 1

            for (history, curr_sent_logprob) in current_states:
                logprobs = []

                for t in self._wd[word]:
                    p_uni = self._uni.freq((t,C))
                    p_bi = self._bi[history[-1]].freq((t,C))
                    p_tri = self._tri[tuple(history[-2:])].freq((t,C))
                    p_wd = float(self._wd[word][t])/float(self._uni[(t,C)])
                    p = self._l1 *p_uni + self._l2 *p_bi + self._l3 *p_tri
                    p2 = log(p, 2) + log(p_wd, 2)

                    logprobs.append(((t,C), p2))


                # compute the result of appending each tag to this history
                for (tag, logprob) in logprobs:
                    new_states.append((history + [tag],
                                       curr_sent_logprob + logprob))




        # otherwise a new word, set of possible tags is unknown
        else:
            self.unknown += 1

            # since most classifiers cannot return a set of possible tags
            # together with a probability for each specific tag,
            # treat any unknown word's tag as certain
            p = 1

            # if no unknown word tagger has been specified
            # then use the tag 'Unk'
            if self._unk is None:
                tag = ('Unk',C)

            # otherwise apply the unknown word tagger
            else :
                [(_w, t)] = list(self._unk.tag([word]))
                tag = (t,C)

            for (history, logprob) in current_states:
                history.append(tag)

            new_states = current_states



        # now have computed a set of possible new_states

        # sort states by log prob
        # set is now ordered greatest to least log probability
        new_states.sort(reverse=True, key=itemgetter(1))

        # del everything after N (threshold)
        # this is the beam search cut
        if len(new_states) > self._N:
            new_states = new_states[:self._N]


        # compute the tags for the rest of the sentence
        # return the best list of tags for the sentence
        return self._tagword(sent, new_states)
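
A minimal train/tag round trip (not part of tnt.py itself); unseen words come back tagged 'Unk' unless an unknown-word tagger is passed to the constructor.

from nltk.corpus import brown

# usage sketch: train on a slice of Brown and tag a short sentence
tnt_tagger = TnT()
tnt_tagger.train(brown.tagged_sents()[:2000])
print(tnt_tagger.tag(['The', 'dog', 'barked', 'loudly']))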
Code example #55
0
File: session-3.py Project: datakid/nltk
for filename in files:
    f = open(os.path.join(corpus_path, filename), "r")
    metadata, raw_text = f.read().split("<!--end metadata-->")
    all_metadata.append(parse_metadata(metadata))


# <markdowncell>
# Now that we're confident that the function works, let's find out a bit about the corpus.
# As a start, it would be useful to know which years the texts are from. Are they evenly distributed over time? A graph will tell us!

# <codecell>
#import conditional frequency distribution
from nltk.probability import ConditionalFreqDist
import matplotlib
%matplotlib inline
cfdist = ConditionalFreqDist()
for filename in os.listdir(corpus_path):
    text = open(os.path.join(corpus_path, filename)).read()
    #split text of file on 'end metadata'
    text = text.split("<!--end metadata-->")
    #parse metadata using previously defined function "parse_metadata"
    metadata = parse_metadata(text[0])
    #skip all speeches for which there is no exact date
    if metadata['Date'][0] == 'c':
        continue
    #build a frequency distribution graph by year, that is, take the final bit of the 'Date' string after '/'
    cfdist['count'][metadata['Date'].split('/')[-1]] += 1
cfdist.plot()

# <markdowncell>
# Now let's build another graph, but this time by the 'Description' field:
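
# <markdowncell>
# (Sketch only: the notebook's next cell is not part of this excerpt, so the code below
# simply mirrors the year-based cell above, switched to the 'Description' field.)

# <codecell>
cfdist2 = ConditionalFreqDist()
for filename in os.listdir(corpus_path):
    text = open(os.path.join(corpus_path, filename)).read()
    text = text.split("<!--end metadata-->")
    metadata = parse_metadata(text[0])
    cfdist2['count'][metadata['Description']] += 1
cfdist2.plot()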
Code example #56
0
File: ngram.py Project: curtislb/ReviewTranslation
    def __init__(self, n, train, pad_left=True, pad_right=False,
                 estimator=None, **estimator_kwargs):
        """
        Create an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training. See model.doctest for more detailed testing
            >>> from nltk.corpus import brown
            >>> lm = NgramModel(3, brown.words(categories='news'))
            >>> lm
            <NgramModel with 91603 3-grams>
            >>> lm._backoff
            <NgramModel with 62888 2-grams>
            >>> lm.entropy(brown.words(categories='humor'))
            ... # doctest: +ELLIPSIS
            12.0399...
        :param n: the order of the language model (ngram size)
        :type n: int
        :param train: the training text
        :type train: list(str) or list(list(str))
        :param pad_left: whether to pad the left of each sentence with an (n-1)-gram of empty strings
        :type pad_left: bool
        :param pad_right: whether to pad the right of each sentence with an (n-1)-gram of empty strings
        :type pad_right: bool
        :param estimator: a function for generating a probability distribution
        :type estimator: a function that takes a ConditionalFreqDist and
            returns a ConditionalProbDist
        :param estimator_kwargs: Extra keyword arguments for the estimator
        :type estimator_kwargs: (any)
        """

        # protection from cryptic behavior for calling programs
        # that use the pre-2.0.2 interface
        assert(isinstance(pad_left, bool))
        assert(isinstance(pad_right, bool))

        # make sure n is greater than zero, otherwise print it
        assert (n > 0), n

        # For explicitness save the check whether this is a unigram model
        self.is_unigram_model = (n == 1)
        # save the ngram order number
        self._n = n
        # save left and right padding
        self._lpad = ('',) * (n - 1) if pad_left else ()
        self._rpad = ('',) * (n - 1) if pad_right else ()

        if estimator is None:
            estimator = _estimator

        cfd = ConditionalFreqDist()

        # set read-only ngrams set (see property declaration below to reconfigure)
        self._ngrams = set()
        '''
        # If given a list of strings instead of a list of lists, create enclosing list
        if (train is not None) and isinstance(train[0], compat.string_types):
            train = [train]
        '''
        # we need to keep track of the number of word types we encounter
        vocabulary = set()
        count = 0
        #for review in train:
        for review in read_reviews(train):

            count += 1
            if count % 10000 == 0:
                print(str(count) + ' reviews processed')

            #for testing with small training set
            #if count > 100000:
            #    break

            #newly added, each element is dict of each review
            review_text = review['text']

            #separate into tokens, lowercase
            tokens = word_tokenize(review_text)
            tokens = [w.lower() for w in tokens]

            #updated for new nltk api
            raw_ngrams = ngrams(tokens, n, pad_left, pad_right, left_pad_symbol='', right_pad_symbol='...EOR...')
            for ngram in raw_ngrams:
                self._ngrams.add(ngram)
                context = tuple(ngram[:-1])
                token = ngram[-1]
                cfd[context][token] += 1
                vocabulary.add(token)

        # Unless number of bins is explicitly passed, we should use the number
        # of word types encountered during training as the bins value.
        # If right padding is on, this includes the padding symbol.
        
        if 'bins' not in estimator_kwargs:
            estimator_kwargs['bins'] = len(vocabulary) * 2

        self._model = ConditionalProbDist(cfd, estimator, **estimator_kwargs)
        self._probdist = self._model
        
        
        # recursively construct the lower-order models
        if not self.is_unigram_model:
            self._backoff = NgramModel(n-1, train,
                                        pad_left, pad_right,
                                        estimator,
                                        **estimator_kwargs)

            self._backoff_alphas = dict()
            # For each condition (or context)
            for ctxt in cfd.conditions():
                backoff_ctxt = ctxt[1:]
                backoff_total_pr = 0.0
                total_observed_pr = 0.0

                # this is the subset of words that we OBSERVED following
                # this context.
                # i.e. Count(word | context) > 0
                for word in self._words_following(ctxt, cfd):
                    total_observed_pr += self.prob(word, ctxt)
                    # we also need the total (n-1)-gram probability of
                    # words observed in this n-gram context
                    backoff_total_pr += self._backoff.prob(word, backoff_ctxt)

                assert (0 <= total_observed_pr <= 1), total_observed_pr
                # beta is the remaining probability weight after we factor out
                # the probability of observed words.
                # As a sanity check, both total_observed_pr and backoff_total_pr
                # must be GE 0, since probabilities are never negative
                beta = 1.0 - total_observed_pr

                # backoff total has to be less than one, otherwise we get
                # an error when we try subtracting it from 1 in the denominator
                assert (0 <= backoff_total_pr < 1), backoff_total_pr
                alpha_ctxt = beta / (1.0 - backoff_total_pr)

                self._backoff_alphas[ctxt] = alpha_ctxt
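
The alphas computed above are the usual Katz-style backoff weights; a prob() method consistent with them would look roughly like the sketch below (the real method is not included in this excerpt).

    # Hedged sketch of prob() using the backoff alphas computed above; the actual
    # method from ngram.py is not part of this excerpt.
    def prob(self, word, context):
        context = tuple(context)
        if self.is_unigram_model:
            return self._model[()].prob(word)       # unigram estimate
        if context + (word,) in self._ngrams:
            return self._model[context].prob(word)  # observed n-gram: use it directly
        # unseen n-gram: back off to the (n-1)-gram model, scaled by this context's alpha
        alpha = self._backoff_alphas.get(context, 1.0)
        return alpha * self._backoff.prob(word, context[1:])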
Code example #57
0
File: text.py Project: prz3m/kind2anki
from collections import defaultdict
from functools import reduce

from nltk.metrics import f_measure
from nltk.probability import FreqDist
from nltk.probability import ConditionalFreqDist as CFD


class ContextIndex(object):
    """
    A bidirectional index between words and their 'contexts' in a text.
    The context of a word is usually defined to be the words that occur
    in a fixed window around the word; but other definitions may also
    be used by providing a custom context function.
    """

    @staticmethod
    def _default_context(tokens, i):
        """One left token and one right token, normalized to lowercase"""
        left = tokens[i - 1].lower() if i != 0 else '*START*'
        right = tokens[i + 1].lower() if i != len(tokens) - 1 else '*END*'
        return (left, right)

    def __init__(self, tokens, context_func=None, filter=None, key=lambda x: x):
        self._key = key
        self._tokens = tokens
        if context_func:
            self._context_func = context_func
        else:
            self._context_func = self._default_context
        if filter:
            tokens = [t for t in tokens if filter(t)]
        self._word_to_contexts = CFD(
            (self._key(w), self._context_func(tokens, i)) for i, w in enumerate(tokens)
        )
        self._context_to_words = CFD(
            (self._context_func(tokens, i), self._key(w)) for i, w in enumerate(tokens)
        )

    def tokens(self):
        """
        :rtype: list(str)
        :return: The document that this context index was
            created from.
        """
        return self._tokens

    def word_similarity_dict(self, word):
        """
        Return a dictionary mapping from words to 'similarity scores,'
        indicating how often these two words occur in the same
        context.
        """
        word = self._key(word)
        word_contexts = set(self._word_to_contexts[word])

        scores = {}
        for w, w_contexts in self._word_to_contexts.items():
            scores[w] = f_measure(word_contexts, set(w_contexts))

        return scores

    def similar_words(self, word, n=20):
        scores = defaultdict(int)
        for c in self._word_to_contexts[self._key(word)]:
            for w in self._context_to_words[c]:
                if w != word:
                    scores[w] += (
                        self._context_to_words[c][word] * self._context_to_words[c][w]
                    )
        return sorted(scores, key=scores.get, reverse=True)[:n]

    def common_contexts(self, words, fail_on_unknown=False):
        """
        Find contexts where the specified words can all appear; and
        return a frequency distribution mapping each context to the
        number of times that context was used.

        :param words: The words used to seed the similarity search
        :type words: str
        :param fail_on_unknown: If true, then raise a value error if
            any of the given words do not occur at all in the index.
        """
        words = [self._key(w) for w in words]
        contexts = [set(self._word_to_contexts[w]) for w in words]
        empty = [words[i] for i in range(len(words)) if not contexts[i]]
        common = reduce(set.intersection, contexts)
        if empty and fail_on_unknown:
            raise ValueError("The following word(s) were not found:", " ".join(words))
        elif not common:
            # nothing in common -- just return an empty freqdist.
            return FreqDist()
        else:
            fd = FreqDist(
                c for w in words for c in self._word_to_contexts[w] if c in common
            )
            return fd
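
Typical use, with an illustrative corpus and query words:

from nltk.corpus import brown

# usage sketch: index a tokenized text, then query similar words and shared contexts
idx = ContextIndex(list(brown.words(categories='news'))[:20000], key=lambda w: w.lower())
print(idx.similar_words('money', n=10))
print(idx.common_contexts(['money', 'time']).most_common(5))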
Code example #58
0
File: classifier.py Project: ps13/ResV1
print "Negative: " + str(n)
print "Neutral: " + str(nt)
        


# Extracting features
# Using the feature set provided
#fvecs = [(make_tweet_dict(t),s) for (t,s) in tweets]

# Extracting features from data
fvecs = [(get_tweet_features(t, set()),s) for (t,s) in tweets]
#pprint.pprint(fvecs)

# Extract best word features
word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()
#
for (feats, label) in fvecs:
  #print label
  for key in feats:
    #print key
    if feats[key]:
      word_fd.inc(key)
      #print word_fd
      label_word_fd[label].inc(key)
      #print label_word_fd[label]
#
##print word_fd['positive']
##print label_word_fd      
print label_word_fd.conditions()
cls_set=label_word_fd.conditions()
Code example #59
0
File: workshop2_ex3.py Project: lberezy/LangComp
from nltk.corpus import brown
from nltk.probability import ConditionalFreqDist



genres = ['news', 'romance']
days = set("Monday Tuesday Wednesday Thursday Friday Saturday Sunday".split())


cfd = ConditionalFreqDist(
	(genre,word)
	for genre in genres
	for word in brown.words(categories=genre)
	if word in days
	)

cfd.tabulate()

def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    # NOTE: the def line above is reconstructed (assumed name, parameters and defaults);
    # only the body survived in this excerpt, and the name is implied by the call below.
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])

evaluations.append(evaluate_classifier(bigram_word_feats,BigramAssocMeasures.chi_sq))#Works best for this Data
#evaluations.append(evaluate_classifier(bigram_word_feats,BigramAssocMeasures.jaccard))
#evaluations.append(evaluate_classifier(bigram_word_feats,BigramAssocMeasures.likelihood_ratio))


# In[3]:

from nltk.collocations import *
from nltk.probability import FreqDist
from nltk.probability import ConditionalFreqDist
word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()

testNegWords = movie_reviews.words(categories=['neg'])
testPosWords = movie_reviews.words(categories=['pos'])

for word in testNegWords:
    word_fd[word.lower()]+=1
    label_word_fd['neg'][word.lower()]+=1
for word in testPosWords:
    word_fd[word.lower()]+=1
    label_word_fd['pos'][word.lower()]+=1
print(word_fd.N(),word_fd.B(),word_fd.most_common(20))
print(label_word_fd.N(),label_word_fd.conditions(),label_word_fd.items())
print(label_word_fd['pos'].N(),label_word_fd['neg'].N())
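
With these two distributions in hand, the usual next step (not shown in this excerpt) is to score each word's association with the labels, for instance with a chi-square measure, and keep the top scorers as candidate features.

from nltk.collocations import BigramAssocMeasures

# hedged continuation: chi-square informativeness per word from word_fd / label_word_fd
pos_word_count = label_word_fd['pos'].N()
neg_word_count = label_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count

word_scores = {}
for word, freq in word_fd.items():
    pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                           (freq, pos_word_count), total_word_count)
    neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                           (freq, neg_word_count), total_word_count)
    word_scores[word] = pos_score + neg_score

best_words = sorted(word_scores, key=word_scores.get, reverse=True)[:100]
print(best_words[:20])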