def cDist(self, params):
    """Return MLE word distributions for three part-of-speech tags.

    Collects the ``filtered_speech_tags`` field of every matching document
    in ``self.col`` and counts the words conditioned on their tag.

    :param params: mapping with the keys ``"president"`` and ``"speech"``.
    :return: tuple ``(VB, NN, JJ)`` of ``MLEProbDist`` objects built from
        the words tagged ``VBP``, ``NN`` and ``JJ``. A tag that never
        occurs yields a distribution over an empty ``FreqDist`` instead of
        one wrapping ``None``.
    """
    president = params["president"]
    speech = params["speech"]

    # NOTE(review): the branch tests self.president while the query uses
    # params["president"] -- confirm that the two are meant to differ.
    if self.president == "All presidents":
        match = {"type": speech}
    else:
        match = {"name": president, "type": speech}
    pipeline = [
        {"$match": match},
        {"$project": {"tags": "$filtered_speech_tags"}},
    ]

    tags = []
    for document in self.col.aggregate(pipeline):
        tags.extend(document["tags"])

    # Group word frequencies by part-of-speech tag.
    cfdist = ConditionalFreqDist()
    for word, tag in tags:
        cfdist[tag][word] += 1

    # Index instead of .get(): indexing returns an empty FreqDist for an
    # unseen tag, whereas .get() would hand None to MLEProbDist.
    VB = MLEProbDist(cfdist["VBP"])
    NN = MLEProbDist(cfdist["NN"])
    JJ = MLEProbDist(cfdist["JJ"])
    return VB, NN, JJ
def Ae_kappa(self, cA, cB):
    """Return the summed product of per-label rates for coders cA and cB.

    For every label, the number of times each coder used it is divided by
    ``len(self.I)``; the products of the two rates are added up.
    """
    item_count = float(len(self.I))
    by_label = ConditionalFreqDist(
        (record['labels'], record['coder']) for record in self.data
    )
    return sum(
        (
            (by_label[label][cA] / item_count) * (by_label[label][cB] / item_count)
            for label in by_label.conditions()
        ),
        0.0,
    )
def _setSelectedPOSTags(self):
    """Populate ``self.selective_pos`` from cache or from the Brown corpus.

    If ``selective_pos.bin`` loads to a truthy value it is used as is.
    Otherwise words tagged ADJ, ADV or CNJ are counted (lower-cased), and
    every word seen more than 4 times is recorded once under its tag; the
    result is saved back to ``selective_pos.bin``.
    """
    cached = self._loadData('selective_pos.bin')
    if cached:
        self.selective_pos = cached
        return

    # Gather every (word, tag) pair of the corpus.
    tagged_sentences = brown.tagged_sents(simplify_tags=True)
    self.selected_tags = ["ADJ", "ADV", "CNJ"]
    self.selective_pos = ConditionalFreqDist()

    counts_by_tag = ConditionalFreqDist()
    for tagged_sentence in tagged_sentences:
        for word, tag in tagged_sentence:
            if tag not in self.selected_tags:
                continue
            counts_by_tag[tag].inc(str(word).lower())

    # Keep only the words whose frequency exceeds 4.
    for tag in counts_by_tag.conditions():
        word_counts = counts_by_tag[tag]
        for word in word_counts.keys():
            if word_counts[word] > 4:
                self.selective_pos[tag].inc(word)

    self._saveData('selective_pos.bin', self.selective_pos)
def high_information_words(labeled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5): """ To eliminate low information feature words for set of words for EFFICIENCY :param labeled_words: list of 2 tuples [(label, words)] label -> is a classification label (pos / neg) words -> is a list of words that occur under that label :param score_fn: a scoring function to measure how informative that word is :param min_score: the minimum score for a word to be included as MOST INFORMATIVE WORD :return: a set of high informative words """ print "Counting Word Frequencies" word_fq = FreqDist() labeled_word_fq = ConditionalFreqDist() for label, words in labeled_words: for word in words: word_fq[word] += 1 labeled_word_fq[label][word] += 1 n_xx = labeled_word_fq.N() high_info_words = set() for label in labeled_word_fq.conditions(): n_xi = labeled_word_fq[label].N() word_scores = collections.defaultdict(int) for word, n_ii in labeled_word_fq[label].iteritems(): n_ix = word_fq[word] score = score_fn(n_ii, (n_ix, n_xi), n_xx) word_scores[word] = score bestwords = [word for word, score in word_scores.iteritems() if score >= min_score] high_info_words |= set(bestwords) return high_info_words
def readFormatedData(formatedData):
    """Build forward and reversed conditional bigram counts.

    Each entry is ``(text, count)`` where ``text`` holds two words
    separated by a single space. Bigrams whose count is below 5 have their
    second word replaced by ``"unknown"``.

    Returns ``(forward, backward)``: ``forward[w1][w2]`` and
    ``backward[w2][w1]`` hold the same counts.
    """
    forward = ConditionalFreqDist()
    backward = ConditionalFreqDist()

    for entry in formatedData:
        tokens = entry[0].split(' ')
        count = int(entry[1])
        first, second = tokens[0], tokens[1]
        if count < 5:
            second = "unknown"
        forward[first].inc(second, count)

    # Mirror the table so it can also be queried by the second word.
    for first in forward.conditions():
        followers = forward[first]
        for second in followers.samples():
            backward[second].inc(first, followers[second])

    return forward, backward
def get_high_information_words(lwords, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    """Return the set of words scoring at least ``min_score`` for any label.

    ``lwords`` maps each label to an iterable of word sequences. Words are
    scored with ``score_fn(n_ii, (n_ix, n_xi), n_xx)``.
    """
    overall_counts = FreqDist()
    counts_by_label = ConditionalFreqDist()
    for label, documents in lwords.items():
        for document in documents:
            for word in document:
                overall_counts[word] += 1
                counts_by_label[label][word] += 1

    total = counts_by_label.N()
    selected = set()
    for label in counts_by_label.conditions():
        label_counts = counts_by_label[label]
        label_total = label_counts.N()
        selected.update(
            word
            for word, in_label in label_counts.items()
            if score_fn(in_label, (overall_counts[word], label_total), total) >= min_score
        )
    return selected
def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    """Return the words whose score reaches ``min_score`` under any label.

    ``labelled_words`` is an iterable of ``(label, words)`` pairs; each
    word is scored with ``score_fn(n_ii, (n_ix, n_xi), n_xx)``.
    """
    totals = FreqDist()
    per_label = ConditionalFreqDist()
    for label, words in labelled_words:
        for word in words:
            totals.inc(word)
            per_label[label].inc(word)

    grand_total = per_label.N()
    informative = set()
    for label in per_label.conditions():
        label_fd = per_label[label]
        label_total = label_fd.N()
        informative.update(
            word
            for word, label_count in label_fd.iteritems()
            if score_fn(label_count, (totals[word], label_total), grand_total) >= min_score
        )
    return informative
def _train(self, tagged_corpus, cutoff=0, verbose=False):
    """
    Fill this tagger's ``_context_to_tag`` table from the training data.

    For each context ``c`` seen in the training data,
    ``_context_to_tag[c]`` is set to the most frequent tag for that
    context. Contexts that the backoff tagger already tags correctly
    every time are left out.

    :param tagged_corpus: A tagged corpus. Each item should be a list of
        (word, tag) tuples. Empty sentences are skipped.
    :param cutoff: A context is only stored if its most likely tag was
        seen more than ``cutoff`` times.
    :param verbose: If true, print size, backoff and pruning statistics.
    """
    token_count = hit_count = 0

    # A context is considered 'useful' if it's not already tagged
    # perfectly by the backoff tagger.
    useful_contexts = set()

    # Count how many times each tag occurs in each context.
    fd = ConditionalFreqDist()
    for sentence in tagged_corpus:
        if not sentence:
            # zip(*sentence) cannot be unpacked into two names for an
            # empty sentence, and it contributes no events anyway.
            continue
        tokens, tags = zip(*sentence)
        for index, (token, tag) in enumerate(sentence):
            # Record the event.
            token_count += 1
            context = self.context(tokens, index, tags[:index])
            if context is None:
                continue
            fd[context][tag] += 1
            # If the backoff got it wrong, this context is useful:
            if (self.backoff is None or
                    tag != self.backoff.tag_one(tokens, index, tags[:index])):
                useful_contexts.add(context)

    # Build the context_to_tag table -- for each context, figure out
    # what the most likely tag is. Only include contexts whose best tag
    # was seen more than `cutoff` times.
    for context in useful_contexts:
        best_tag = fd[context].max()
        hits = fd[context][best_tag]
        if hits > cutoff:
            self._context_to_tag[context] = best_tag
            hit_count += hits

    # Display some stats, if requested.
    if verbose:
        size = len(self._context_to_tag)
        context_count = len(fd.conditions())
        # Guard the ratios: an empty corpus (or one in which every context
        # is None) would otherwise divide by zero.
        backoff = 100 - (hit_count * 100.0) / token_count if token_count else 0.0
        pruning = 100 - (size * 100.0) / context_count if context_count else 0.0
        print("[Trained Unigram tagger:", end=' ')
        print("size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
            size, backoff, pruning))
def __init__(self, r, name, cond_samples=None):
    """Set up the distribution and touch every condition stored under ``name``.

    ``r`` is the backing store; its ``keys()`` is queried with the encoded
    pattern ``'<name>:*'``. The second ``:``-separated field of each
    returned key is used as the condition.
    """
    self._r = r
    self._name = name
    ConditionalFreqDist.__init__(self, cond_samples)

    pattern = encode_key('%s:*' % name)
    for stored_key in self._r.keys(pattern):
        # Indexing goes through __getitem__, which sets up the
        # per-condition distribution as a side effect.
        self[stored_key.split(':')[1]]
def __init__(self, r, name, cond_samples=None):
    """Initialise the distribution and load the conditions found under ``name``.

    Keys matching the encoded pattern ``'<name>:*'`` are fetched from
    ``r``. They are split on ``b':'`` and the second field, decoded to
    text, names the condition.
    """
    self._r, self._name = r, name
    ConditionalFreqDist.__init__(self, cond_samples)

    matching_keys = self._r.keys(encode_key('%s:*' % name))
    for raw_key in matching_keys:
        fields = raw_key.split(b':')
        condition = fields[1].decode()
        # Subscripting calls __getitem__, which registers the condition.
        self[condition]
def words_by_followers(category):
    """Count, for each word, how many distinct neighbours it has.

    The bigrams come from ``brown_bigrams(category)``. Each bigram's second
    element is the key and its first element the neighbour, so the value
    stored for a word is the number of different first elements it was
    paired with.
    """
    neighbours = ConditionalFreqDist(
        (pair[1], pair[0]) for pair in brown_bigrams(category)
    )
    distinct_counts = FreqDist()
    for word, partners in neighbours.items():
        distinct_counts[word] = len(partners)
    return distinct_counts
def _train(self, tagged_corpus, cutoff=0, verbose=False): token_count = hit_count = 0 useful_contexts = set() fd = ConditionalFreqDist() tag_prob = FreqDist() for sentence in tagged_corpus: tokens, tags = zip(*sentence) for index, (token, tag) in enumerate(sentence): # Record the event. token_count += 1 tag_prob.inc(tag) context = self.context(tokens, index, tags[:index]) if context is None: continue fd[context].inc(tag) # If the backoff got it wrong, this context is useful: if (self.backoff is None or tag != self.backoff.tag_one(tokens, index, tags[:index])): useful_contexts.add(context) # Build the context_to_tag table -- for each context, # calculate the entropy. Only include contexts that # lower then `cutoff` . total_tags = float(sum(tag_prob.values())) tags_probs = [(t,tag_prob[t]/total_tags) for t in tag_prob.keys()] useful_contexts_after_filter = useful_contexts.copy() most_high = FreqDist() for context in useful_contexts: dd = fd[context] # total_tags = float(sum(dd.values())) # tags_probs = [(t,dd[t]/total_tags) for t in dd.keys()] h = self.H(dd.keys(),tags_probs) if h > cutoff: useful_contexts_after_filter.remove(context) continue most_high[context] = h print most_high.keys() # Build the context_to_tag table -- for each context, figure # out what the most likely tag is. for context in useful_contexts_after_filter: best_tag = fd[context].max() hits = fd[context][best_tag] self._context_to_tag[context] = best_tag hit_count += hits # Display some stats, if requested. if verbose: size = len(self._context_to_tag) backoff = 100 - (hit_count * 100.0)/ token_count pruning = 100 - (size * 100.0) / len(fd.conditions()) print "[Trained Unigram tagger:", print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (size, backoff, pruning)
def __init__(self, unk=None, Trained=False, N=1000, C=False):
    '''
    Create an untrained TnT tagger with empty frequency tables.

    :param unk: POS tagger instance stored as ``self._unk``
    :type unk: (TaggerI)
    :param Trained: flag stored as ``self._T``
    :type Trained: boolean
    :param N: beam search degree, stored as ``self._N``
    :type N: (int)
    :param C: capitalization flag, stored as ``self._C``
    :type C: boolean

    The uni/bi/tri-gram, word and end-of-sentence tables start empty and
    the three ``_l*`` weights start at 0.0.
    '''
    # Configuration supplied by the caller.
    self._unk = unk
    self._T = Trained
    self._N = N
    self._C = C

    # Frequency tables.
    self._uni = FreqDist()
    self._bi = ConditionalFreqDist()
    self._tri = ConditionalFreqDist()
    self._wd = ConditionalFreqDist()
    self._eos = ConditionalFreqDist()

    # Weights of the uni/bi/tri-gram components.
    self._l1 = self._l2 = self._l3 = 0.0

    # Counters of unknown / known words (bookkeeping only).
    self.unknown = 0
    self.known = 0
def __init__(self, load_from_disk=True):
    """Create empty n-gram tables over the Reuters corpus, then fill them.

    When ``load_from_disk`` is true the models come from
    ``self._load_models()``; otherwise ``self._train()`` is run.
    """
    self._corpus = reuters.words()

    # Raw counts for orders 1 to 4.
    self._unigram_fd = FreqDist()
    self._bigram_cfd = ConditionalFreqDist()
    self._trigram_cfd = ConditionalFreqDist()
    self._quadgram_cfd = ConditionalFreqDist()

    # Probability distributions; unset until loaded or trained.
    self._unigram_pd = self._bigram_cpd = None
    self._trigram_cpd = self._quadgram_cpd = None

    populate = self._load_models if load_from_disk else self._train
    populate()
def validate_pcfg_generate(grammar): pd = makeLhrProbDict(grammar) productions = [] cfd = ConditionalFreqDist() for i in np.arange(1000): tree = pcfg_generate(grammar) productions += tree.productions() for p in productions: cfd[p.lhs()].inc(p.rhs()) for c in cfd.conditions(): p = MLEProbDist(cfd[c]) q = pd[c] div = KL_Divergence(p, q) print "KL_Divergence for %s = %f" % (c, div)
def _train(self, tagged_corpus, cutoff=0, verbose=False): """ """ token_count = hit_count = 0 # A context is considered 'useful' if it's not already tagged # perfectly by the backoff tagger. useful_contexts = set() # Count how many times each tag occurs in each context. fd = ConditionalFreqDist() for sentence in tagged_corpus: tokens, tags = zip(*sentence) for index, (token, tag) in enumerate(sentence): # Record the event. token_count += 1 context = self.context(tokens, index, tags[:index]) if context is None: continue fd[context][tag] += 1 # If the backoff got it wrong, this context is useful: if (self.backoff is None or tag != self.backoff.tag_one(tokens, index, tags[:index])): useful_contexts.add(context) # Build the context_to_tag table -- for each context, figure # out what the most likely tag is. Only include contexts that # we've seen at least `cutoff` times. for context in useful_contexts: #best_tag = fd[context].max() for (tag, hits) in fd[context].items(): if hits > cutoff: self._contexts_to_tags[context] = self._contexts_to_tags.get(context, {}) self._contexts_to_tags[context][tag] = hits hit_count += hits # Display some stats, if requested. if verbose: size = len(self._context_to_tag) backoff = 100 - (hit_count * 100.0)/ token_count pruning = 100 - (size * 100.0) / len(fd.conditions()) print "[Trained Unigram tagger:", print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % ( size, backoff, pruning)
def jieba_words_feature(num=2400):
    """Select the ``num`` most informative words of the two corpora.

    Words read from ``zl_pos.txt`` and ``zl_neg.txt`` (via ``read_text``)
    are scored with the chi-square measure against both classes; the two
    scores are added. The best words are printed, written to
    ``zl_best_words.txt`` and returned.

    :param num: feature dimension, i.e. how many top-scoring words to keep.
    :return: dict mapping each selected word to ``True``.
    """
    # Flatten the segmented texts into one word list per class.
    pos_words = [word for words in read_text('zl_pos.txt') for word in words]
    neg_words = [word for words in read_text('zl_neg.txt') for word in words]

    # FreqDist holds the overall word frequencies; the ConditionalFreqDist
    # is conditioned on the class label.
    word_f = FreqDist()
    both_word_f = ConditionalFreqDist()
    for label, words in (('pos', pos_words), ('neg', neg_words)):
        for word in words:
            word_f[word] += 1
            both_word_f[label][word] += 1

    pos_words_num = both_word_f['pos'].N()
    neg_words_num = both_word_f['neg'].N()
    words_num = pos_words_num + neg_words_num

    # Score every word with chi-square for both classes; the sum is the
    # word's total information.
    word_scores = {}
    for word, freq in word_f.items():
        pos_score = BigramAssocMeasures.chi_sq(
            both_word_f['pos'][word], (freq, pos_words_num), words_num)
        neg_score = BigramAssocMeasures.chi_sq(
            both_word_f['neg'][word], (freq, neg_words_num), words_num)
        word_scores[word] = pos_score + neg_score

    # Sort by descending score and keep the first `num` words.
    best_vals = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:num]
    best_words = set([w for w, s in best_vals])
    print(best_words)

    # The context manager closes the file even if the write fails.
    with open('zl_best_words.txt', 'w+', encoding='utf-8') as h:
        h.write(str(best_words))

    return dict.fromkeys(best_words, True)
def __init__(self, n, train, estimator=None, *estimator_args, **estimator_kw_args):
    """
    Build an order-C{n} language model from a training text.

    The text is left-padded with C{n - 1} empty strings, every ngram is
    recorded, and the last token of each ngram is counted under the
    context formed by the preceding tokens. For C{n > 1} a lower-order
    model is built recursively and stored as C{self._backoff}.

    @param n: the order of the language model (ngram size)
    @type n: C{int}
    @param train: the training text
    @type train: C{list} of C{string}
    @param estimator: a function for generating a probability
        distribution; defaults to C{_estimator}
    @type estimator: a function that takes a C{ConditionalFreqDist} and
        returns a C{ConditionalProbDist}
    @param estimator_args: Extra arguments for C{estimator}. If neither
        positional nor keyword extras are given, the number of conditions
        in the underlying C{ConditionalFreqDist} is passed instead.
    @type estimator_args: (any)
    @param estimator_kw_args: Extra keyword arguments for C{estimator}.
    @type estimator_kw_args: (any)
    """
    self._n = n
    if estimator is None:
        estimator = _estimator

    self._ngrams = set()
    self._prefix = ('',) * (n - 1)

    counts = ConditionalFreqDist()
    for ngram in ingrams(chain(self._prefix, train), n):
        self._ngrams.add(ngram)
        counts[tuple(ngram[:-1])].inc(ngram[-1])

    if estimator_args or estimator_kw_args:
        self._model = ConditionalProbDist(counts, estimator,
                                          *estimator_args, **estimator_kw_args)
    else:
        self._model = ConditionalProbDist(counts, estimator, len(counts))

    # Recursively construct the lower-order models.
    if n > 1:
        self._backoff = NgramModel(n - 1, train, estimator,
                                   *estimator_args, **estimator_kw_args)
def bigramAnalysis(self):
    """Pick the 500 best chi-square words, then run two unigram analyses.

    Tweets from ``/negative.csv`` (label ``Sad``) and ``/positive.csv``
    (label ``Happy``) are lower-cased and split on whitespace; words in
    ``stopset`` are ignored. The selected words are stored in
    ``self.bestwords``.
    """
    label_word_fd = ConditionalFreqDist()
    word_fd = FreqDist()
    datafiles = [
        {'emo': "Sad", 'name': "/negative.csv"},
        {'emo': "Happy", 'name': "/positive.csv"},
    ]

    for source in datafiles:
        emotion = source['emo']
        contents = self.readFile(source['name'])
        lowered = [tweet.lower() for tweet in contents['tweets']]
        for tweet in lowered:
            for word in tweet.split():
                if word in stopset:
                    continue
                word_fd[word] += 1
                label_word_fd[emotion][word] += 1

    pos_word_count = label_word_fd['Happy'].N()
    neg_word_count = label_word_fd['Sad'].N()
    total_word_count = word_fd.N()

    # A word's score is the sum of its chi-square against both labels.
    word_scores = {}
    for word, freq in word_fd.iteritems():
        happy_score = BigramAssocMeasures.chi_sq(
            label_word_fd['Happy'][word], (freq, pos_word_count), total_word_count)
        sad_score = BigramAssocMeasures.chi_sq(
            label_word_fd['Sad'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = happy_score + sad_score

    ranked = sorted(word_scores.iteritems(), key=lambda pair: pair[1], reverse=True)
    self.bestwords = set(word for word, _ in ranked[:500])

    print("\n\nevaluating best word features")
    self.unigramAnalysis(self.best_word_feats)
    print("\n\nBigram + bigram chi_sq word ")
    self.unigramAnalysis(self.best_bigram_word_feats)
def train_transitions(labelled_sequences, additional_transitions, estimator=None):
    """Train an HMM tagger from labelled sequences plus extra transitions.

    :param labelled_sequences: iterable of sequences whose tokens are
        indexable with the state at position 0 and the symbol at
        position 1.
    :param additional_transitions: a ``ConditionalFreqDist`` added to the
        observed state transitions before estimation.
    :param estimator: callable ``(fdist, bins) -> ProbDist``; defaults to
        the maximum-likelihood estimate.
    :return: the ``hmm.HiddenMarkovModelTagger`` built from the estimates.
    """
    # Default to the MLE estimate.
    if estimator is None:
        estimator = lambda fdist, bins: MLEProbDist(fdist)

    # States and symbols in order of first appearance. The companion sets
    # give O(1) membership tests; testing the lists directly made training
    # quadratic in the vocabulary size.
    known_symbols = []
    known_states = []
    seen_symbols = set()
    seen_states = set()

    # Count occurrences of starting states, transitions out of each state
    # and output symbols observed in each state.
    starting = FreqDist()
    transitions = ConditionalFreqDist()
    outputs = ConditionalFreqDist()
    for sequence in labelled_sequences:
        lasts = None
        for token in sequence:
            state = token[0]
            symbol = token[1]
            if lasts is None:
                starting[state] += 1
            else:
                transitions[lasts][state] += 1
            outputs[state][symbol] += 1
            lasts = state

            # Update the state and symbol lists.
            if state not in seen_states:
                seen_states.add(state)
                known_states.append(state)
            if symbol not in seen_symbols:
                seen_symbols.add(symbol)
                known_symbols.append(symbol)

    # Create probability distributions (with smoothing).
    N = len(known_states)
    pi = estimator(starting, N)
    A = ConditionalProbDist(
        ConditionalFreqDist.__add__(transitions, additional_transitions),
        estimator, N)
    B = ConditionalProbDist(outputs, estimator, len(known_symbols))

    # NOTE(review): if `hmm` is nltk.tag.hmm, its constructor takes
    # (symbols, states, ...) -- confirm this argument order is intended.
    return hmm.HiddenMarkovModelTagger(known_states, known_symbols, A, B, pi)
def transitionProb(sentences):
    """Count token-to-token transitions within each sentence.

    Returns a ``ConditionalFreqDist`` where ``result[a][b]`` is the number
    of times ``b`` directly followed ``a``. No transition is counted
    across sentence boundaries, nor out of a token that is ``None``.
    """
    transitions = ConditionalFreqDist()
    for sentence in sentences:
        previous = None
        for current in sentence:
            if previous is not None:
                transitions[previous][current] += 1
            previous = current
    return transitions
def makeTrigram(corpus):
    '''Count words conditioned on the two preceding words.

    The corpus is first passed through ``startEndTag``. A context is the
    string ``'<previous-but-one>$%<previous>'``; it starts as
    ``END_LINE + '$%' + START_LINE`` and carries over from one sentence to
    the next. ``START_LINE`` itself is never counted as an outcome.
    '''
    separator = '$%'
    trigram = ConditionalFreqDist()
    sentences = startEndTag(corpus)
    context = END_LINE + separator + START_LINE

    for sentence in sentences:
        for word in sentence:
            if word != START_LINE:
                trigram[context][word] += 1
            # Slide the window: drop the older word, append the new one.
            newer_half = context[context.find(separator) + 2:]
            context = newer_half + separator + word

    return trigram
def __init__(self, n, train, pad_left=True, pad_right=False,estimator=None, *estimator_args, **estimator_kwargs):
    """Build an order-``n`` model and the back-off weight of every context.

    After the parent initialiser has run, the ngram counts and the
    conditional probability model are rebuilt here. For ``n > 1`` a
    lower-order ``MyNgramModel`` is created and, for each observed
    context, ``alpha = (1 - observed mass) / (1 - back-off mass)`` is
    stored in ``self._backoff_alphas``.
    """
    super(MyNgramModel, self).__init__(n, train, pad_left, pad_right, estimator,
                                       *estimator_args, **estimator_kwargs)

    assert isinstance(pad_left, bool)
    assert isinstance(pad_right, bool)

    self._n = n
    padding = ('',) * (n - 1)
    self._lpad = padding if pad_left else ()
    self._rpad = padding if pad_right else ()

    if estimator is None:
        estimator = _estimator

    self._cfd = ConditionalFreqDist()
    self._ngrams = set()

    # A flat list of strings is treated as a single sentence.
    if train is not None and isinstance(train[0], basestring):
        train = [train]

    for sent in train:
        for ngram in ingrams(chain(self._lpad, sent, self._rpad), n):
            self._ngrams.add(ngram)
            self._cfd[tuple(ngram[:-1])].inc(ngram[-1])

    if estimator_args or estimator_kwargs:
        self._model = ConditionalProbDist(self._cfd, estimator,
                                          *estimator_args, **estimator_kwargs)
    else:
        self._model = ConditionalProbDist(self._cfd, estimator, len(self._cfd))

    # Recursively construct the lower-order models.
    self._backoff = None
    if n > 1:
        self._backoff = MyNgramModel(n - 1, train, pad_left, pad_right, estimator,
                                     *estimator_args, **estimator_kwargs)
    if self._backoff is None:
        return

    self._backoff_alphas = dict()
    for ctxt in self._cfd.conditions():
        context_dist = self._model[ctxt]
        shorter_ctxt = ctxt[1:]
        backoff_total_pr = 0
        total_observed_pr = 0
        # Only the words actually OBSERVED after this context contribute.
        for word in self._cfd[ctxt].keys():
            backoff_total_pr += self._backoff.prob(word, shorter_ctxt)
            total_observed_pr += context_dist.prob(word)
        assert 0 < total_observed_pr <= 1
        assert 0 < backoff_total_pr <= 1
        self._backoff_alphas[ctxt] = (1.0 - total_observed_pr) / (1.0 - backoff_total_pr)
def laplace_stuff():
    """Demonstrate Laplace smoothing and a conditional frequency table.

    Prints the word frequencies of a fixed sentence, a sample and a
    probability from the Laplace-smoothed distribution, and then the
    counts of each token conditioned on the token before it (``None`` for
    the first token).
    """
    sent = "am ate ate apple am x."
    sent_tokenized = word_tokenize(sent)

    freq_dist = FreqDist(word.lower() for word in sent_tokenized)
    print(freq_dist.items())

    lap = probability.LaplaceProbDist(freq_dist)
    print(lap.generate())
    print(lap.prob("am"))

    print("Finished freq dist, Starting Cond dist")

    # Conditional counts: the condition is the preceding token.
    cond_dist = ConditionalFreqDist()
    context = None
    for token in sent_tokenized:
        # Count the outcome. Assigning the token to cond_dist[context]
        # would replace the per-condition FreqDist with a plain string.
        cond_dist[context][token] += 1
        context = token

    print(cond_dist["am"])
    print(cond_dist.items())
def _dump_cpdist(cpdist: ConditionalProbDist) -> dict:
    """Convert a conditional probability distribution to a plain dict.

    The result holds the rebuilt counts under ``'cfdist'`` together with
    the ``_factory_args`` and ``_factory_kw_args`` of ``cpdist``.
    """
    counts = ConditionalFreqDist()
    for condition in cpdist.conditions():
        source = cpdist[condition].freqdist()
        for sample, frequency in source.items():
            counts[condition][sample] += frequency

    dumped = {'cfdist': counts}
    dumped['factory_args'] = cpdist._factory_args
    dumped['factory_kw_args'] = cpdist._factory_kw_args
    return dumped
def train(
        cls,
        docs: Collection[Document],
        gamma_word: float = 0.1,
        gamma_init: float = 0.1,
        gamma_trans: float = 0.1,
        tf_table: Optional[Mapping[Word, float]] = None,
) -> 'HMMSummarizer':
    """Fit the summarizer on a collection of documents.

    Documents for which ``cls._get_tags`` returns nothing are ignored.

    Args:
        docs (Collection[Document]): The documents to train on.
        gamma_word (float): Smoothing value passed to the feature
            extraction and stored on the model.
        gamma_init (float): Smoothing value for the initial probability.
        gamma_trans (float): Smoothing value for the transition
            probability.
        tf_table (Mapping[Word, float]): A precomputed term-frequency
            table, passed through to feature extraction and the model.

    Returns:
        HMMSummarizer: The trained model.
    """
    init_fdist = FreqDist()
    trans_fdist = ConditionalFreqDist()
    tagged_vecs: list = []
    states = set()

    for doc in docs:
        tags = cls._get_tags(doc.sentences)
        if tags:
            init_fdist[tags[0]] += 1
            for previous, current in zip(tags, tags[1:]):
                trans_fdist[previous][current] += 1
            vecs = cls._get_feature_vectors(doc, gamma_word, tf=tf_table)
            tagged_vecs.extend(zip(vecs, tags))
            states.update(tags)

    # Initial, transition and emission probabilities, in that order.
    init_pdist = LidstoneProbDist(init_fdist, gamma_init, bins=len(states))
    trans_pdist = ConditionalProbDist(
        trans_fdist, LidstoneProbDist, gamma_trans, bins=len(states))
    emit_pdist = _GaussianEmission.train(tagged_vecs)

    return cls(init_pdist, trans_pdist, emit_pdist, list(states),
               gamma=gamma_word, tf_table=tf_table)
def create_bestbigrams(self):
    """Select up to 1000 high-scoring bigrams and store them in ``self.bestbigrams``.

    For the first ``cut`` documents of each class the
    ``self.bigram_count`` best bigrams (chi-square) are collected. Every
    collected bigram is then scored by adding its chi-square association
    with the ``pos`` and with the ``neg`` class.
    """
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    score_fn = BigramAssocMeasures.chi_sq
    cut = int((self.total / 2) * 3 / 4)

    labelled_documents = (
        ('pos', self.unigrams_pos[:cut]),
        ('neg', self.unigrams_neg[:cut]),
    )
    for label, documents in labelled_documents:
        for unigrams in documents:
            bigram_finder = BigramCollocationFinder.from_words(unigrams)
            try:
                bigrams = bigram_finder.nbest(score_fn, self.bigram_count)
            except Exception:
                # Best effort: skip documents whose bigrams cannot be
                # scored, but let KeyboardInterrupt/SystemExit through.
                continue
            for word in bigrams:
                word_fd[word] += 1
                label_word_fd[label][word] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    # NOTE(review): a limit derived from self.inf_count used to be computed
    # here but was never applied; the cut-off stays at a fixed 1000.
    best = sorted(word_scores.items(), key=lambda tup: tup[1], reverse=True)[:1000]
    print(best)
    bestwords = {w for w, s in best}
    print(bestwords)
    print(len(bestwords))
    self.bestbigrams = bestwords
def __init__(self, tokens, context_func=None, filter=None, key=lambda x:x):
    """Index every token by its context, and every context by its tokens.

    :param tokens: the token sequence to index; stored unfiltered as
        ``self._tokens``.
    :param context_func: callable ``(tokens, i) -> context``; defaults to
        ``self._default_context``.
    :param filter: optional predicate; tokens for which it is false are
        dropped before indexing.
    :param key: normalisation applied to each token before it is used as
        a key.
    """
    self._key = key
    self._tokens = tokens
    # Always set the attribute: previously a caller-supplied context_func
    # was ignored and self._context_func was left undefined.
    self._context_func = context_func if context_func else self._default_context
    if filter:
        tokens = [t for t in tokens if filter(t)]
    self._word_to_contexts = CFD(
        (self._key(w), self._context_func(tokens, i))
        for i, w in enumerate(tokens)
    )
    self._context_to_words = CFD(
        (self._context_func(tokens, i), self._key(w))
        for i, w in enumerate(tokens)
    )
def __init__(self, n, words, start_symbol="<$>", end_symbol="</$>", pad_left=True, pad_right=False, estimator=ml_estimator):
    """Store the settings, run ``_train`` and initialise the parent class.

    ``n`` must be positive. The parent class is initialised with
    ``self._counter`` and ``estimator`` once ``self._train()`` has
    returned.
    """
    assert n > 0
    self._n, self._words = n, words
    self._start_symbol, self._end_symbol = start_symbol, end_symbol
    self._pad_left, self._pad_right = pad_left, pad_right
    self._counter = ConditionalFreqDist()
    self._train()
    super().__init__(self._counter, estimator)
def create_word_scores():
    """Score every word by its chi-square association with three classes.

    The positive, negative and third (``con``) polarity files are
    tokenised line by line. Each lower-cased word gets the sum of its
    chi-square scores against the three classes.

    Returns a dict mapping word to score.
    """
    token_pattern = r"[\w']+|[.,!?;]"

    def read_tokens(path):
        # Tokens of every line of the file, in reading order.
        tokens = []
        with open(path, 'r') as handle:
            for line in handle:
                tokens.extend(re.findall(token_pattern, line.rstrip()))
        return tokens

    posWords = read_tokens(RT_POLARITY_POS_FILE)
    negWords = read_tokens(RT_POLARITY_NEG_FILE)
    conWords = read_tokens(RT_POLARITY_CON_FILE)

    # Overall word frequencies, and word frequencies within each class.
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for label, words in (('pos', posWords), ('neg', negWords), ('con', conWords)):
        for word in words:
            lowered = word.lower()
            word_fd[lowered] += 1
            cond_word_fd[label][lowered] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    con_word_count = cond_word_fd['con'].N()
    total_word_count = pos_word_count + neg_word_count + con_word_count

    # Chi-square score of each word, summed over the three classes.
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        con_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['con'][word], (freq, con_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score + con_score
    return word_scores
def doesnt_work(self, y):
    """Estimate state-transition probabilities from state sequences.

    Code adapted from NLTK implementation of supervised training in HMMs.
    Consecutive states of each sequence in ``y`` are counted and turned
    into a ``ConditionalProbDist`` using the MLE estimate, with
    ``self.number_of_states + 2`` passed as the bin count.
    """
    def mle_estimator(fdist, bins):
        return MLEProbDist(fdist)

    transitions = ConditionalFreqDist()
    for sequence in y:
        previous = None
        for state in sequence:
            if previous is not None:
                transitions[previous][state] += 1
            previous = state

    bins = self.number_of_states + 2
    return ConditionalProbDist(transitions, mle_estimator, bins)
def create_word_scores():
    """Score words by chi-square against the positive and negative classes.

    Up to 100000 randomly chosen tweets from ``read_in_tweets(twitter_data)``
    are used; label ``'4'`` is positive and ``'0'`` is negative. Returns a
    dict mapping each lower-cased word to the sum of its two scores.
    """
    sentences = read_in_tweets(twitter_data)
    random.shuffle(sentences)
    sentences = sentences[:100000]

    posSentences = [tup[1] for tup in sentences if tup[0] == '4']
    negSentences = [tup[1] for tup in sentences if tup[0] == '0']

    def tokenize(texts):
        # Flat list of the tokens of all texts, in order.
        tokens = []
        for text in texts:
            tokens.extend(re.findall(r"[\w']+|[.,!?;]", text.rstrip()))
        return tokens

    posWords = tokenize(posSentences)
    negWords = tokenize(negSentences)

    # Overall word frequencies, and word frequencies within each class.
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for label, words in (('pos', posWords), ('neg', negWords)):
        for word in words:
            lowered = word.lower()
            word_fd.inc(lowered)
            cond_word_fd[label].inc(lowered)

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    # Chi-square score of each word, summed over both classes.
    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def commit(self):
    """Merge the journal with the stored counts and write the result.

    The journal is swapped for a fresh ``ConditionalFreqDist``. Each
    journalled distribution then has the value of ``self[cond]`` added in
    place, and all ``(condition, distribution)`` pairs are upserted into
    the ``cfd`` table.
    """
    pending = self._journal
    self._journal = ConditionalFreqDist()

    for condition in pending.keys():
        stored = self[condition]
        pending[condition] += stored

    rows = ((condition, dist) for condition, dist in pending.items())
    self._sql.executemany('insert or replace into cfd(k, v) values (?, ?)', rows)
def get_repk_count(ngram_count):
    """Count ngrams whose first and last items coincide, per context length.

    ``ngram_count`` maps ngrams to counts. Returns ``(counts, total_counts)``:
    ``total_counts[k]`` is the summed count of all ngrams of length
    ``k + 1``, and ``counts[k][item]`` is the summed count of those longer
    than one item that start and end with ``item``.
    """
    repeated = ConditionalFreqDist()
    totals = defaultdict(int)
    for ngram, occurrences in ngram_count.items():
        context_size = len(ngram) - 1
        totals[context_size] += occurrences
        if len(ngram) != 1 and ngram[0] == ngram[-1]:
            repeated[context_size][ngram[-1]] += occurrences
    return repeated, totals
def sum_category_word_scores(categorized_words, score_fn):
    """Sum each word's score over all categories.

    ``categorized_words`` is an iterable of ``(category, words)`` pairs.
    For every word of every category,
    ``score_fn(n_ii, (n_ix, n_xi), n_xx)`` is added to the word's total.
    Returns a ``defaultdict(int)`` mapping word to summed score.
    """
    overall = FreqDist()
    by_category = ConditionalFreqDist()
    for category, words in categorized_words:
        for word in words:
            overall.inc(word)
            by_category[category].inc(word)

    totals = collections.defaultdict(int)
    grand_total = by_category.N()
    for category in by_category.conditions():
        category_fd = by_category[category]
        category_total = category_fd.N()
        for word, in_category in iteritems(category_fd):
            marginals = (overall[word], category_total)
            totals[word] += score_fn(in_category, marginals, grand_total)
    return totals
def sum_category_word_scores(categorized_words, score_fn):
    """Accumulate per-word scores across every category.

    Takes ``(category, words)`` pairs and returns a ``defaultdict(int)``
    in which each word maps to the sum, over the categories containing
    it, of ``score_fn(n_ii, (n_ix, n_xi), n_xx)``.
    """
    word_totals = FreqDist()
    category_counts = ConditionalFreqDist()
    for category, words in categorized_words:
        for word in words:
            word_totals.inc(word)
            category_counts[category].inc(word)

    summed = collections.defaultdict(int)
    n_all = category_counts.N()
    for category in category_counts.conditions():
        counts = category_counts[category]
        n_category = counts.N()
        for word, n_word_in_category in counts.iteritems():
            n_word = word_totals[word]
            summed[word] += score_fn(n_word_in_category, (n_word, n_category), n_all)
    return summed
class hmm:
    """Frequency helpers over the tagged Brown corpus.

    ``name`` is a (lower-case) word to look up with :meth:`findName`;
    ``tag`` is a tag prefix to look up with :meth:`findTags`.  Either may be
    left at ``0`` to mean "not given".
    """

    def __init__(self, name=0, tag=0):
        self.name = name
        self.tag = tag
        self.wsj = nltk.corpus.brown.tagged_words()
        self.sentences = nltk.corpus.brown.sents()

    def findTags(self, mostCommon=5):
        """Return ``{tag: most common words}`` for tags starting with ``self.tag``.

        Prints ``"invalid method"`` and returns ``None`` when no tag was given.
        """
        if self.tag == 0:
            print("invalid method")
            return None
        self.tag_prefix = self.tag
        self.cfdTag = ConditionalFreqDist(
            (tag, word) for (word, tag) in self.wsj
            if tag.startswith(self.tag_prefix))
        return dict((tag, self.cfdTag[tag].most_common(mostCommon))
                    for tag in self.cfdTag.conditions())

    def findAllTags(self, mostCommon=5):
        """Print every tag with all of its words and return the tag table.

        ``mostCommon`` is accepted for symmetry with the other methods but
        is not used: the full word list of each tag is printed.
        """
        self.cfdTagAll = ConditionalFreqDist(
            (tag, word) for (word, tag) in self.wsj)
        for tag in sorted(self.cfdTagAll):
            print(tag, self.cfdTagAll[tag].most_common())
        return dict(self.cfdTagAll)

    def findBigrams(self):
        """Compute, store and return the list of tag bigrams of the corpus."""
        # Materialise the pairs: ``bigrams`` yields a one-shot generator on
        # NLTK 3, and the stored value is read again by ``biFrekvens``.
        self.bigram = list(bigrams([tag for word, tag in self.wsj]))
        return self.bigram

    def biFrekvens(self, mostCommon=5):
        """Return ``{tag: most common following tags}``.

        Requires :meth:`findBigrams` to have been called first.
        """
        self.cfdBigram = ConditionalFreqDist(self.bigram)
        return dict((tag, self.cfdBigram[tag].most_common(mostCommon))
                    for tag in self.cfdBigram)

    def findName(self, mostCommon=5):
        """Return ``[name, most common tags of name]``.

        Prints ``"invalid method"`` and returns ``None`` when no name was given.
        """
        if self.name == 0:
            print("invalid method")
            return None
        self.cfdName = ConditionalFreqDist(
            (word.lower(), tag) for (word, tag) in self.wsj)
        return [self.name, self.cfdName[self.name].most_common(mostCommon)]

    def findCPD(self, typecfd=None):
        """Return an MLE conditional probability distribution.

        ``typecfd=None`` uses the table built by :meth:`findTags` (and stores
        the result in ``self.cpdTag``); ``typecfd="bi"`` uses the table built
        by :meth:`biFrekvens`.  Any other value prints ``"invalid method"``
        and returns ``None``.
        """
        if typecfd is None:
            self.cpdTag = nltk.ConditionalProbDist(self.cfdTag,
                                                   nltk.MLEProbDist)
            return self.cpdTag
        if typecfd == "bi":
            # Use the ``nltk.`` qualified name here too, matching the branch
            # above instead of relying on a separate bare import.
            return nltk.ConditionalProbDist(self.cfdBigram, nltk.MLEProbDist)
        print("invalid method")
        return None
def significantWords(untagged_docs, min_chisq=5, ratio=0.75):
    """
    Use a chi-squared test on the token/label contingency table to measure
    how strongly each token is associated with a sentiment label.

    Only alphabetic tokens that are not English stop words are counted.

    Parameters
    ----------
    untagged_docs: iterable of tuples (words, tag)
    min_chisq: a word is significant only if its chi-squared value exceeds this
    ratio: minimum share of a word's occurrences that must fall under a
           label for the word to be assigned to that label

    Returns
    -------
    significant_words: defaultdict(set); key 'total' holds every significant
                       word, each label key holds the significant words
                       assigned to that label
    """
    significant_words = collections.defaultdict(set)
    freq_dist = FreqDist()
    label_freq_dist = ConditionalFreqDist()
    stopping_words = set(nltk.corpus.stopwords.words('english'))
    for tokens, label in untagged_docs:
        for token in tokens:
            if token.isalpha() and token not in stopping_words:
                # ``+= 1`` instead of ``inc``, which NLTK 3 removed.
                freq_dist[token] += 1
                label_freq_dist[label][token] += 1

    n_xx = label_freq_dist.N()
    for label in label_freq_dist.conditions():
        # Size of this label's column; constant for the inner loop.
        n_xi = label_freq_dist[label].N()
        for word, n_ii in label_freq_dist[label].items():
            n_ix = freq_dist[word]
            n_oi = n_xi - n_ii
            n_io = n_ix - n_ii
            n_oo = n_xx - n_oi - n_io - n_ii
            denominator = ((n_ii + n_io) * (n_ii + n_oi)
                           * (n_oo + n_io) * (n_oo + n_oi))
            if denominator == 0:
                # A margin of the 2x2 table is empty (e.g. only one label in
                # the data): chi-squared is undefined, so the word cannot be
                # called significant.
                continue
            chisq = float(n_xx * (n_ii * n_oo - n_io * n_oi) ** 2) / denominator
            if chisq > min_chisq and n_ii > 10:
                significant_words['total'].add(word)
                if float(n_ii) / n_ix > ratio and (n_ix - n_ii) > 1:
                    significant_words[label].add(word)
    return significant_words
def dependencybigram(n, lms, wds, trs):
    """Count dependency n-grams and build a Lidstone-smoothed model.

    Every n-gram produced by ``dep_bigram(n, lm, wd, tr)`` is added to the
    module-level ``_DPNGRAMS`` set and counted as (context, last item).
    The function then recurses for ``n - 1`` while ``n > 1``.

    NOTE(review): ``_DPMODEL`` and ``_DPBACKOFF`` are plain local names here
    (no ``global`` statement, no ``return``), so the models built below are
    dropped when the function exits and the call returns ``None``.  Confirm
    whether module-level assignment or a return value was intended.
    """
    cfd = ConditionalFreqDist()
    for lm, wd, tr in izip(lms, wds, trs):
        for gram in dep_bigram(n, lm, wd, tr):
            _DPNGRAMS.add(gram)
            cfd[gram[:-1]][gram[-1]] += 1

    _DPMODEL = ConditionalProbDist(cfd, lidstone_estimator, len(cfd))

    if n > 1:
        _DPBACKOFF = dependencybigram(n - 1, lms, wds, trs)
def save_MEMM(duilians, v_size):
    """Train a conditional model over couplet pairs and pickle it.

    For each item, ``item[0]`` and ``item[1]`` are read as two aligned
    sequences (presumably the first and second line of a couplet).  The
    condition is ``(first[i + 1], second[i])`` and the sample is
    ``second[i + 1]``.  Counts are smoothed with ``ELEProbDist`` using
    ``v_size`` bins and written to ``MEMM_save_dir + 'memm.pkl'``.
    """
    events = []
    for couplet in duilians:
        first_line, second_line = couplet[0], couplet[1]
        aligned = zip(first_line[1:], second_line, second_line[1:])
        for above, previous, current in aligned:
            events.append(((above, previous), current))

    counts = ConditionalFreqDist(events)
    model = ConditionalProbDist(counts, ELEProbDist, v_size)
    with open(MEMM_save_dir + 'memm.pkl', 'wb') as f:
        pickle.dump(model, f)
def _make_models(self, tuples):
    """Build the n-gram models and lookup tables from (word, lemma, tag) rows.

    Each of the three columns of ``tuples`` is passed through
    ``self._word_ids.add_words_transform`` and materialised as a list.
    """
    self._word_ids = WordIdDictionary()

    # Transpose the rows into columns and transform each column in turn.
    columns = [list(self._word_ids.add_words_transform(column))
               for column in zip(*tuples)]
    words, lemmas, tags = columns
    self._tags = tags

    # One model per column; the tag model uses twice the n-gram size.
    self._words_ngram = NgramModel(words, self._n)
    self._lemmas_ngram = NgramModel(lemmas, self._n)
    self._tags_ngram = NgramModel(tags, 2 * self._n)

    # tag -> lemmas, and (tag, lemma) -> words, kept as frequency tables so
    # candidates can be looked up directly during backoff search.
    self._tag_lemmas = ConditionalFreqDist(zip(tags, lemmas))
    tag_lemma_pairs = zip(tags, lemmas)
    self._tag_lemma_words = ConditionalFreqDist(zip(tag_lemma_pairs, words))
def LearnCondDist(Corpus, TrainSet, CFDist=None, saveParses=0, verbose=0):
    """Accumulate subtree counts for every training item.

    :param Corpus: object providing ``readDirectionTree(item, saveParses, verbose)``,
        which yields mappings with a ``'TREE'`` entry.
    :param TrainSet: iterable of items to read.
    :param CFDist: distribution to update in place; a new
        ``ConditionalFreqDist`` is created only when ``None`` is passed.
    :param saveParses: forwarded to ``readDirectionTree``.
    :param verbose: forwarded to ``readDirectionTree``.
    :return: the updated distribution.
    """
    # Test identity, not truthiness: an empty distribution supplied by the
    # caller is falsy but must still be the object that gets filled.
    if CFDist is None:
        CFDist = ConditionalFreqDist()
    for item in TrainSet:
        for sent in Corpus.readDirectionTree(item, saveParses, verbose):
            CountSubTree(CFDist, sent['TREE'])
    # Make 'S' the top of the tree
    ## for cat in ('TRAVEL','TURN','DESC','NAME'):
    ##     for s in CFDist[cat].samples(): CFDist['S'].inc(s,CFDist[cat].count(s))
    # Single-argument form prints the same text on Python 2 and Python 3.
    print('\nRead in training tokens for Conditional Frequency: %s' % (CFDist,))
    return CFDist
def process_bigrams(conn, polarity, total_word_count, best_words):
    """Build word and bigram features for one polarity.

    Runs ``Statements.GRAM_SQL % polarity`` on ``conn``, splits the first
    column of every row on whitespace and drops tokens starting with
    ``http://`` or ``@``.  The 600 highest chi-squared words (minus stop
    words and one-character tokens) are added to ``best_words`` and turned
    into features, followed by the 10 best PMI bigrams that occur at least
    4 times.

    :param conn: DB-API connection.
    :param polarity: label substituted into the SQL and passed to ``features``.
    :param total_word_count: total token count used by the chi-squared score.
    :param best_words: set updated in place with the selected words.
    :return: the combined feature collection returned by ``features``.
    """
    cursor = conn.cursor()
    try:
        cursor.execute(Statements.GRAM_SQL % polarity)
        rows = list(cursor.fetchall())
    finally:
        # Previously the close call sat after ``return`` and never ran.
        cursor.close()

    words = [
        w
        for row in rows
        for w in row[0].split()
        if not w.startswith(("http://", "@"))
    ]

    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for word in words:
        lowered = word.lower()
        # ``+= 1`` replaces ``inc``, which NLTK 3 removed.
        word_fd[lowered] += 1
        label_word_fd[polarity][lowered] += 1

    pos_word_count = label_word_fd[polarity].N()

    word_scores = {}
    for word, freq in word_fd.items():
        word_scores[word] = BigramAssocMeasures.chi_sq(
            label_word_fd[polarity][word],
            (freq, pos_word_count),
            total_word_count)

    best_raw = sorted(word_scores.items(),
                      key=lambda item: item[1], reverse=True)[:600]
    best = [w for w, _score in best_raw
            if w not in STOPWORDS and len(w) > 1]
    best_words.update(best)
    best_features = features(best, polarity)

    bigram_finder = BigramCollocationFinder.from_words(words)
    bigram_finder.apply_freq_filter(4)
    bigrams = bigram_finder.nbest(BigramAssocMeasures.pmi, 10)
    bigram_list = ["%s %s" % (bt[0].lower(), bt[1].lower())
                   for bt in bigrams]

    best_features += features(bigram_list, polarity)
    return best_features
def _train(self, tagged_corpus: list, cutoff: int = 0, verbose: bool = False):
    """
    Fill this tagger's ``_context_to_tag`` table from the training data.

    For each context ``c`` seen in the training data,
    ``_context_to_tag[c]`` is set to a list of ``(tag, weight)`` pairs,
    where ``weight`` is the share of occurrences of ``c`` that carried
    ``tag``.  Contexts that the backoff tagger (if any) always tags
    correctly are left out.  Existing entries for other contexts are not
    removed.

    :param tagged_corpus: A tagged corpus.  Each item should be a list of
        (word, tag) tuples.  Empty sentences are skipped.
    :param cutoff: A context is stored only if its most frequent tag
        occurs more than ``cutoff`` times.
    :param verbose: Not used
    """
    # A context is considered 'useful' if it's not already tagged
    # perfectly by the backoff tagger.
    useful_contexts = set()

    # Count how many times each tag occurs in each context.
    fd = ConditionalFreqDist()
    for sentence in tagged_corpus:
        if not sentence:
            # ``zip(*[])`` yields nothing to unpack into tokens and tags.
            continue
        tokens, tags = zip(*sentence)
        for index, (_token, tag) in enumerate(sentence):
            context = self.context(tokens, index, tags[:index])
            if context is None:
                continue
            fd[context][tag] += 1
            # If the backoff got it wrong, this context is useful:
            if self.backoff is None or tag != self.backoff.tag_one(
                tokens, index, tags[:index]
            ):
                useful_contexts.add(context)

    # Build the context_to_tag table.  The weights are only computed for
    # contexts that pass the cutoff, and the total is summed once per
    # context rather than once per tag.
    for context in useful_contexts:
        tag_counts = fd[context]
        hits = tag_counts[tag_counts.max()]
        if hits > cutoff:
            total = sum(tag_counts.values())
            self._context_to_tag[context] = [
                (tag, count / total) for tag, count in tag_counts.items()
            ]
def __init__(self):
    """Train Laplace-smoothed unigram/bigram models and set up tag patterns.

    Sentences come from every Brown category except the first one returned
    by ``brown.categories()``.
    """
    bigram_counts = ConditionalFreqDist()
    unigram_counts = FreqDist()

    training_categories = nltk.corpus.brown.categories()[1:]
    for sent in nltk.corpus.brown.sents(categories=training_categories):
        # Pad the sentence with None in front to mark the sentence start
        # and None at the end to mark the sentence end.
        padded = [None] + sent + [None]
        for prev, word in bigrams(padded):
            bigram_counts[prev][word] += 1
            # Counts the second item of each pair, so the end marker is
            # counted and the start marker is not.
            unigram_counts[word] += 1

    self.bigrams = ConditionalProbDist(bigram_counts, LaplaceProbDist)
    self.unigrams = LaplaceProbDist(unigram_counts)

    ############################# modified lm ####################################
    # regular expression:
    self.patterns = [
        (r'.*ing$', 'VBG'),                # gerunds
        (r'.*ed$', 'VBD'),                 # simple past
        (r'.*es$', 'VBZ'),                 # 3rd singular present
        (r'.*ould$', 'MD'),                # modals
        (r'.*\'s$', 'NN$'),                # possessive nouns
        (r'.*s$', 'NNS'),                  # plural nouns
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
        (r'.*', 'NN')                      # nouns (default)
    ]

    # regular expression (modified): ten extra rules tried first, followed
    # by the same rules as ``self.patterns``.
    extra_patterns = [
        (r'(.*able|.*ish|.*ible)$', 'JJ'),   # adjectives            # 1
        (r'(The|the|A|a|An|an)$', 'AT'),     # articles              # 2
        (r'(a|an|my|some|the)$', 'DT'),      # determiners           # 3
        (r'(our|its|his|their|my|your|her|out|thy|mine|thine)$',
         'PP$'),                             # possessive determiners # 4
        (r'(.*ily|.*ly)$', 'ADV'),           # adverbs               # 5
        (r'(at|in|of|over|with)$', 'PP'),    # prepositions          # 6
        (r'(and|because|but|if|or)$', 'CNJ'),  # conjunctions        # 7
        (r'([\.?!;:]+)$', '.'),              # sentence terminator   # 8
        (r'(\,)$', ','),                     # comma                 # 9
        (r'(\-)$', '-'),                     # dash                  # 10
    ]
    self.patternsModified = extra_patterns + self.patterns
def __init__(self, trigram_freq, alpha1=0.85, alpha2=0.1, alpha3=0.05,
             bigram_freq=None, unigram_freq=None):
    """Store the interpolation weights and the three frequency tables.

    :param trigram_freq: trigram frequency table.
    :param alpha1: stored as ``self.alpha1``.
    :param alpha2: stored as ``self.alpha2``.
    :param alpha3: stored as ``self.alpha3``.
    :param bigram_freq: bigram frequency table; a new empty
        ``ConditionalFreqDist`` is created when omitted.
    :param unigram_freq: unigram frequency table; a new empty ``FreqDist``
        is created when omitted.
    """
    self.alpha1 = alpha1
    self.alpha2 = alpha2
    self.alpha3 = alpha3
    self.trifreqdist = trigram_freq
    # Defaults are created per instance.  Instantiating them in the
    # signature made every instance share (and mutate) the same objects.
    self.bifreqdist = (ConditionalFreqDist() if bigram_freq is None
                       else bigram_freq)
    self.unifreqdist = FreqDist() if unigram_freq is None else unigram_freq
def makeBigram(corpus):
    '''
    Build a bigram count table from a corpus of sentences.

    The corpus is first passed through ``startEndTag``.  Every word other
    than ``START_LINE`` is counted under the word that preceded it; the
    preceding word is carried over from one sentence to the next and starts
    out as the empty string.

    @return: a ConditionalFreqDist mapping previous word -> next-word counts
    '''
    tagged = startEndTag(corpus)
    table = ConditionalFreqDist()
    previous = ''
    for sentence in tagged:
        for current in sentence:
            if current == START_LINE:
                # The start marker is never a prediction target; it only
                # becomes the context for the word after it.
                previous = current
                continue
            table[previous][current] += 1
            previous = current
    return table
def sum_category_word_scores(categorized_words, score_fn):
    """Accumulate a per-word association score over all categories.

    :param categorized_words: iterable of ``(category, words)`` pairs.
    :param score_fn: called as ``score_fn(n_ii, (n_ix, n_xi), n_xx)``.
    :return: ``defaultdict(int)`` mapping each word to its summed score.
    """
    # overall word frequencies
    word_fd = FreqDist()
    # word frequencies conditioned on category
    category_word_fd = ConditionalFreqDist()

    for category, words in categorized_words:
        for word in words:
            # ``inc`` does not exist on NLTK 3; item assignment works on both.
            word_fd[word] += 1
            category_word_fd[category][word] += 1

    scores = collections.defaultdict(int)
    # total number of counted tokens
    n_xx = category_word_fd.N()

    for category in category_word_fd.conditions():
        # number of tokens counted under this category
        n_xi = category_word_fd[category].N()

        for word, n_ii in category_word_fd[category].items():
            # occurrences of the word across all categories
            n_ix = word_fd[word]
            scores[word] += score_fn(n_ii, (n_ix, n_xi), n_xx)

    return scores
def __init__(self, tokens, context_func=None, filter=None, key=lambda x:x):
    """Index ``tokens`` by context in both directions.

    :param tokens: the token sequence; stored unfiltered in ``self._tokens``.
    :param context_func: ``f(tokens, i)`` returning the context of token
        ``i``; ``self._default_context`` is used when it is falsy.
    :param filter: optional predicate; when given, only tokens it accepts
        are indexed.
    :param key: normalisation applied to each word before indexing.
    """
    self._key = key
    self._tokens = tokens
    self._context_func = context_func if context_func else self._default_context

    indexed = [t for t in tokens if filter(t)] if filter else tokens

    word_context_pairs = (
        (self._key(w), self._context_func(indexed, i))
        for i, w in enumerate(indexed))
    self._word_to_contexts = CFD(word_context_pairs)

    context_word_pairs = (
        (self._context_func(indexed, i), self._key(w))
        for i, w in enumerate(indexed))
    self._context_to_words = CFD(context_word_pairs)
def __init__(self, corpus, n, estimator=None):
    """Train a bigram model on the leaves of the first ``n`` trees.

    Each sentence is the tree's leaves prefixed with ``'START'``; its length
    (including the prefix) is appended to ``self._l``.

    :param corpus: sliceable collection of trees exposing ``leaves()``.
    :param n: number of trees to use.
    :param estimator: ``f(fdist, bins)`` returning a probability
        distribution; defaults to maximum likelihood (``bins`` ignored).
    """
    if estimator is None:
        estimator = lambda fdist, bins: MLEProbDist(fdist)

    self._l = []
    pairs = []
    for tree in corpus[:n]:
        sentence = ['START'] + tree.leaves()
        pairs.extend(nltk.bigrams(sentence))
        self._l.append(len(sentence))

    counts = ConditionalFreqDist(pairs)
    # The number of distinct contexts is passed as the bin count.
    self._model = ConditionalProbDist(counts, estimator, len(counts))
def generateModel(MergedText, ngramModel, numSentences):
    '''
    Build an n-gram model from ``MergedText`` and generate sentences from it.

    Sentences with fewer than ``ngramModel`` words are removed with
    ``delete_short``; n-grams are then built and passed through
    ``boundaries``.  The n-grams, their conditional frequency table, the
    n-gram order, a per-context weight table and ``numSentences`` are handed
    to ``generateSentences``.  Nothing is returned.
    '''
    # NOTE(review): vocabulary, ngramSet, vocabularyOfWords and fdist are
    # built but never read in this function.
    vocabulary = set(MergedText)
    # Delete the words of sentences that have fewer than n words.
    MergedText = delete_short(MergedText, ngramModel)
    # presumably drops n-grams that cross a sentence boundary -- confirm in
    # boundaries()
    nngrams = boundaries(list(ngrams(MergedText, ngramModel)))
    cfd = ConditionalFreqDist()
    ngramSet = set()
    vocabularyOfWords = set()
    fdist = FreqDist()
    ProbDictionary = defaultdict(list)
    # Generate the conditional frequency distribution.
    for ngram in nngrams:
        ngramSet.add(ngram)
        # The context: the first n-1 items of the n-gram.
        initial_text = tuple(ngram[:-1])
        # The item being predicted: the last item of the n-gram.
        last_word = ngram[-1]
        cfd[initial_text][last_word] += 1
        # Laplace smoothing was considered here but is disabled:
        #laplace_prob = [1.0 * (1+cfd[initial_text][last_word]) / (len(vocabulary)+cfd[initial_text].N())]
        # Record every observed continuation (with repeats) per context.
        ProbDictionary[initial_text].append(last_word)
        vocabularyOfWords.add(last_word)
    # context -> {word: normalised weight}
    proDic = {}
    for key, value in ProbDictionary.items():
        words = set(value)
        proDic[key] = {}
        sum_freq = 0
        for word in words:
            # NOTE(review): the count is divided by len(word), the length of
            # the word itself, not by len(value); this weights short words
            # more heavily and divides by zero for an empty word.  Confirm
            # whether len(value) was intended.
            fre_word = value.count(word) / float(len(word))
            proDic[key][word] = fre_word
            sum_freq += fre_word
        # Normalise so the weights of one context sum to 1.
        for key_, value_ in proDic[key].items():
            proDic[key][key_] = value_ / sum_freq
    # Generate sentences.
    generateSentences(nngrams, cfd, ngramModel, proDic, numSentences)
def __init__(self, beam=1000, max_guess=20, rare_treshold=10, capitalization=True):
    """Create an untrained tagger with empty tables and zeroed weights.

    :param beam: stored as ``self._beam_size``.
    :param max_guess: stored as ``self._max_guess``.
    :param rare_treshold: stored as ``self._treshold``.
    :param capitalization: stored as ``self._use_capitalization``.
    """
    # Configuration taken from the arguments.
    self._beam_size = beam
    self._max_guess = max_guess
    self._treshold = rare_treshold
    self._use_capitalization = capitalization

    # Empty frequency tables.
    self._uni = FreqDist()
    self._bi, self._tri, self._wd = (ConditionalFreqDist() for _ in range(3))

    # Interpolation weights start at zero.
    self._l1 = self._l2 = self._l3 = 0.0

    self._unk = Guesser(10)
    self._analyzer = None
    self.cache = {}
def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    """Return the words whose association with some label reaches ``min_score``.

    :param labelled_words: iterable of ``(label, words)`` pairs.
    :param score_fn: called as ``score_fn(n_ii, (n_ix, n_xi), n_xx)`` with the
        word's count under the label, (its overall count, the label's token
        count) and the total token count.
    :param min_score: minimum score for a word to be kept.
    :return: set of words that reach ``min_score`` for at least one label.
    """
    overall_fd = FreqDist()
    per_label_fd = ConditionalFreqDist()
    for label, words in labelled_words:
        for word in words:
            overall_fd[word] += 1
            per_label_fd[label][word] += 1

    n_xx = per_label_fd.N()
    selected = set()
    for label in per_label_fd.conditions():
        label_fd = per_label_fd[label]
        n_xi = label_fd.N()
        selected.update(
            word
            for word, n_ii in label_fd.items()
            if score_fn(n_ii, (overall_fd[word], n_xi), n_xx) >= min_score
        )
    return selected
class TnT(TaggerI):
    '''
    TnT - Statistical POS tagger

    IMPORTANT NOTES:

    * DOES NOT AUTOMATICALLY DEAL WITH UNSEEN WORDS

      - It is possible to provide an untrained POS tagger to
        create tags for unknown words, see __init__ function

    * SHOULD BE USED WITH SENTENCE-DELIMITED INPUT

      - Due to the nature of this tagger, it works best when
        trained over sentence delimited input.
      - However it still produces good results if the training
        data and testing data are separated on all punctuation eg: [,.?!]
      - Input for training is expected to be a list of sentences
        where each sentence is a list of (word, tag) tuples
      - Input for tag function is a single sentence
        Input for tagdata function is a list of sentences
        Output is of a similar form

    * Function provided to process text that is unsegmented

      - Please see basic_sent_chop()

    TnT uses a second order Markov model to produce tags for
    a sequence of input, specifically:

      argmax [Proj(P(t_i|t_i-1,t_i-2)P(w_i|t_i))] P(t_T+1 | t_T)

    IE: the maximum projection of a set of probabilities

    The set of possible tags for a given word is derived
    from the training data. It is the set of all tags
    that exact word has been assigned.

    To speed up and get more precision, log addition is used
    instead of multiplication, specifically:

      argmax [Sigma(log(P(t_i|t_i-1,t_i-2))+log(P(w_i|t_i)))] +
             log(P(t_T+1|t_T))

    The probability of a tag for a given word is the linear
    interpolation of 3 markov models; a zero-order, first-order,
    and a second order model.

      P(t_i| t_i-1, t_i-2) = l1*P(t_i) + l2*P(t_i| t_i-1) +
                             l3*P(t_i| t_i-1, t_i-2)

    A beam search is used to limit the memory usage of the algorithm.
    The degree of the beam can be changed using N in the initialization.
    N represents the maximum number of possible solutions to maintain
    while tagging.

    It is possible to differentiate the tags which are assigned to
    capitalized words. However this does not result in a significant
    gain in the accuracy of the results.
    '''

    def __init__(self, unk=None, Trained=False, N=1000, C=False):
        '''
        Construct a TnT statistical tagger. Tagger must be trained
        before being used to tag input.

        :param unk: instance of a POS tagger, conforms to TaggerI
        :type  unk:(TaggerI)
        :param Trained: Indication that the POS tagger is trained or not
        :type  Trained: boolean
        :param N: Beam search degree (see above)
        :type  N:(int)
        :param C: Capitalization flag
        :type  C: boolean

        Initializer, creates frequency distributions to be used
        for tagging

        _lx values represent the portion of the tri/bi/uni taggers
        to be used to calculate the probability

        N value is the number of possible solutions to maintain
        while tagging. A good value for this is 1000

        C is a boolean value which specifies to use or
        not use the Capitalization of the word as additional
        information for tagging.
        NOTE: using capitalization may not increase the accuracy
        of the tagger
        '''
        self._uni = FreqDist()
        self._bi = ConditionalFreqDist()
        self._tri = ConditionalFreqDist()
        self._wd = ConditionalFreqDist()
        self._eos = ConditionalFreqDist()
        self._l1 = 0.0
        self._l2 = 0.0
        self._l3 = 0.0
        self._N = N
        self._C = C
        self._T = Trained

        self._unk = unk

        # statistical tools (ignore or delete me)
        self.unknown = 0
        self.known = 0

    def train(self, data):
        '''
        Uses a set of tagged data to train the tagger.
        If an unknown word tagger is specified,
        it is trained on the same data.

        Empty sentences contribute nothing to the counts.

        :param data: List of lists of (word, tag) tuples
        :type data: tuple(str)
        '''
        # Ensure that local C flag is initialized before use
        C = False

        if self._unk is not None and self._T == False:
            self._unk.train(data)

        for sent in data:
            history = [('BOS', False), ('BOS', False)]
            saw_token = False
            for w, t in sent:
                saw_token = True

                # if capitalization is requested,
                # and the word begins with a capital
                # set local flag C to True
                if self._C and w[0].isupper():
                    C = True

                # ``fd[x] += 1`` works on NLTK 2 and 3; ``inc`` is gone in 3.
                self._wd[w][t] += 1
                self._uni[(t, C)] += 1
                self._bi[history[1]][(t, C)] += 1
                self._tri[tuple(history)][(t, C)] += 1

                history.append((t, C))
                history.pop(0)

                # set local flag C to false for the next word
                C = False

            # Only record an end-of-sentence event for the tag that really
            # ended this sentence; with an empty sentence ``t`` would be
            # undefined or left over from the previous sentence.
            if saw_token:
                self._eos[t]['EOS'] += 1

        # compute lambda values from the trained frequency distributions
        self._compute_lambda()

    def _compute_lambda(self):
        '''
        creates lambda values based upon training data

        NOTE: no need to explicitly reference C,
        it is contained within the tag variable :: tag == (tag,C)

        for each tag trigram (t1, t2, t3)
        depending on the maximum value of
        - f(t1,t2,t3)-1 / f(t1,t2)-1
        - f(t2,t3)-1 / f(t2)-1
        - f(t3)-1 / N-1

        increment l3,l2, or l1 by f(t1,t2,t3)

        ISSUES -- Resolutions:
        if 2 values are equal, increment both lambda values
        by (f(t1,t2,t3) / 2)

        If no trigram contributes to any lambda (for example with no
        training data), the lambdas are left unchanged instead of dividing
        by zero.
        '''
        # temporary lambda variables
        tl1 = 0.0
        tl2 = 0.0
        tl3 = 0.0

        # for each t1,t2 in system
        for history in self._tri.conditions():
            (h1, h2) = history

            # for each t3 given t1,t2 in system
            # (NOTE: tag actually represents (tag,C))
            # However no effect within this function
            # Iterating the distribution yields its samples on every NLTK
            # release; ``samples()`` was removed in NLTK 3.
            for tag in list(self._tri[history]):

                # if there has only been 1 occurrence of this tag in the data
                # then ignore this trigram.
                if self._uni[tag] == 1:
                    continue

                # safe_div provides a safe floating point division
                # it returns -1 if the denominator is 0
                c3 = self._safe_div((self._tri[history][tag] - 1),
                                    (self._tri[history].N() - 1))
                c2 = self._safe_div((self._bi[h2][tag] - 1),
                                    (self._bi[h2].N() - 1))
                c1 = self._safe_div((self._uni[tag] - 1),
                                    (self._uni.N() - 1))

                # if c1 is the maximum value:
                if (c1 > c3) and (c1 > c2):
                    tl1 += self._tri[history][tag]

                # if c2 is the maximum value
                elif (c2 > c3) and (c2 > c1):
                    tl2 += self._tri[history][tag]

                # if c3 is the maximum value
                elif (c3 > c2) and (c3 > c1):
                    tl3 += self._tri[history][tag]

                # if c3, and c2 are equal and larger than c1
                elif (c3 == c2) and (c3 > c1):
                    tl2 += float(self._tri[history][tag]) / 2.0
                    tl3 += float(self._tri[history][tag]) / 2.0

                # if c1, and c2 are equal and larger than c3
                # this might be a dumb thing to do....(not sure yet)
                elif (c2 == c1) and (c1 > c3):
                    tl1 += float(self._tri[history][tag]) / 2.0
                    tl2 += float(self._tri[history][tag]) / 2.0

                # otherwise there might be a problem
                # eg: all values = 0
                else:
                    pass

        # Lambda normalisation:
        # ensures that l1+l2+l3 = 1
        total = tl1 + tl2 + tl3
        if total == 0:
            # Nothing to normalise; keep the current lambda values.
            return
        self._l1 = tl1 / total
        self._l2 = tl2 / total
        self._l3 = tl3 / total

    def _safe_div(self, v1, v2):
        '''
        Safe floating point division function, does not allow division by 0
        returns -1 if the denominator is 0
        '''
        if v2 == 0:
            return -1
        else:
            return float(v1) / float(v2)

    def tagdata(self, data):
        '''
        Tags each sentence in a list of sentences

        :param data:list of list of words
        :type data: [[string,],]
        :return: list of list of (word, tag) tuples

        Invokes tag(sent) function for each sentence
        compiles the results into a list of tagged sentences
        each tagged sentence is a list of (word, tag) tuples
        '''
        res = []
        for sent in data:
            res1 = self.tag(sent)
            res.append(res1)
        return res

    def tag(self, data):
        '''
        Tags a single sentence

        :param data: list of words
        :type data: [string,]

        :return: [(word, tag),]

        Calls recursive function '_tagword'
        to produce a list of tags

        Associates the sequence of returned tags
        with the correct words in the input sequence

        returns a list of (word, tag) tuples
        '''
        current_state = [(['BOS', 'BOS'], 0.0)]

        sent = list(data)

        tags = self._tagword(sent, current_state)

        res = []
        # The first two entries of ``tags`` are the 'BOS' markers.
        for word, (t, C) in zip(sent, tags[2:]):
            # unpack and discard the C flags
            res.append((word, t))

        return res

    def _tagword(self, sent, current_states):
        '''
        :param sent : List of words remaining in the sentence
        :type sent  : [word,]
        :param current_states : List of possible tag combinations for
                                the sentence so far, and the log probability
                                associated with each tag combination
        :type current_states  : [([tag, ], logprob), ]

        Tags the first word in the sentence and
        recursively tags the remainder of the sentence

        Uses formula specified above to calculate the probability
        of a particular tag
        '''
        # if this word marks the end of the sentence,
        # return the most probable tag
        if sent == []:
            (h, logp) = current_states[0]
            return h

        # otherwise there are more words to be tagged
        word = sent[0]
        sent = sent[1:]
        new_states = []

        # if the Capitalisation is requested,
        # initialise the flag for this word
        C = False
        if self._C and word[0].isupper():
            C = True

        # if word is known
        # compute the set of possible tags
        # and their associated log probabilities
        if word in self._wd.conditions():
            self.known += 1

            for (history, curr_sent_logprob) in current_states:
                logprobs = []

                for t in list(self._wd[word]):
                    p_uni = self._uni.freq((t, C))
                    p_bi = self._bi[history[-1]].freq((t, C))
                    p_tri = self._tri[tuple(history[-2:])].freq((t, C))
                    p_wd = float(self._wd[word][t]) / float(self._uni[(t, C)])
                    p = self._l1 * p_uni + self._l2 * p_bi + self._l3 * p_tri
                    p2 = log(p, 2) + log(p_wd, 2)

                    logprobs.append(((t, C), p2))

                # compute the result of appending each tag to this history
                for (tag, logprob) in logprobs:
                    new_states.append((history + [tag],
                                       curr_sent_logprob + logprob))

        # otherwise a new word, set of possible tags is unknown
        else:
            self.unknown += 1

            # since a set of possible tags,
            # and the probability of each specific tag
            # can not be returned from most classifiers:
            # specify that any unknown words are tagged with certainty
            p = 1

            # if no unknown word tagger has been specified
            # then use the tag 'Unk'
            if self._unk is None:
                tag = ('Unk', C)

            # otherwise apply the unknown word tagger
            else:
                [(_w, t)] = list(self._unk.tag([word]))
                tag = (t, C)

            for (history, logprob) in current_states:
                history.append(tag)

            new_states = current_states

        # now have computed a set of possible new_states

        # sort states by log prob
        # set is now ordered greatest to least log probability
        new_states.sort(reverse=True, key=itemgetter(1))

        # del everything after N (threshold)
        # this is the beam search cut
        if len(new_states) > self._N:
            new_states = new_states[:self._N]

        # compute the tags for the rest of the sentence
        # return the best list of tags for the sentence
        return self._tagword(sent, new_states)
for filename in files: f = open(os.path.join(corpus_path, filename), "r") metadata, raw_text = f.read().split("<!--end metadata-->") all_metadata.append(parse_metadata(metadata)) # <markdowncell> # Now that we're confident that the function works, let's find out a bit about the corpus. # As a start, it would be useful to know which years the texts are from. Are they evenly distributed over time? A graph will tell us! # <codecell> #import conditional frequency distribution from nltk.probability import ConditionalFreqDist import matplotlib % matplotlib inline cfdist = ConditionalFreqDist() for filename in os.listdir(corpus_path): text = open(os.path.join(corpus_path, filename)).read() #split text of file on 'end metadata' text = text.split("<!--end metadata-->") #parse metadata using previously defined function "parse_metadata" metadata = parse_metadata(text[0]) #skip all speeches for which there is no exact date if metadata['Date'][0] == 'c': continue #build a frequency distribution graph by year, that is, take the final bit of the 'Date' string after '/' cfdist['count'][metadata['Date'].split('/')[-1]] += 1 cfdist.plot() # <markdowncell> # Now let's build another graph, but this time by the 'Description' field:
def __init__(self, n, train, pad_left=True, pad_right=False,
             estimator=None, **estimator_kwargs):
    """
    Create an ngram language model from review texts.

    The reviews are obtained with ``read_reviews(train)``; the ``'text'``
    entry of each review is tokenised with ``word_tokenize`` and
    lower-cased.  A progress line is printed every 10000 reviews.  For
    ``n > 1`` a backoff model of order ``n - 1`` is trained recursively
    from the same ``train`` argument and backoff weights are computed for
    every observed context.

    :param n: the order of the language model (ngram size); must be > 0
    :type n: int
    :param train: passed unchanged to ``read_reviews``
    :param pad_left: whether to pad the left of each review with empty
        strings
    :type pad_left: bool
    :param pad_right: whether to pad the right of each review with the
        symbol ``'...EOR...'``
    :type pad_right: bool
    :param estimator: a function for generating a probability
        distribution; ``_estimator`` is used when omitted
    :param estimator_kwargs: Extra keyword arguments for the estimator.
        ``bins`` defaults to twice the number of distinct tokens seen.
    :type estimator_kwargs: (any)
    """
    # protection from cryptic behavior for calling programs
    # that use the pre-2.0.2 interface
    assert(isinstance(pad_left, bool))
    assert(isinstance(pad_right, bool))

    # n must be greater than zero; the value is the assertion message
    assert (n > 0), n

    self._n = n
    # For explicitness save the check whether this is a unigram model
    self.is_unigram_model = (n == 1)
    self._lpad = ('',) * (n - 1) if pad_left else ()
    self._rpad = ('',) * (n - 1) if pad_right else ()

    if estimator is None:
        estimator = _estimator

    cfd = ConditionalFreqDist()

    # set read-only ngrams set
    self._ngrams = set()

    # the word types encountered, used for the default number of bins
    vocabulary = set()

    for count, review in enumerate(read_reviews(train), 1):
        if count % 10000 == 0:
            print(str(count) + ' reviews processed')

        # each review is a mapping; only its text is used
        tokens = [w.lower() for w in word_tokenize(review['text'])]

        for ngram in ngrams(tokens, n, pad_left, pad_right,
                            left_pad_symbol='',
                            right_pad_symbol='...EOR...'):
            self._ngrams.add(ngram)
            cfd[tuple(ngram[:-1])][ngram[-1]] += 1
            vocabulary.add(ngram[-1])

    # Unless number of bins is explicitly passed, use twice the number of
    # word types encountered during training as the bins value.
    # If right padding is on, this includes the padding symbol.
    estimator_kwargs.setdefault('bins', len(vocabulary) * 2)

    self._model = ConditionalProbDist(cfd, estimator, **estimator_kwargs)
    self._probdist = self._model

    if self.is_unigram_model:
        return

    # recursively construct the lower-order models
    self._backoff = NgramModel(n - 1, train, pad_left, pad_right,
                               estimator, **estimator_kwargs)

    self._backoff_alphas = dict()
    # For each condition (or context)
    for ctxt in cfd.conditions():
        backoff_ctxt = ctxt[1:]
        observed_pr = 0.0
        backoff_pr = 0.0

        # only the words that were OBSERVED following this context,
        # i.e. Count(word | context) > 0
        for word in self._words_following(ctxt, cfd):
            observed_pr += self.prob(word, ctxt)
            # the (n-1)-gram probability of the same words
            backoff_pr += self._backoff.prob(word, backoff_ctxt)

        assert (0 <= observed_pr <= 1), observed_pr
        # beta is the probability mass left after the observed words
        beta = 1.0 - observed_pr

        # the backoff total must stay below one, otherwise the
        # denominator below would be zero or negative
        assert (0 <= backoff_pr < 1), backoff_pr
        self._backoff_alphas[ctxt] = beta / (1.0 - backoff_pr)
class ContextIndex(object):
    """
    A bidirectional index between words and their 'contexts' in a text.
    The context of a word is usually defined to be the words that occur
    in a fixed window around the word; but other definitions may also
    be used by providing a custom context function.
    """

    @staticmethod
    def _default_context(tokens, i):
        """One left token and one right token, normalized to lowercase"""
        left = tokens[i - 1].lower() if i != 0 else '*START*'
        right = tokens[i + 1].lower() if i != len(tokens) - 1 else '*END*'
        return (left, right)

    def __init__(self, tokens, context_func=None, filter=None, key=lambda x: x):
        """
        :param tokens: the token sequence; kept unfiltered for ``tokens()``.
        :param context_func: ``f(tokens, i)`` giving the context of token
            ``i``; defaults to ``_default_context``.
        :param filter: optional predicate selecting the tokens to index.
        :param key: normalisation applied to every word before indexing.
        """
        self._key = key
        self._tokens = tokens
        if context_func:
            self._context_func = context_func
        else:
            self._context_func = self._default_context
        if filter:
            tokens = [t for t in tokens if filter(t)]
        self._word_to_contexts = CFD(
            (self._key(w), self._context_func(tokens, i)) for i, w in enumerate(tokens)
        )
        self._context_to_words = CFD(
            (self._context_func(tokens, i), self._key(w)) for i, w in enumerate(tokens)
        )

    def tokens(self):
        """
        :rtype: list(str)
        :return: The document that this context index was
            created from.
        """
        return self._tokens

    def word_similarity_dict(self, word):
        """
        Return a dictionary mapping from words to 'similarity scores,'
        indicating how often these two words occur in the same
        context.
        """
        word = self._key(word)
        word_contexts = set(self._word_to_contexts[word])

        scores = {}
        for w, w_contexts in self._word_to_contexts.items():
            scores[w] = f_measure(word_contexts, set(w_contexts))

        return scores

    def similar_words(self, word, n=20):
        """
        Return up to ``n`` words that share contexts with ``word``, ordered
        from most to least similar.  The score of a candidate is the sum,
        over the shared contexts, of the product of both words' counts in
        that context.
        """
        # Normalise once: the index stores keyed words, so the raw word
        # would neither match itself nor find its own counts whenever
        # ``key`` is not the identity.
        word = self._key(word)
        scores = defaultdict(int)
        for c in self._word_to_contexts[word]:
            for w in self._context_to_words[c]:
                if w != word:
                    scores[w] += (
                        self._context_to_words[c][word] * self._context_to_words[c][w]
                    )
        return sorted(scores, key=scores.get, reverse=True)[:n]

    def common_contexts(self, words, fail_on_unknown=False):
        """
        Find contexts where the specified words can all appear; and
        return a frequency distribution mapping each context to the
        number of times that context was used.

        :param words: The words used to seed the similarity search
        :type words: list(str)
        :param fail_on_unknown: If true, then raise a value error if
            any of the given words do not occur at all in the index.
        :raise ValueError: if ``fail_on_unknown`` is true and some words
            have no contexts; the message lists those words.
        """
        words = [self._key(w) for w in words]
        contexts = [set(self._word_to_contexts[w]) for w in words]
        empty = [w for w, ctx in zip(words, contexts) if not ctx]
        if empty and fail_on_unknown:
            # Report only the words that are actually missing.
            raise ValueError("The following word(s) were not found:", " ".join(empty))
        # ``reduce`` has no initial value, so guard the empty word list.
        common = reduce(set.intersection, contexts) if contexts else set()
        if not common:
            # nothing in common -- just return an empty freqdist.
            return FreqDist()
        return FreqDist(
            c for w in words for c in self._word_to_contexts[w] if c in common
        )
# Single-argument ``print(...)`` emits the same text on Python 2 and 3.
print("Negative: " + str(n))
print("Neutral: " + str(nt))

# Extracting features
# Using the feature set provided
#fvecs = [(make_tweet_dict(t),s) for (t,s) in tweets]
# Extracting features from data
fvecs = [(get_tweet_features(t, set()), s) for (t, s) in tweets]
#pprint.pprint(fvecs)

# Extract best word features: count every feature whose value is truthy,
# overall and per label.
word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()

for (feats, label) in fvecs:
    for key in feats:
        if feats[key]:
            # ``+= 1`` works on NLTK 2 and 3; ``inc`` was removed in NLTK 3.
            word_fd[key] += 1
            label_word_fd[label][key] += 1

print(label_word_fd.conditions())
cls_set = label_word_fd.conditions()
from nltk.corpus import brown
from nltk.probability import ConditionalFreqDist

genres = ['news', 'romance']
days = set("Monday Tuesday Wednesday Thursday Friday Saturday Sunday".split())


def _day_mentions(genre_names, day_names):
    """Yield a (genre, word) pair for every Brown word that is a day name."""
    for genre_name in genre_names:
        for token in brown.words(categories=genre_name):
            if token in day_names:
                yield (genre_name, token)


# Tabulate how often each weekday is mentioned in each genre.
cfd = ConditionalFreqDist(_day_mentions(genres, days))
cfd.tabulate()
bigram_finder = BigramCollocationFinder.from_words(words) bigrams = bigram_finder.nbest(score_fn, n) return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)]) evaluations.append(evaluate_classifier(bigram_word_feats,BigramAssocMeasures.chi_sq))#Works best for this Data #evaluations.append(evaluate_classifier(bigram_word_feats,BigramAssocMeasures.jaccard)) #evaluations.append(evaluate_classifier(bigram_word_feats,BigramAssocMeasures.likelihood_ratio)) # In[3]: from nltk.collocations import * from nltk.probability import FreqDist from nltk.probability import ConditionalFreqDist word_fd = FreqDist() label_word_fd = ConditionalFreqDist() testNegWords = movie_reviews.words(categories=['pos']) testPosWords = movie_reviews.words(categories=['neg']) for word in testNegWords: word_fd[word.lower()]+=1 label_word_fd['neg'][word.lower()]+=1 for word in testPosWords: word_fd[word.lower()]+=1 label_word_fd['pos'][word.lower()]+=1 print(word_fd.N(),word_fd.B(),word_fd.most_common(20)) print(label_word_fd.N(),label_word_fd.conditions(),label_word_fd.items()) print(label_word_fd['pos'].N(),label_word_fd['neg'].N())