def fun14():
    """counting other things"""
    # print [len(w) for w in text1]
    fdist1 = FreqDist([len(w) for w in text1])
    # print fdist1.keys()
    # print fdist1.items()
    # word length 3 => 50223
    print fdist1[3]
    print fdist1.max()
    # frequency 20%
    print fdist1.freq(3)
def binary_stump(feature_name, feature_value, labeled_featuresets):
    label = FreqDist([label for (featureset, label) in labeled_featuresets]).max()

    # Find the best label for each value.
    pos_fdist = FreqDist()
    neg_fdist = FreqDist()
    for featureset, label in labeled_featuresets:
        if featureset.get(feature_name) == feature_value:
            pos_fdist.inc(label)
        else:
            neg_fdist.inc(label)

    decisions = {feature_value: DecisionTreeClassifier(pos_fdist.max())}
    default = DecisionTreeClassifier(neg_fdist.max())

    return DecisionTreeClassifier(label, feature_name, decisions, default)
def classify(self, feats):
    counts = FreqDist()
    for classifier in self._classifiers:
        counts.inc(classifier.classify(feats))
    return counts.max()
def choose_tag(self, tokens, index, history):
    tags = FreqDist()
    for tagger in self._taggers:
        tags.inc(tagger.choose_tag(tokens, index, history))
    return tags.max()
def choose_tag(self, tokens, index, history):
    word = tokens[index]
    fd = FreqDist()
    for synset in wordnet.synsets(word):
        fd.inc(synset.pos)
    return self.wordnet_tag_map.get(fd.max())
def classify(self, feat):
    '''Return the label with the most agreement among classifiers'''
    label_freqs = FreqDist()
    for classifier in self._classifiers:
        label_freqs.inc(classifier.classify(feat))
    return label_freqs.max()
def shiftByAlpha(alphas, cipherText, common, reverse):
    key = []
    for alpha in alphas:
        fdist = FreqDist(alpha)
        if reverse:
            shift = (ord(common) - ord(fdist.max()))
        else:
            shift = (ord(fdist.max()) - ord(common))
        key.append(shift)
        print('shift ' + str(shift))
    keyLen = len(key)
    res = ''
    for i in range(0, len(cipherText)):
        c = chr((ord(cipherText[i]) + key[i % keyLen]) % 128)
        res += c
    print(res)
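The shift recovery above rests on one assumption: the most frequent symbol in each ciphertext column (fdist.max()) lines up with the expected most common plaintext symbol. A self-contained sketch of that idea for the single-shift case; the toy plaintext, the shift of 3, and the choice of the space character as `common` are invented for illustration:

from nltk.probability import FreqDist

# Toy Caesar-style cipher: shift every character code by 3 (mod 128).
plaintext = "the cat sat on the mat and the dog ran to the cat"
ciphertext = ''.join(chr((ord(c) + 3) % 128) for c in plaintext)

# Frequency analysis: the most common ciphertext symbol should map to the
# most common plaintext symbol (here the space character).
fdist = FreqDist(ciphertext)
common = ' '
shift = ord(fdist.max()) - ord(common)
print(shift)  # 3, the key we started with

# Undo the shift to recover the plaintext.
recovered = ''.join(chr((ord(c) - shift) % 128) for c in ciphertext)
print(recovered == plaintext)  # True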
def __compute_tf__(self, term, doc_terms):
    """
    Computes the normalized frequency of term t in document d,
    which is the number of times t occurs in d divided by the
    maximum number of times any term occurs in d:
    tf(t,d) = f(t,d) / max{f(w,d)}
    """
    fdist = FreqDist(term.lower() for term in doc_terms)
    max_freq = doc_terms.count(fdist.max())
    if max_freq == 0:
        return 0.0
    return float(doc_terms.count(term)) / max_freq
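The docstring's formula can be checked by hand with FreqDist alone. A minimal sketch over a made-up term list, where fdist[fdist.max()] stands in for max{f(w,d)}:

from nltk.probability import FreqDist

doc_terms = ['data', 'mining', 'data', 'data', 'science']
fdist = FreqDist(doc_terms)

# f(t,d) for t = 'mining' is 1; the maximum frequency of any term is 3 ('data'),
# so tf('mining', d) = 1 / 3.
max_freq = fdist[fdist.max()]
tf = fdist['mining'] / max_freq
print(tf)  # 0.333...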
def choose_tag(self, tokens, index, history):
    word = tokens[index]
    fd = FreqDist()
    for synset in wordnet.synsets(word):
        fd[synset.pos()] += 1
    if not fd:
        return None
    return self.wordnet_tag_map.get(fd.max())
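For a quick sense of what this tagger does per word, the same counting can be run standalone: tally the part of speech of every WordNet synset for a word and map the winner to a Penn tag. The word 'book' and the tag map below are illustrative; most WordNet senses of 'book' are nouns, so fd.max() comes back 'n':

from nltk.corpus import wordnet
from nltk.probability import FreqDist

wordnet_tag_map = {'n': 'NN', 's': 'JJ', 'a': 'JJ', 'r': 'RB', 'v': 'VB'}

fd = FreqDist(synset.pos() for synset in wordnet.synsets('book'))
print(fd.max(), wordnet_tag_map.get(fd.max()))  # 'n' NN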
def binary_stump(feature_name, feature_value, labeled_featuresets):
    label = FreqDist(label for (featureset, label) in labeled_featuresets).max()

    # Find the best label for each value.
    pos_fdist = FreqDist()
    neg_fdist = FreqDist()
    for featureset, label in labeled_featuresets:
        if featureset.get(feature_name) == feature_value:
            pos_fdist[label] += 1
        else:
            neg_fdist[label] += 1

    decisions = {}
    default = label
    # But hopefully we have observations!
    if pos_fdist.N() > 0:
        decisions = {feature_value: DecisionTreeClassifier(pos_fdist.max())}
    if neg_fdist.N() > 0:
        default = DecisionTreeClassifier(neg_fdist.max())

    return DecisionTreeClassifier(label, feature_name, decisions, default)
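The first line of the stump is the piece that uses FreqDist.max(): the default label is simply the majority class over the training pairs. A tiny sketch with hypothetical labeled featuresets:

from nltk.probability import FreqDist

# Hypothetical labeled featuresets, as (features, label) pairs.
labeled_featuresets = [
    ({'last_letter': 'a'}, 'female'),
    ({'last_letter': 'k'}, 'male'),
    ({'last_letter': 'a'}, 'female'),
    ({'last_letter': 'o'}, 'male'),
    ({'last_letter': 'a'}, 'female'),
]

# The stump's fallback label is the majority class.
label = FreqDist(label for (featureset, label) in labeled_featuresets).max()
print(label)  # 'female'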
def choose_tag(self, tokens, index, history):
    word = tokens[index]
    if word is None:
        return None
    fd = FreqDist()
    for synset in wordnet.synsets(word):
        fd[synset.pos] += 1
    try:
        return self.wordnet_tag_map.get(fd.max())
    except:  # in case fd is empty
        return None
def worst_errors_many_wrong_decisions(self, k, feature_extractor):
    worst_errors = []
    features = []
    wrongDocs = self.error_prediction_docs(self.maintest, self.testClassify)
    for doc in wrongDocs:
        feature_dic = feature_extractor(movie_reviews.words(fileids=[doc]))
        features = features + feature_dic.keys()
    fd = FreqDist(feature.lower() for feature in features)
    for i in range(1, k + 1):
        x = fd.max()
        fd.pop(x)
        worst_errors.append(x)
    return worst_errors
def get_best_answers(self, passage_list, q):
    logger = logging.getLogger("qa_logger")
    logger.info("%s:\tAnswer Processing", q.id_q)

    empty = passage_list == []

    logger.info("%s:\t\tAnswer Extraction", q.id_q)

    answer_list = []
    for passage in passage_list:
        a = passage.find_answer(q)
        if a.is_successful():
            answer_list.append(a)

    if not answer_list:
        return ([], empty)

    logger.info("%s:\t\tAnswer Filtering", q.id_q)

    # Obtain answer frequency
    fd = FreqDist(answer_list)

    # Normalize frequencies
    normalize = fd.freq(fd.max())

    # Modify scores by frequency
    for answer in answer_list:
        answer.score = int(answer.score * (fd.freq(answer) / normalize))

    # Sort answers by score
    answer_list.sort(key=lambda x: x.score, reverse=True)

    # Filter bad answers
    try:
        threshold = int(MyConfig.get("answer_filtering", "threshold"))
    except:
        logger = logging.getLogger("qa_logger")
        logger.error("answer quality threshold not found")
        threshold = 50

    answer_list = filter(lambda x: x.score > threshold, answer_list)

    final_answers = []
    for a in answer_list:
        if a not in final_answers:
            final_answers.append(a)
        if len(final_answers) == 3:
            break

    return (final_answers, empty)
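The frequency-normalization step in the middle is where FreqDist.max() does the work: every answer's score is rescaled by how often it was extracted relative to the most frequent answer. A minimal sketch of just that step, with plain strings standing in for the Answer objects:

from nltk.probability import FreqDist

# Hypothetical extracted answers (strings stand in for Answer objects).
answers = ['1969', '1969', '1970', '1969', '1971']
fd = FreqDist(answers)

# freq() returns relative frequencies, so dividing by the top answer's
# frequency rescales everything to (0, 1] relative to the winner.
normalize = fd.freq(fd.max())
for a in fd:
    print(a, fd.freq(a) / normalize)
# '1969' -> 1.0, '1970' -> 0.333..., '1971' -> 0.333...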
def xorByAlpha(alphas, cipherText, common):
    key = []
    for alpha in alphas:
        fdist = FreqDist(alpha)
        kxor = (ord(fdist.max()) ^ ord(common))
        key.append(kxor)
    keyLen = len(key)
    res = ''
    for i in range(0, len(cipherText)):
        c = chr((ord(cipherText[i]) ^ key[i % keyLen]))
        res += c
    print(res)
def binary_stump(feature_name, feature_value, labeled_featuresets):
    label = FreqDist(label for (featureset, label) in labeled_featuresets).max()

    # Find the best label for each value.
    pos_fdist = FreqDist()
    neg_fdist = FreqDist()
    for featureset, label in labeled_featuresets:
        if featureset.get(feature_name) == feature_value:
            pos_fdist[label] += 1
        else:
            neg_fdist[label] += 1

    decisions = {}
    default = label
    # But hopefully we have observations!
    if pos_fdist.N() > 0:
        decisions = {feature_value: DecisionTreeClassifier(pos_fdist.max())}
    if neg_fdist.N() > 0:
        default = DecisionTreeClassifier(neg_fdist.max())

    return DecisionTreeClassifier(label, feature_name, decisions, default)
def _entity_ranking(self, entities):
    if len(entities) == 0:
        return "", "", int(0)

    # Obtain frequency of entities
    entities_freq = FreqDist(entities)

    # Our answer is the sample with the greatest number of outcomes
    exact = entities_freq.max()

    # Our window is empty because this algorithm generates exact answers
    window = ""

    # Our score is the entity frequency
    score = int(entities_freq.freq(exact) * 1000)

    return exact, window, score
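The same pattern in isolation, with a made-up list of entity mentions: the exact answer is the modal entity and the score is its relative frequency scaled to a 0-1000 range:

from nltk.probability import FreqDist

# Hypothetical entity mentions collected from retrieved passages.
entities = ['Paris', 'Paris', 'Lyon', 'Paris', 'Marseille']
entities_freq = FreqDist(entities)

exact = entities_freq.max()                    # 'Paris'
score = int(entities_freq.freq(exact) * 1000)  # 3/5 -> 600
print(exact, score)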
def choose_tag(self, tokens, index, history):
    context = self.context(tokens, index, history)
    s = self._morph.parse(tokens[index])
    tags = [unicode(x.tag).replace(u' ', u',') for x in s]
    if len(tags) == 0:
        return None
    if (len(tags) == 1) or not (context in self._contexts_to_tags.keys()):
        return tags[0]
    tagsconts = FreqDist()
    for tag in tags:
        #print 'TAG: ', tag
        #print tokens[index]
        tagsconts[tag] = self._contexts_to_tags[context].get(tag, 0)
        #print 'PROB: | ', context, tagsconts[tag]
    best_tag = tagsconts.max()
    if tagsconts[best_tag] == 0:
        return tags[0]
    return best_tag
def next(self, s, method=MOST_LIKELY):
    # Pick a transition leaving state s and return a state that would
    # likely follow. The next state is chosen according to the method
    # specified. The default is to choose and return the most likely
    # transition state.

    # determine all states adjacent to s
    transitions = self._adjacentVertices[s]
    freqDist = FreqDist()

    # determine the weights of the edges between state s and all adjacent states
    for state in transitions:
        freqDist.inc(state)

    if method == MarkovChain.MOST_LIKELY:
        return freqDist.max()
    elif method == MarkovChain.LEAST_LIKELY:
        # NLTK provides no built-in method to return the minimum of a
        # frequency distribution so for now, we get a list of samples
        # sorted in decreasing order and grab the last one.
        return freqDist.sorted_samples()[-1]
    else:
        # choose a real number between 0 and 1
        x = uniform(0, 1)
        # choose next state based on weights of the edges. Randomness plays a part here.
        for i in range(len(transitions)):
            probability = freqDist.freq(transitions[i])
            if x < probability:
                return transitions[i]
            x = x - probability
        raise Exception("Error in MarkovChain.next(). Did not find next state.\n")
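The comment in the LEAST_LIKELY branch dates this snippet: sorted_samples() was removed from NLTK long ago. With the current FreqDist (a collections.Counter subclass) both lookups can be written directly; the transition list below is made up:

from nltk.probability import FreqDist

transitions = ['sunny', 'sunny', 'rainy', 'sunny', 'cloudy', 'rainy']
freq_dist = FreqDist(transitions)

most_likely = freq_dist.max()                     # 'sunny'  (highest count)
least_likely = min(freq_dist, key=freq_dist.get)  # 'cloudy' (lowest count)
print(most_likely, least_likely)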
class WordNetTagger(SequentialBackoffTagger):
    """
    Class implementation of the wordnet tagger
    """
    def __init__(self, *args, **kwargs):
        SequentialBackoffTagger.__init__(self, *args, **kwargs)
        self.wordnet_tag_map = {
            'n': 'NN',
            's': 'JJ',
            'a': 'JJ',
            'r': 'RB',
            'v': 'VB'
        }
        self.fd = FreqDist(treebank.words())

    def choose_tag(self, tokens, index, history):
        """
        Chooses a POS tag based on the wordnet tag
        """
        word = tokens[index]
        for synset in wordnet.synsets(word):
            self.fd[synset.pos()] += 1
        return self.wordnet_tag_map.get(self.fd.max())
print("2.", len(cess_esp.words())) # 3 print("3.", len(cess_esp.sents())) # 4 from nltk.probability import FreqDist first_file = cess_esp.fileids()[0] cess_freq0 = FreqDist(cess_esp.words(first_file)) print("4.", cess_freq0.most_common(20)) # 5 print("5.", [w for w, k in cess_freq0.most_common()]) # 6 print("6.", [w for w, k in cess_freq0.items() if len(w) > 7 and k > 2]) # 7 print("7.", [k for w, k in cess_freq0.most_common()]) print("7b. Freq de aparición de la preposición a", cess_freq0.get("a", 0)) # 8 print("8. No de palabras que aparecen una sola vez:", len([w for w, k in cess_freq0.items() if k == 1])) # 9 print("9. La palabra más frecuente es", cess_freq0.max()) # 10 from nltk.corpus import PlaintextCorpusReader mycorpus = PlaintextCorpusReader("../res/", ".*") # 11 print("11.") for doc in mycorpus.fileids(): print(doc, len(mycorpus.words(doc)), len(set(mycorpus.words(doc))), len(mycorpus.sents(doc)))
freq_dist.inc(token['TEXT'])

# How many times did "the" occur?
freq_dist.count('the')
# What was the frequency of the word "the"?
freq_dist.freq('the')
# How many word tokens were counted?
freq_dist.N()
# What word types were encountered?
freq_dist.samples()
# What was the most common word?
freq_dist.max()

# What is the distribution of word lengths in a corpus?
freq_dist = FreqDist()
for token in corpus['SUBTOKENS']:
    freq_dist.inc(len(token['TEXT']))

# Plot the results.
wordlens = freq_dist.samples()
# Sort the list
wordlens.sort()
# create a tuple with a frequency number and its
# respective distribution
# to visualize, run the command: print points
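That walkthrough uses the pre-1.x NLTK API (inc(), count(), samples()), which no longer exists. Roughly the same questions answered with the current FreqDist interface, over a toy token list:

from nltk.probability import FreqDist

tokens = ['the', 'cat', 'sat', 'on', 'the', 'mat']
freq_dist = FreqDist(tokens)

print(freq_dist['the'])       # how many times did "the" occur? -> 2
print(freq_dist.freq('the'))  # relative frequency of "the" -> 0.333...
print(freq_dist.N())          # how many word tokens were counted? -> 6
print(list(freq_dist))        # what word types were encountered?
print(freq_dist.max())        # the most common word -> 'the'

# Distribution of word lengths, counted the modern way.
length_dist = FreqDist(len(t) for t in tokens)
print(length_dist.most_common())  # [(3, 5), (2, 1)] for this toy list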
def default_tag(tagged_sents):
    tag_fd = FreqDist()
    for sent in tagged_sents:
        for word, postag in sent:
            tag_fd.inc(postag)
    return str(tag_fd.max())
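The same computation against the current API (FreqDist no longer has inc()), using the Brown news category as training data as in the NLTK book: the default tag is simply the single most frequent tag.

from nltk.corpus import brown
from nltk.probability import FreqDist

tag_fd = FreqDist(tag for (word, tag) in brown.tagged_words(categories='news'))
print(tag_fd.max())  # 'NN'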
from nltk.corpus import cess_esp
from nltk.probability import FreqDist

fdist = FreqDist(cess_esp.words(cess_esp.fileids()[0]))
print("The most frequent word is", fdist.max())
def classify(self, feats):
    counts = FreqDist()
    for classifier in self._classifiers:
        counts[classifier.classify(feats)] += 1
    return counts.max()
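Both voting classifiers in this collection reduce to the same idiom: count each classifier's vote in a FreqDist and return max(). A self-contained sketch with trivial stand-in classifiers:

from nltk.probability import FreqDist

# Stand-in "classifiers": plain functions that return a label.
classifiers = [
    lambda feats: 'pos',
    lambda feats: 'neg',
    lambda feats: 'pos',
]

def vote(feats):
    counts = FreqDist()
    for classifier in classifiers:
        counts[classifier(feats)] += 1
    return counts.max()

print(vote({'contains(good)': True}))  # 'pos' (2 votes to 1)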
#print "lines: ", len(lines) for line in lines: # print n, line.encode('utf-8') line_tokens = tokenizer.tokenize(line) #for token in line_tokens: # print token.encode('utf-8'), " | " #n = n + 1 text_array.append(line_tokens) #now try to match hyphenated lines with their #correpsonding beginning lines n = 0 for line in text_array: if len(line) > 0: if line[-1][-1] == '-': try: line[-1] = line[-1][:-1] + text_array[n + 1][0] text_array[n + 1] = text_array[n + 1][1:] except IndexError as e: print e n = n + 1 #now flatten the 2d array tokens = [item for sublist in text_array for item in sublist] tokens = delete_non_greek_tokens(tokens) for token in tokens: fdist.inc(token) print "most common: ", fdist.max().encode('utf-8') for item in fdist.keys(): print item.encode('utf-8'), fdist.freq(item)
    wc += 1

    # loading corpora/treebank/tagged with ChunkedCorpusReader produces None tags
    if not isinstance(tag, basestring):
        tag = str(tag)

    tag_counts.inc(tag)
    word_set.add(word)

############
## output ##
############

print '%d total words\n%d unique words\n%d tags\n' % (wc, len(word_set), len(tag_counts))

if args.sort == 'tag':
    sort_key = lambda (t, c): t
elif args.sort == 'count':
    sort_key = lambda (t, c): c
else:
    raise ValueError('%s is not a valid sort option' % args.sort)

countlen = max(len(str(tag_counts[tag_counts.max()])) + 2, 9)

# simple reSt table format
print ' '.join(['Tag'.center(taglen), 'Count'.center(countlen)])
print ' '.join(['=' * taglen, '=' * (countlen)])

for tag, count in sorted(tag_counts.items(), key=sort_key, reverse=args.reverse):
    print ' '.join([tag.ljust(taglen), str(count).rjust(countlen)])

print ' '.join(['=' * taglen, '=' * (countlen)])
cfd = ConditionalFreqDist()

### get the (token,tag) pair for each tagged sentence
i = 1
for sentence in brown.tagged_sents():
    for (token, tag) in sentence:
        if i < 6:
            print(token, tag)
        fd.inc(tag)
        cfd[token].inc(tag)
        i += 1

### the most frequent tag:
print fd.max()

wordbins = []
for token in cfd.conditions():
    wordbins.append((cfd[token].B(), token))

### sort tuples by number of unique tags
wordbins.sort(reverse=True)
print wordbins[0:3]

### masculine pronouns
male = ['he', 'his', 'him', 'himself']
female = ['she', 'hers', 'her', 'herself']
n_male, n_female = 0, 0
    if args.corpus in ['conll2000', 'switchboard'] and simplify_wsj_tag and args.simplify_tags:
        tag = simplify_wsj_tag(tag)

    wc += 1

    # loading corpora/treebank/tagged with ChunkedCorpusReader produces None tags
    if not isinstance(tag, basestring):
        tag = str(tag)

    tag_counts.inc(tag)
    word_set.add(word)

############
## output ##
############

print('%d total words\n%d unique words\n%d tags\n' % (wc, len(word_set), len(tag_counts)))

if args.sort == 'tag':
    sort_key = lambda tc: tc[0]
elif args.sort == 'count':
    sort_key = lambda tc: tc[1]
else:
    raise ValueError('%s is not a valid sort option' % args.sort)

countlen = max(len(str(tag_counts[tag_counts.max()])) + 2, 9)

# simple reSt table format
print(' '.join(['Tag'.center(taglen), 'Count'.center(countlen)]))
print(' '.join(['=' * taglen, '=' * (countlen)]))

for tag, count in sorted(tag_counts.items(), key=sort_key, reverse=args.reverse):
    print(' '.join([tag.ljust(taglen), str(count).rjust(countlen)]))

print(' '.join(['=' * taglen, '=' * (countlen)]))
# print "lines: ", len(lines) for line in lines: # print n, line.encode('utf-8') line_tokens = tokenizer.tokenize(line) # for token in line_tokens: # print token.encode('utf-8'), " | " # n = n + 1 text_array.append(line_tokens) # now try to match hyphenated lines with their # correpsonding beginning lines n = 0 for line in text_array: if len(line) > 0: if line[-1][-1] == "-": try: line[-1] = line[-1][:-1] + text_array[n + 1][0] text_array[n + 1] = text_array[n + 1][1:] except IndexError as e: print e n = n + 1 # now flatten the 2d array tokens = [item for sublist in text_array for item in sublist] tokens = delete_non_greek_tokens(tokens) for token in tokens: fdist.inc(token) print "most common: ", fdist.max().encode("utf-8") for item in fdist.keys(): print item.encode("utf-8"), fdist.freq(item)