Code Example #1
File: ch01.py  Project: gree2/hobby
def fun14():
    """counting other things"""
    # print [len(w) for w in text1]
    fdist1 = FreqDist([len(w) for w in text1])
    # print fdist1.keys()
    # print fdist1.items()
    # word length 3 => 50223
    print fdist1[3]
    print fdist1.max()
    # frequency 20%
    print fdist1.freq(3)
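The example above is Python 2 code written against the NLTK book setup (`from nltk.book import *`). A minimal Python 3 / NLTK 3 sketch of the same word-length count, assuming the NLTK book corpora are installed, might look like this:

from nltk.book import text1            # Moby Dick; requires the downloaded NLTK book data
from nltk.probability import FreqDist

# Count how often each word length occurs in the text.
fdist1 = FreqDist(len(w) for w in text1)

print(fdist1[3])       # number of 3-letter word tokens
print(fdist1.max())    # the most frequent word length
print(fdist1.freq(3))  # proportion of tokens that are 3 letters long (roughly 0.2)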
Code Example #2
File: decisiontree.py  Project: Joselin/nltk
    def binary_stump(feature_name, feature_value, labeled_featuresets):
        label = FreqDist([label for (featureset,label)
                          in labeled_featuresets]).max()

        # Find the best label for each value.
        pos_fdist = FreqDist()
        neg_fdist = FreqDist()
        for featureset, label in labeled_featuresets:
            if featureset.get(feature_name) == feature_value:
                pos_fdist.inc(label)
            else:
                neg_fdist.inc(label)

        decisions = {feature_value: DecisionTreeClassifier(pos_fdist.max())}
        default = DecisionTreeClassifier(neg_fdist.max())
        return DecisionTreeClassifier(label, feature_name, decisions, default)
Code Example #3
	def classify(self, feats):
		counts = FreqDist()
		
		for classifier in self._classifiers:
			counts.inc(classifier.classify(feats))
		
		return counts.max()
Code Example #4
File: taggers.py  Project: ANB2/nltk-trainer
	def choose_tag(self, tokens, index, history):
		tags = FreqDist()
		
		for tagger in self._taggers:
			tags.inc(tagger.choose_tag(tokens, index, history))
		
		return tags.max()
Code Example #5
	def choose_tag(self, tokens, index, history):
		word = tokens[index]
		fd = FreqDist()
		
		for synset in wordnet.synsets(word):
			fd.inc(synset.pos)
		
		return self.wordnet_tag_map.get(fd.max())
Code Example #6
File: multi.py  Project: ANB2/nltk-trainer
	def classify(self, feat):
		'''Return the label with the most agreement among classifiers'''
		label_freqs = FreqDist()
		
		for classifier in self._classifiers:
			label_freqs.inc(classifier.classify(feat))
		
		return label_freqs.max()
Code Example #7
File: shift.py  Project: Argonaught/playground
def shiftByAlpha(alphas, cipherText, common, reverse):
	key = []
	for alpha in alphas:
		fdist = FreqDist(alpha)
		if reverse:
			shift = (ord(common) - ord(fdist.max()))
		else:
			shift = (ord(fdist.max()) - ord(common))
		key.append(shift)
		print('shift ' + str(shift))

	keyLen = len(key)
	res = ''
	for i in range(0, len(cipherText)):
		c = chr((ord(cipherText[i]) + key[i%keyLen])%128)
		res += c

	print (res)
Code Example #8
File: tfidf.py  Project: adityajoshi5/RESLVE
 def __compute_tf__(self, term, doc_terms):
     """ Computes the normalized frequency of term t in document d, which 
     is the number of times t occurs in d divided by the maximum number 
     of times any term occurs in d: tf(t,d) = f(t,d) / max{f(w,d)} """
     fdist = FreqDist(term.lower() for term in doc_terms)
     max_freq = doc_terms.count(fdist.max())
     if max_freq==0:
         return 0.0
     return float(doc_terms.count(term)) / max_freq
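Note that the snippet above builds `fdist` from lowercased terms but then counts `fdist.max()` (a lowercased term) in the original, possibly mixed-case `doc_terms` list, so `max_freq` can understate the true maximum or even be 0. Since the FreqDist already holds the counts, a corrected sketch (using the hypothetical name `compute_tf`) could read:

from nltk.probability import FreqDist

def compute_tf(term, doc_terms):
    """tf(t, d) = f(t, d) / max{f(w, d)}, computed over lowercased terms."""
    fdist = FreqDist(w.lower() for w in doc_terms)
    if not fdist:
        return 0.0
    # fdist[fdist.max()] is the count of the most frequent term in the document.
    return fdist[term.lower()] / fdist[fdist.max()]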
Code Example #9
File: taggers.py  Project: ShunyuanZ/nltk3-cookbook
	def choose_tag(self, tokens, index, history):
		word = tokens[index]
		fd = FreqDist()
		
		for synset in wordnet.synsets(word):
			fd[synset.pos()] += 1
		
		if not fd: return None
		return self.wordnet_tag_map.get(fd.max())
Code Example #10
File: decisiontree.py  Project: prz3m/kind2anki
    def binary_stump(feature_name, feature_value, labeled_featuresets):
        label = FreqDist(label for (featureset, label) in labeled_featuresets).max()

        # Find the best label for each value.
        pos_fdist = FreqDist()
        neg_fdist = FreqDist()
        for featureset, label in labeled_featuresets:
            if featureset.get(feature_name) == feature_value:
                pos_fdist[label] += 1
            else:
                neg_fdist[label] += 1

        decisions = {}
        default = label
        # But hopefully we have observations!
        if pos_fdist.N() > 0:
            decisions = {feature_value: DecisionTreeClassifier(pos_fdist.max())}
        if neg_fdist.N() > 0:
            default = DecisionTreeClassifier(neg_fdist.max())

        return DecisionTreeClassifier(label, feature_name, decisions, default)
Code Example #11
 def choose_tag(self, tokens, index, history):
     word = tokens[index]
     if word is None:
         return None
     fd = FreqDist()
     
     for synset in wordnet.synsets(word):
         fd[synset.pos] += 1
     try:
         return self.wordnet_tag_map.get(fd.max())
     except:  # in case fd is empty
         return None
Code Example #12
File: q2_1.py  Project: atiassa/recommend-2011
 def worst_errors_many_wrong_decisions(self, k, feature_extractor):
     worst_errors = []
     features = []
     wrongDocs = self.error_prediction_docs(self.maintest, self.testClassify)
     for doc in wrongDocs:
         feature_dic = feature_extractor(movie_reviews.words(fileids=[doc]))
         features = features + feature_dic.keys()
     fd = FreqDist(feature.lower() for feature in features)
     for i in range(1, k+1):
         x = fd.max()
         fd.pop(x)
         worst_errors.append(x)
     return worst_errors
Code Example #13
File: QA.py  Project: danigarabato/qa
    def get_best_answers(self, passage_list, q):
        logger = logging.getLogger("qa_logger")
        logger.info("%s:\tAnswer Processing", q.id_q)

        empty = passage_list == []

        logger.info("%s:\t\tAnswer Extraction", q.id_q)

        answer_list = []
        for passage in passage_list:
            a = passage.find_answer(q)
            if a.is_successful():
                answer_list.append(a)

        if not answer_list:
            return ([], empty)

        logger.info("%s:\t\tAnswer Filtering", q.id_q)

        # Obtain answer frequency
        fd = FreqDist(answer_list)

        # Normalize frequencies
        normalize = fd.freq(fd.max())

        # Modify scores by frequency
        for answer in answer_list:
            answer.score = int(answer.score * (fd.freq(answer) / normalize))

        # Sort answers by score
        answer_list.sort(key=lambda x: x.score, reverse=True)

        # Filter bad answers
        try:
            threshold = int(MyConfig.get("answer_filtering", "threshold"))
        except:
            logger = logging.getLogger("qa_logger")
            logger.error("answer quality threshold not found")
            threshold = 50

        answer_list = filter(lambda x: x.score > threshold, answer_list)

        final_answers = []
        for a in answer_list:
            if a not in final_answers:
                final_answers.append(a)
            if len(final_answers) == 3:
                break

        return (final_answers, empty)
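The frequency-based rescoring above divides `fd.freq(answer)` by `fd.freq(fd.max())`, which reduces to count(answer) / count(most frequent answer): the most frequent answer keeps its original score and the rest are scaled down proportionally. A small illustration with hypothetical answer strings:

from nltk.probability import FreqDist

fd = FreqDist(["1492", "1492", "1492", "1500"])
normalize = fd.freq(fd.max())          # 0.75, relative frequency of "1492"
print(fd.freq("1492") / normalize)     # 1.0   -> score kept as-is
print(fd.freq("1500") / normalize)     # 0.333 -> score scaled to a third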
Code Example #14
File: xor.py  Project: Argonaught/playground
def xorByAlpha(alphas, cipherText, common):
	key = []
	for alpha in alphas:
		fdist = FreqDist(alpha)
		kxor = (ord(fdist.max()) ^ ord(common))
		key.append(kxor)

	keyLen = len(key)
	res = ''
	for i in range(0, len(cipherText)):
		c = chr((ord(cipherText[i]) ^  key[i%keyLen]))
		res += c

	print (res)
Code Example #15
    def binary_stump(feature_name, feature_value, labeled_featuresets):
        label = FreqDist(label
                         for (featureset, label) in labeled_featuresets).max()

        # Find the best label for each value.
        pos_fdist = FreqDist()
        neg_fdist = FreqDist()
        for featureset, label in labeled_featuresets:
            if featureset.get(feature_name) == feature_value:
                pos_fdist[label] += 1
            else:
                neg_fdist[label] += 1

        decisions = {}
        default = label
        # But hopefully we have observations!
        if pos_fdist.N() > 0:
            decisions = {
                feature_value: DecisionTreeClassifier(pos_fdist.max())
            }
        if neg_fdist.N() > 0:
            default = DecisionTreeClassifier(neg_fdist.max())

        return DecisionTreeClassifier(label, feature_name, decisions, default)
Code Example #16
File: answer.py  Project: nrvnujd/qa
    def _entity_ranking(self, entities):
        if len(entities) == 0:
            return "", "", int(0)

        # Obtain frequency of entities
        entities_freq = FreqDist(entities)

        # Our answer is the sample with the greatest number of outcomes
        exact = entities_freq.max()

        # Our window is empty because this algorithm generates exact answers
        window = ""

        # Our score is the entity frequency
        score = int(entities_freq.freq(exact) * 1000)

        return exact, window, score
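Because `entities_freq.freq(exact)` is a relative frequency, the score falls between 0 and 1000: 1000 when every extracted entity agrees, lower when the candidates disagree. For example, with a hypothetical entity list:

from nltk.probability import FreqDist

entities_freq = FreqDist(["Paris", "Paris", "London"])
exact = entities_freq.max()                    # "Paris"
score = int(entities_freq.freq(exact) * 1000)  # int(0.666... * 1000) == 666
print(exact, score)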
Code Example #17
File: answer.py  Project: danigarabato/qa
    def _entity_ranking(self, entities):
        if len(entities) == 0:
            return "", "", int(0)

        # Obtain frequency of entities
        entities_freq = FreqDist(entities)

        # Our answer is the sample with the greatest number of outcomes
        exact = entities_freq.max()

        # Our window is empty because this algorithm generates exact answers
        window = ""

        # Our score is the entity frequency
        score = int(entities_freq.freq(exact) * 1000)

        return exact, window, score
Code Example #18
File: tagger.py  Project: 0623forbidden/nltk4russian
    def choose_tag(self, tokens, index, history):
        context = self.context(tokens, index, history)

        s = self._morph.parse(tokens[index])
        tags = [unicode(x.tag).replace(u' ', u',') for x in s]
        if len(tags) == 0:
            return None
        if (len(tags) == 1) or not (context in self._contexts_to_tags.keys()):
            return tags[0]

        tagsconts = FreqDist()
        for tag in tags:
            #print 'TAG: ', tag
            #print tokens[index]
            tagsconts[tag] = self._contexts_to_tags[context].get(tag, 0)

            #print 'PROB: | ', context, tagsconts[tag]
        best_tag = tagsconts.max()
        if tagsconts[best_tag] == 0:
            return tags[0]
        return best_tag
Code Example #19
File: MarkovChain.py  Project: chrispenick/pynlg
    def next(self, s, method = MOST_LIKELY):
        # Pick a transition leaving state s and return a state that would
        # likely follow.  The next state is chosen according to the method
        # specified.  The default is to choose and return the most likely
        # transition state.

        # determine all states adjacent to s
        transitions = self._adjacentVertices[s]
        freqDist = FreqDist()

        # determine the weights of the edges between state s and all adjacent states
        for state in transitions:
            freqDist.inc(state)

        if method == MarkovChain.MOST_LIKELY:
            return freqDist.max()

        elif method == MarkovChain.LEAST_LIKELY:
            # NLTK provides no built-in method to return the minimum of a
            # frequency distribution so for now, we get a list of samples
            # sorted in decreasing order and grab the last one.

            return freqDist.sorted_samples()[-1]

        else:
            # choose a real number between 0 and 1
            x = uniform(0,1)
            
            # choose next state based on weights of the edges.  Randomness plays a part here.
            for i in range(len(transitions)):
                probability = freqDist.freq(transitions[i])
             
                if x < probability:
                    return transitions[i]

                x = x - probability

            exc = "Error in MarkovChain.next().  Did not find next state.\n"
            raise exc
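`FreqDist.sorted_samples()` (like `inc()`) belongs to the pre-3.0 NLTK API; NLTK 3 removed it in favour of `most_common()` and plain index assignment. A sketch of the LEAST_LIKELY branch against the NLTK 3 API, assuming `freqDist` is non-empty:

# most_common() returns (sample, count) pairs in decreasing order of count,
# so the last entry is the least frequent adjacent state.
least_likely_state = freqDist.most_common()[-1][0]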
Code Example #20
class WordNetTagger(SequentialBackoffTagger):
    """
        Class implementation of the wordnet tagger
    """
    def __init__(self, *args, **kwargs):
        SequentialBackoffTagger.__init__(self, *args, **kwargs)
        self.wordnet_tag_map = {
            'n': 'NN',
            's': 'JJ',
            'a': 'JJ',
            'r': 'RB',
            'v': 'VB'
        }
        self.fd = FreqDist(treebank.words())

    def choose_tag(self, tokens, index, history):
        """
            Chooses a POS tag based on the word's WordNet synsets
        """

        word = tokens[index]
        for synset in wordnet.synsets(word):
            self.fd[synset.pos()] += 1
        return self.wordnet_tag_map.get(self.fd.max())
Code Example #21
print("2.", len(cess_esp.words()))
# 3
print("3.", len(cess_esp.sents()))
# 4
from nltk.probability import FreqDist

first_file = cess_esp.fileids()[0]
cess_freq0 = FreqDist(cess_esp.words(first_file))
print("4.", cess_freq0.most_common(20))
# 5
print("5.", [w for w, k in cess_freq0.most_common()])
# 6
print("6.", [w for w, k in cess_freq0.items() if len(w) > 7 and k > 2])
# 7
print("7.", [k for w, k in cess_freq0.most_common()])
print("7b. Freq de aparición de la preposición a", cess_freq0.get("a", 0))
# 8
print("8. No de palabras que aparecen una sola vez:",
      len([w for w, k in cess_freq0.items() if k == 1]))
# 9
print("9. La palabra más frecuente es", cess_freq0.max())
# 10
from nltk.corpus import PlaintextCorpusReader

mycorpus = PlaintextCorpusReader("../res/", ".*")
# 11
print("11.")
for doc in mycorpus.fileids():
    print(doc, len(mycorpus.words(doc)), len(set(mycorpus.words(doc))),
          len(mycorpus.sents(doc)))
Code Example #22
	freq_dist.inc(token['TEXT'])

# How many times did "the" occur?
freq_dist.count('the')

# What was the frequency of the word "the"?
freq_dist.freq('the')

# How many word tokens were counted?
freq_dist.N()

# What word types were encountered?
freq_dist.samples()

# What was the most common word?
freq_dist.max()

# What is the distribution of word lengths in a corpus?
freq_dist = FreqDist()
for token in corpus['SUBTOKENS']:
	freq_dist.inc(len(token['TEXT']))

# Plot the results.
wordlens = freq_dist.samples()

# Sort the list
wordlens.sort()

# create a tuple with each frequency count and its
# respective distribution
# to visualize it, run the command `print points`
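The fragment above uses the early NLTK token/FreqDist API. In NLTK 3 the same questions are answered with indexing and the surviving methods; a rough mapping, assuming `freq_dist` is an NLTK 3 FreqDist:

freq_dist['the']         # how many times "the" occurred (was freq_dist.count('the'))
freq_dist.freq('the')    # relative frequency of "the"
freq_dist.N()            # total number of word tokens counted
list(freq_dist)          # the word types encountered (was freq_dist.samples())
freq_dist.max()          # the most common word
freq_dist['the'] += 1    # increment a count (was freq_dist.inc('the'))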
Code Example #23
def default_tag(tagged_sents):
    tag_fd = FreqDist()
    for sent in tagged_sents:
        for word, postag in sent:
            tag_fd.inc(postag)
    return str(tag_fd.max())
Code Example #24
from nltk.corpus import cess_esp
from nltk.probability import FreqDist

fdist = FreqDist(cess_esp.words(cess_esp.fileids()[0]))
print("La palabra mas frecuente es ", fdist.max())
Code Example #25
 def classify(self, feats):
     counts = FreqDist()
     for classifier in self._classifiers:
         counts[classifier.classify(feats)] += 1
     return counts.max()
Code Example #26
    #print "lines: ", len(lines)
    for line in lines:
        #       print n, line.encode('utf-8')
        line_tokens = tokenizer.tokenize(line)
        #for token in line_tokens:
        #       print token.encode('utf-8'), " | "
        #n = n + 1
        text_array.append(line_tokens)

#now try to match hyphenated lines with their
#corresponding beginning lines
n = 0
for line in text_array:
    if len(line) > 0:
        if line[-1][-1] == '-':
            try:
                line[-1] = line[-1][:-1] + text_array[n + 1][0]
                text_array[n + 1] = text_array[n + 1][1:]
            except IndexError as e:
                print e
    n = n + 1
#now flatten the 2d array
tokens = [item for sublist in text_array for item in sublist]
tokens = delete_non_greek_tokens(tokens)
for token in tokens:
    fdist.inc(token)

print "most common: ", fdist.max().encode('utf-8')
for item in fdist.keys():
    print item.encode('utf-8'), fdist.freq(item)
Code Example #27
    wc += 1
    # loading corpora/treebank/tagged with ChunkedCorpusReader produces None tags
    if not isinstance(tag, basestring): tag = str(tag)
    tag_counts.inc(tag)
    word_set.add(word)

############
## output ##
############

print '%d total words\n%d unique words\n%d tags\n' % (wc, len(word_set),
                                                      len(tag_counts))

if args.sort == 'tag':
    sort_key = lambda (t, c): t
elif args.sort == 'count':
    sort_key = lambda (t, c): c
else:
    raise ValueError('%s is not a valid sort option' % args.sort)

countlen = max(len(str(tag_counts[tag_counts.max()])) + 2, 9)
# simple reSt table format
print '  '.join(['Tag'.center(taglen), 'Count'.center(countlen)])
print '  '.join(['=' * taglen, '=' * (countlen)])

for tag, count in sorted(tag_counts.items(),
                         key=sort_key,
                         reverse=args.reverse):
    print '  '.join([tag.ljust(taglen), str(count).rjust(countlen)])

print '  '.join(['=' * taglen, '=' * (countlen)])
Code Example #28
File: corpusExplore.py  Project: Migisa/Misc
cfd = ConditionalFreqDist()


### get the (token,tag) pair for each tagged sentence
i = 1

for sentence in brown.tagged_sents():
	for (token, tag) in sentence:
		if i < 6:		
			print(token, tag)
		fd.inc(tag)
		cfd[token].inc(tag)
		i += 1

### the most frequent tag:
print fd.max()

wordbins = []
for token in cfd.conditions():
	wordbins.append((cfd[token].B(), token))


### sort tuples by number of unique tags
wordbins.sort(reverse=True)
print wordbins[0:3]


### masculine pronouns
male = ['he', 'his', 'him', 'himself']
female = ['she', 'hers', 'her', 'herself']
n_male, n_female = 0, 0 
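This example relies on the NLTK 2 `inc()` mutator, which NLTK 3 removed. A minimal NLTK 3 sketch of the same counts over the Brown corpus (most frequent tag, then the tokens with the most distinct tags):

from nltk.corpus import brown
from nltk.probability import FreqDist, ConditionalFreqDist

tagged_words = [(token, tag) for sent in brown.tagged_sents() for (token, tag) in sent]

fd = FreqDist(tag for (token, tag) in tagged_words)   # tag frequencies
cfd = ConditionalFreqDist(tagged_words)               # tag distribution per token

print(fd.max())                                       # the most frequent tag
wordbins = sorted(((cfd[token].B(), token) for token in cfd.conditions()), reverse=True)
print(wordbins[:3])                                   # tokens with the most distinct tags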
Code Example #29
	if args.corpus in ['conll2000', 'switchboard'] and simplify_wsj_tag and args.simplify_tags:
		tag = simplify_wsj_tag(tag)
	
	wc += 1
	# loading corpora/treebank/tagged with ChunkedCorpusReader produces None tags
	if not isinstance(tag, basestring): tag = str(tag)
	tag_counts.inc(tag)
	word_set.add(word)

############
## output ##
############

print('%d total words\n%d unique words\n%d tags\n' % (wc, len(word_set), len(tag_counts)))

if args.sort == 'tag':
	sort_key = lambda tc: tc[0]
elif args.sort == 'count':
	sort_key = lambda tc: tc[1]
else:
	raise ValueError('%s is not a valid sort option' % args.sort)

countlen = max(len(str(tag_counts[tag_counts.max()])) + 2, 9)
# simple reSt table format
print('  '.join(['Tag'.center(taglen), 'Count'.center(countlen)]))
print('  '.join(['='*taglen, '='*(countlen)]))

for tag, count in sorted(tag_counts.items(), key=sort_key, reverse=args.reverse):
	print('  '.join([tag.ljust(taglen), str(count).rjust(countlen)]))

print('  '.join(['='*taglen, '='*(countlen)]))
Code Example #30
 def choose_tag(self, tokens, index, history):
     word = tokens[index]
     fd = FreqDist()
     for synset in wordnet.synsets(word):
         fd.inc(synset.pos)
     return self.wordnet_tag_map.get(fd.max())
Code Example #31
File: brown_tagger_stub.py  Project: wbkdef/NLP
def default_tag(tagged_sents):
    tag_fd = FreqDist()
    for sent in tagged_sents:
        for word, postag in sent:
            tag_fd.inc(postag)
    return str(tag_fd.max())
Code Example #32
    # print "lines: ", len(lines)
    for line in lines:
        #       print n, line.encode('utf-8')
        line_tokens = tokenizer.tokenize(line)
        # for token in line_tokens:
        #       print token.encode('utf-8'), " | "
        # n = n + 1
        text_array.append(line_tokens)

    # now try to match hyphenated lines with their
    # corresponding beginning lines
n = 0
for line in text_array:
    if len(line) > 0:
        if line[-1][-1] == "-":
            try:
                line[-1] = line[-1][:-1] + text_array[n + 1][0]
                text_array[n + 1] = text_array[n + 1][1:]
            except IndexError as e:
                print e
    n = n + 1
    # now flatten the 2d array
tokens = [item for sublist in text_array for item in sublist]
tokens = delete_non_greek_tokens(tokens)
for token in tokens:
    fdist.inc(token)

print "most common: ", fdist.max().encode("utf-8")
for item in fdist.keys():
    print item.encode("utf-8"), fdist.freq(item)