def repeated_item_probability(self, item, category):
    """Probability of `item` belonging to `category`, weighting repeated features.

    Unlike item_probability(), each feature's weighted probability is
    raised to the power of its occurrence count, so a feature that
    appears N times contributes N factors.
    """
    feature_counts = Counter(get_words(item))
    probability = 1.
    # NOTE (carried over from the original): exponentiating by the count
    # only makes sense when comparing numerous vs. single occurrences of
    # a feature.
    for feature, occurrences in feature_counts.iteritems():
        probability *= self.weighted_feature_probability(feature, category) ** occurrences
    return probability
def append_document(self, doc, wordlist=None):
    """Add a column to the occurrence matrix for one document's word counts.

    Args:
        doc: a string (tokenized with get_words into a Counter), a
            Mapping of word -> count, or a sequence paired element-wise
            with `wordlist`.
        wordlist: words paired with a non-Mapping, non-string `doc`;
            ignored otherwise.
    """
    if isinstance(doc, basestring):
        doc = Counter(get_words(doc))
    # FIX: the original evaluated len(wordlist) unconditionally, raising
    # TypeError whenever a non-Mapping sequence was passed with the
    # default wordlist=None; guard the None case explicitly.
    if not isinstance(doc, Mapping) and wordlist is not None and len(doc) <= len(wordlist):
        # NOTE(review): this pairs doc elements as keys with words as
        # values, yet doc is iterated below as (word, count) pairs -- the
        # zip order looks swapped; confirm against callers before changing.
        doc = dict(zip(doc, wordlist[:len(doc)]))
    iptr = self.matrix.indptr.tolist()
    num_docs = self.matrix.shape[1]
    # Append an empty column by repeating the final index pointer; the
    # new column is filled in below (per-item assignment on a CSR matrix
    # is slow, but preserves the original behavior).
    self.matrix = csr_matrix(
        (self.matrix.data, self.matrix.indices, iptr + [iptr[-1]]),
        shape=(len(self.words), num_docs + 1), dtype='int64')
    for word, count in doc.iteritems():
        self.matrix[self.word_index(word), num_docs] = count
def append_document(self, doc, wordlist=None):
    """Append one document's word counts as a new column of the occurrence matrix.

    Args:
        doc: a string (tokenized via get_words), a word -> count
            Mapping, or a sequence to be paired with `wordlist`.
        wordlist: words to pair with a non-Mapping, non-string `doc`;
            unused otherwise.
    """
    if isinstance(doc, basestring):
        doc = Counter(get_words(doc))
    # FIX: guard wordlist=None -- the original called len(wordlist)
    # unconditionally and raised TypeError for sequence inputs without
    # an explicit wordlist.
    if not isinstance(doc, Mapping) and wordlist is not None and len(doc) <= len(wordlist):
        # NOTE(review): keys here are doc elements and values are words,
        # but the loop below unpacks (word, count); the zip arguments
        # look reversed -- verify against callers before changing.
        doc = dict(zip(doc, wordlist[:len(doc)]))
    iptr = self.matrix.indptr.tolist()
    num_docs = self.matrix.shape[1]
    # Grow the CSR matrix by one empty column (duplicate the last indptr
    # entry), then populate that column entry by entry.
    self.matrix = csr_matrix(
        (self.matrix.data, self.matrix.indices, iptr + [iptr[-1]]),
        shape=(len(self.words), num_docs + 1), dtype='int64')
    for word, count in doc.iteritems():
        self.matrix[self.word_index(word), num_docs] = count
#!/usr/bin/env python
"""Scratch n-gram counting experiments plus a naive Bayes Classifier skeleton (Python 2)."""
from collections import defaultdict, Counter

# count[0] = 1-gram counts
# count[1] = 2-gram counts
count = defaultdict(int)
# FIX: the original dict literal repeated the 'perch' key; Python keeps
# only the last occurrence, so the duplicate was dead code.
fish_count = Counter({'carp': 10, 'eel': 1, 'perch': 3, 'trout': 1,
                      'salmon': 1, 'whitefish': 2})
fish = list(fish_count)
# FIX: count.update(fish) treated each word string as a (key, value)
# pair and raised ValueError for any word not exactly 2 characters long.
# Presumably the intent was to tally one occurrence per word -- TODO confirm.
for word in fish:
    count[word] += 1

from pug.nlp.util import get_words

s = ' '.join(fish)
# NOTE(review): rebinds `count` from the defaultdict above to a
# one-element list of Counters (matching the "count[0] = 1-gram counts"
# comment at the top).
count = [Counter(get_words(s))]


class Classifier:
    """Minimal naive Bayes classifier skeleton."""

    def __init__(self, get_features=get_words, s='', path=''):
        """Initialize empty feature/category tallies.

        Args:
            get_features: callable that tokenizes a document into features.
            s, path: accepted but unused here -- presumably hooks for
                loading training data; TODO confirm.
        """
        self.get_features = get_features
        # TODO: use a defaultdict everywhere a dict is used
        # FIX: feature_count maps feature -> Counter of categories (see
        # increment_feature); the original flat Counter() could not hold
        # that and made increment_feature raise TypeError.
        self.feature_count = defaultdict(Counter)
        self.category_count = defaultdict(Counter)  # dict of Counters, count of documents binned in each category
        self.num_items = 0

    def increment_feature(self, feature, category):
        "Increase the count for a feature<->category association"
        # FIX: the original `self.feature_count[feature] += Counter((feature,))`
        # raised TypeError (int += Counter has no __radd__ path) and
        # ignored `category`; this follows the commented-out intent of a
        # per-category tally, consistent with the docstring.
        self.feature_count[feature][category] += 1
# FIX: the original dict literal listed 'perch' twice; Python silently
# keeps only the last occurrence, so the duplicate key was removed.
fish_count = Counter({
    'carp': 10,
    'eel': 1,
    'perch': 3,
    'trout': 1,
    'salmon': 1,
    'whitefish': 2,
})
fish = list(fish_count)
# FIX: count.update(fish) interpreted each word as a (key, value) pair
# and raised ValueError for words not exactly 2 characters long.
# Presumably the intent was one tally per word -- TODO confirm.
for word in fish:
    count[word] += 1

from pug.nlp.util import get_words

s = ' '.join(fish)
# NOTE(review): rebinds `count` to a one-element list of Counters
# (1-gram counts).
count = [Counter(get_words(s))]


class Classifier:
    """Minimal naive Bayes classifier skeleton."""

    def __init__(self, get_features=get_words, s='', path=''):
        """Initialize empty feature/category tallies.

        Args:
            get_features: callable tokenizing a document into features.
            s, path: accepted but unused here -- presumably hooks for
                loading training data; TODO confirm.
        """
        self.get_features = get_features
        # TODO: use a defaultdict everywhere a dict is used
        # FIX: feature_count maps feature -> Counter of categories (see
        # increment_feature); the original flat Counter() made
        # increment_feature raise TypeError.
        self.feature_count = defaultdict(Counter)
        self.category_count = defaultdict(
            Counter
        )  # dict of Counters, count of documents binned in each category
        self.num_items = 0

    def increment_feature(self, feature, category):
        "Increase the count for a feature<->category association"
        # FIX: `self.feature_count[feature] += Counter((feature,))`
        # raised TypeError (int += Counter) and ignored `category`;
        # tally per category instead, matching the docstring.
        self.feature_count[feature][category] += 1
def item_probability(self, item, category):
    """Probability of `item` belonging to `category`, counting each feature once.

    Multiplies the weighted probabilities of the item's distinct
    features; duplicate occurrences are collapsed via a set.
    """
    probability = 1.
    for feature in set(get_words(item)):
        probability *= self.weighted_feature_probability(feature, category)
    return probability