def repeated_item_probability(self, item, category):
     """Return the product of per-feature probabilities, counting repeats.

     Every distinct feature ``f`` that ``get_words`` extracts from ``item``
     contributes ``self.weighted_feature_probability(f, category) ** c``,
     where ``c`` is the number of times ``f`` occurs in ``item``.  When no
     features are found the result is ``1.0`` (the empty product).

     NOTE(review): raising each probability to its occurrence count only
     seems meaningful when comparing numerous versus single occurrences of
     a feature -- confirm this is the intended model.
     """
     item_feature_count = Counter(get_words(item))
     p = 1.
     # items() exists on both Python 2 and 3; iteritems() is Python 2 only.
     for f, c in item_feature_count.items():
         p *= pow(self.weighted_feature_probability(f, category), c)
     return p
Exemple #2
0
 def repeated_item_probability(self, item, category):
     """Return the product of per-feature probabilities, counting repeats.

     Every distinct feature ``f`` that ``get_words`` extracts from ``item``
     contributes ``self.weighted_feature_probability(f, category) ** c``,
     where ``c`` is the number of times ``f`` occurs in ``item``.  When no
     features are found the result is ``1.0`` (the empty product).

     NOTE(review): raising each probability to its occurrence count only
     seems meaningful when comparing numerous versus single occurrences of
     a feature -- confirm this is the intended model.
     """
     item_feature_count = Counter(get_words(item))
     p = 1.
     # items() exists on both Python 2 and 3; iteritems() is Python 2 only.
     for f, c in item_feature_count.items():
         p *= pow(self.weighted_feature_probability(f, category), c)
     return p
Exemple #3
0
 def append_document(self, doc, wordlist=None):
     """Add a column to the occurrence matrix holding one document's word counts.

     Args:
         doc: The document, given as one of
             * a string, which is tokenized with ``get_words`` and counted;
             * a mapping of word -> count;
             * a sequence of counts aligned position-by-position with
               ``wordlist``.
         wordlist: Words labelling the entries of a sequence ``doc``.  Must
             be at least as long as ``doc`` in that case; unused otherwise.

     Raises:
         TypeError: ``doc`` is a sequence of counts but ``wordlist`` is None.
         ValueError: ``doc`` is a sequence longer than ``wordlist``.
     """
     try:
         string_types = basestring  # Python 2: covers both str and unicode
     except NameError:
         string_types = str  # Python 3
     if isinstance(doc, string_types):
         doc = Counter(get_words(doc))
     if not isinstance(doc, Mapping):
         # Validate before touching self.matrix so a bad call leaves it intact.
         if wordlist is None:
             raise TypeError('wordlist is required when doc is a sequence of counts')
         if len(doc) > len(wordlist):
             raise ValueError('doc has more entries than wordlist has words')
         # Key by word, value by count -- the order the loop below unpacks.
         # zip() stops at the shorter input, so surplus words are ignored.
         doc = dict(zip(wordlist, doc))
     iptr = self.matrix.indptr.tolist()
     num_docs = self.matrix.shape[1]
     # NOTE(review): in CSR layout indptr has one entry per row plus one, so
     # appending an entry lengthens the row axis while ``shape`` grows the
     # column axis.  This pattern matches a CSC matrix -- TODO confirm the
     # layout of self.matrix.
     self.matrix = csr_matrix((self.matrix.data, self.matrix.indices, iptr + [iptr[-1]]),
                              shape=(len(self.words), num_docs + 1), dtype='int64')
     # items() exists on both Python 2 and 3; iteritems() is Python 2 only.
     for word, count in doc.items():
         self.matrix[self.word_index(word), num_docs] = count
Exemple #4
0
 def append_document(self, doc, wordlist=None):
     """Add a column to the occurrence matrix holding one document's word counts.

     Args:
         doc: The document, given as one of
             * a string, which is tokenized with ``get_words`` and counted;
             * a mapping of word -> count;
             * a sequence of counts aligned position-by-position with
               ``wordlist``.
         wordlist: Words labelling the entries of a sequence ``doc``.  Must
             be at least as long as ``doc`` in that case; unused otherwise.

     Raises:
         TypeError: ``doc`` is a sequence of counts but ``wordlist`` is None.
         ValueError: ``doc`` is a sequence longer than ``wordlist``.
     """
     try:
         string_types = basestring  # Python 2: covers both str and unicode
     except NameError:
         string_types = str  # Python 3
     if isinstance(doc, string_types):
         doc = Counter(get_words(doc))
     if not isinstance(doc, Mapping):
         # Validate before touching self.matrix so a bad call leaves it intact.
         if wordlist is None:
             raise TypeError('wordlist is required when doc is a sequence of counts')
         if len(doc) > len(wordlist):
             raise ValueError('doc has more entries than wordlist has words')
         # Key by word, value by count -- the order the loop below unpacks.
         # zip() stops at the shorter input, so surplus words are ignored.
         doc = dict(zip(wordlist, doc))
     iptr = self.matrix.indptr.tolist()
     num_docs = self.matrix.shape[1]
     # NOTE(review): in CSR layout indptr has one entry per row plus one, so
     # appending an entry lengthens the row axis while ``shape`` grows the
     # column axis.  This pattern matches a CSC matrix -- TODO confirm the
     # layout of self.matrix.
     self.matrix = csr_matrix(
         (self.matrix.data, self.matrix.indices, iptr + [iptr[-1]]),
         shape=(len(self.words), num_docs + 1),
         dtype='int64')
     # items() exists on both Python 2 and 3; iteritems() is Python 2 only.
     for word, count in doc.items():
         self.matrix[self.word_index(word), num_docs] = count
#!/usr/bin/env python

from collections import defaultdict, Counter

# count[0] = 1-gram counts
# count[1] = 2-gram counts
# NOTE: at this point `count` is still a flat name -> tally mapping; it is
# rebound to a list of Counters further down in this script.
count = defaultdict(int)
# Sample data: fish name -> count.  Each key appears once; a repeated key in
# a dict literal silently overrides the earlier entry.
fish_count = Counter({
    'carp': 10,
    'eel': 1,
    'perch': 3,
    'trout': 1,
    'salmon': 1,
    'whitefish': 2,
})
fish = list(fish_count)
# dict.update() expects a mapping or key/value pairs; handing it bare names
# raises ValueError.  Tally the names first, then merge the tallies in.
count.update(Counter(fish))


from pug.nlp.util import get_words

# Glue the fish names into one space-separated string for get_words.
s = str.join(' ', fish)
# Rebind `count` to a one-element list: the Counter built from get_words(s).
count = [
    Counter(get_words(s)),
]

class Classifier:
    """Keeps per-feature, per-category tallies for classifying items."""

    def __init__(self, get_features=get_words, s='', path=''):
        """Set up empty tallies.

        Args:
            get_features: Callable stored as ``self.get_features``; defaults
                to ``get_words``.
            s: Not used by the code shown here.
            path: Not used by the code shown here.
        """
        self.get_features = get_features
        # TODO: use a defaultdict everywhere a dict is used
        # feature -> Counter mapping category -> number of associations
        self.feature_count = defaultdict(Counter)
        self.category_count = defaultdict(Counter)  # dict of Counters, count of documents binned in each category
        self.num_items = 0

    def increment_feature(self, feature, category):
        """Increase the count for a feature<->category association."""
        # Missing features and categories start at zero thanks to
        # defaultdict(Counter), so no setdefault() calls are needed.
        self.feature_count[feature][category] += 1
Exemple #6
0
# Sample data: fish name -> count.  Each key appears once; a repeated key in
# a dict literal silently overrides the earlier entry.
fish_count = Counter({
    'carp': 10,
    'eel': 1,
    'perch': 3,
    'trout': 1,
    'salmon': 1,
    'whitefish': 2,
})
fish = list(fish_count)
# dict.update() expects a mapping or key/value pairs; handing it bare names
# raises ValueError.  Tally the names first, then merge the tallies in.
# Assumes `count` is a dict-like tally defined before this point -- TODO confirm.
count.update(Counter(fish))

from pug.nlp.util import get_words

# Glue the fish names into one space-separated string for get_words.
s = str.join(' ', fish)
# Rebind `count` to a one-element list: the Counter built from get_words(s).
count = [
    Counter(get_words(s)),
]


class Classifier:
    """Keeps per-feature, per-category tallies for classifying items."""

    def __init__(self, get_features=get_words, s='', path=''):
        """Set up empty tallies.

        Args:
            get_features: Callable stored as ``self.get_features``; defaults
                to ``get_words``.
            s: Not used by the code shown here.
            path: Not used by the code shown here.
        """
        self.get_features = get_features
        # TODO: use a defaultdict everywhere a dict is used
        # feature -> Counter mapping category -> number of associations
        self.feature_count = defaultdict(Counter)
        self.category_count = defaultdict(
            Counter
        )  # dict of Counters, count of documents binned in each category
        self.num_items = 0

    def increment_feature(self, feature, category):
        """Increase the count for a feature<->category association."""
        # Missing features and categories start at zero thanks to
        # defaultdict(Counter); adding a Counter to the old integer slot
        # raised TypeError and ignored `category`.
        self.feature_count[feature][category] += 1
 def item_probability(self, item, category):
     """Multiply the weighted probabilities of the item's distinct features.

     The output of ``get_words(item)`` is collapsed into a set, so every
     feature contributes ``self.weighted_feature_probability(feature,
     category)`` exactly once.  Returns ``1.0`` when no features are found.
     """
     product = 1.
     distinct_features = set(get_words(item))
     for feature in distinct_features:
         product *= self.weighted_feature_probability(feature, category)
     return product
Exemple #8
0
 def item_probability(self, item, category):
     """Multiply the weighted probabilities of the item's distinct features.

     The output of ``get_words(item)`` is collapsed into a set, so every
     feature contributes ``self.weighted_feature_probability(feature,
     category)`` exactly once.  Returns ``1.0`` when no features are found.
     """
     product = 1.
     distinct_features = set(get_words(item))
     for feature in distinct_features:
         product *= self.weighted_feature_probability(feature, category)
     return product