from collections import defaultdict

import numpy as np
from nose.tools import eq_

import clf_base
import constants
import preproc


def test_d2_1_featvec():
    label = '1980s'
    fv = clf_base.make_feature_vector({'test': 1, 'case': 2}, label)
    eq_(len(fv), 3)
    eq_(fv[(label, 'test')], 1)
    eq_(fv[(label, 'case')], 2)
    eq_(fv[(label, constants.OFFSET)], 1)
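
# For reference, a minimal sketch of what make_feature_vector is expected to do,
# reconstructed from the assertions in the test above. The real implementation
# lives in clf_base and may differ in details; this version is illustrative only.
def _make_feature_vector_sketch(base_features, label):
    # Pair every base feature with the label, and add an offset feature
    # that fires exactly once per document.
    fv = {(label, word): count for word, count in base_features.items()}
    fv[(label, constants.OFFSET)] = 1
    return fv
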
def estimate_nb(x, y, smoothing):
    """
    Estimate a naive bayes model

    :param x: list of dictionaries of base feature counts
    :param y: list of labels
    :param smoothing: smoothing constant
    :returns: weights, as a defaultdict where the keys are (label, word) tuples
        and the values are smoothed log-probs of P(word|label); the
        (label, OFFSET) weight holds the log-prior log P(label)
    :rtype: defaultdict
    """
    # Variable names mirror the notation of the algorithm in 4.14 in J&M.
    # They violate Python naming conventions, but keeping the correspondence
    # to the textbook makes the algorithm easier to follow.
    D = x       # all documents, as lists of base feature counts
    C = set(y)  # the set of classes (labels)
    # Vocabulary counts, aggregated once over all documents.
    V = preproc.aggregate_counts(D)

    weights = defaultdict(float)
    for c in C:
        # estimate_pxy (defined elsewhere in this module) returns the smoothed
        # log-probabilities log P(word | c); it needs the per-document labels y
        # to select the documents belonging to class c.
        p_xy = estimate_pxy(D, y, c, smoothing, V)
        for w, p in p_xy.items():
            weights[(c, w)] = p
        # The OFFSET weight is the log-prior log P(y = c): the offset feature
        # fires once per document, so it contributes the prior term of the
        # Naive Bayes score log P(y) + sum_w log P(w | y).
        weights[(c, constants.OFFSET)] = np.log(sum(1 for label in y if label == c) / len(y))
    return weights
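
# A minimal sketch of how the estimated weights can be used at prediction time,
# assuming the standard Naive Bayes decision rule. The helper names below
# (_score_document, _predict_label) are illustrative and not part of the
# assignment scaffolding; clf_base presumably provides its own prediction code.
def _score_document(base_features, weights, label):
    # Naive Bayes score: log P(label) + sum_w count(w) * log P(w | label).
    # The OFFSET weight carries the log-prior term; words missing from the
    # training vocabulary default to weight 0 and so do not affect the argmax.
    score = weights[(label, constants.OFFSET)]
    for word, count in base_features.items():
        score += count * weights[(label, word)]
    return score


def _predict_label(base_features, weights, labels):
    # Return the label with the highest Naive Bayes score.
    scores = {label: _score_document(base_features, weights, label) for label in labels}
    return max(scores, key=scores.get)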