Example #1
from nose.tools import eq_

import clf_base   # project-local module (import path assumed)
import constants  # project-local module defining the OFFSET feature name


def test_d2_1_featvec():
    """make_feature_vector pairs each base feature, plus an offset, with the label."""
    label = '1980s'
    fv = clf_base.make_feature_vector({'test': 1, 'case': 2}, label)
    eq_(len(fv), 3)
    eq_(fv[(label, 'test')], 1)
    eq_(fv[(label, 'case')], 2)
    eq_(fv[(label, constants.OFFSET)], 1)
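The test pins down the contract of clf_base.make_feature_vector: each base feature count is copied under a (label, feature) key, and a single offset feature is added with value 1. The function itself is not part of this excerpt; the sketch below is only an illustration consistent with the assertions above, with constants.OFFSET assumed to be the reserved offset feature name.

from collections import defaultdict

import constants  # project-local module assumed to define the OFFSET feature name


def make_feature_vector(base_features, label):
    """Hypothetical sketch: attach the label to every base feature, plus an offset."""
    fv = defaultdict(float)
    for word, count in base_features.items():
        fv[(label, word)] = count
    fv[(label, constants.OFFSET)] = 1  # the offset/bias feature always fires once
    return fv

For {'test': 1, 'case': 2} and label '1980s' this produces exactly three entries, matching eq_(len(fv), 3).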
Example #2
from collections import defaultdict
from math import log

import preproc  # project-local module providing aggregate_counts (import path assumed)
from constants import OFFSET  # assumed location of the offset feature name


def estimate_nb(x, y, smoothing):
    """
    Estimate a Naive Bayes model.

    :param x: list of dictionaries of base feature counts
    :param y: list of labels
    :param smoothing: additive smoothing constant
    :returns: weights, as a defaultdict where the keys are (label, word) tuples
        and the values are smoothed log-probabilities log P(word|label); the
        (label, OFFSET) key holds the class log prior log P(label)
    :rtype: defaultdict
    """
    labels = set(y)

    # References to the parameters that match the formula and algorithm in
    # eq. 4.14 of J&M. The names violate Python naming conventions, but they
    # make the correspondence to the textbook explicit.
    D = x       # all documents
    C = labels  # the set of classes
    V = preproc.aggregate_counts(D)  # vocabulary counts, shared by every class

    weights = defaultdict(float)
    for c in C:
        # Smoothed log-likelihoods log P(word | c). estimate_pxy is assumed to
        # take the per-document label list y so it can select documents of class c.
        p_xy = estimate_pxy(D, y, c, smoothing, V)
        for w, p in p_xy.items():
            weights[(c, w)] = p

        # The OFFSET weight carries the class log prior log P(c), so that the
        # linear score sum_w count(w) * weights[(c, w)] + weights[(c, OFFSET)]
        # equals the Naive Bayes log joint probability of the document and class.
        weights[(c, OFFSET)] = log(sum(1 for label in y if label == c) / len(y))

    return weights
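estimate_pxy is called above but not included in this excerpt. The sketch below is a minimal, assumed implementation consistent with the call estimate_pxy(D, y, c, smoothing, V), applying the additive smoothing of J&M eq. 4.14; its name, signature, and parameter meanings are inferred from the call site, not taken from the project.

from collections import Counter
from math import log


def estimate_pxy(x, y, label, smoothing, vocab):
    """Hypothetical sketch: smoothed log P(word | label) for every word in vocab."""
    # Aggregate feature counts over the documents carrying this label.
    counts = Counter()
    for doc, doc_label in zip(x, y):
        if doc_label == label:
            counts.update(doc)

    total = sum(counts.values())

    # Additive smoothing: P(w|c) = (count(w,c) + alpha) / (sum_w' count(w',c) + alpha * |V|)
    return {
        w: log((counts[w] + smoothing) / (total + smoothing * len(vocab)))
        for w in vocab
    }

With weights built this way, scoring a document against a class is the dot product of the weights with the feature vector from Example #1: the word weights contribute sum_w count(w) * log P(w|c), the OFFSET weight contributes log P(c), and taking the argmax over classes recovers the usual Naive Bayes decision rule.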