예제 #1
0
def kl_filter(data,
              verbose=True,
              progress=False,
              out=sys.stdout,
              feature_label_cuttoff=0,
              feature_count_cuttoff=0,
              do_label_count=False):
    """
    data = (label, [features ...])

    KL is a synonym for Information Gain

    KL( p(label) || p(label|feature) )
    """
    (L, F, data) = integerize(data)

    if do_label_count:
        label_count = defaultdict(int)
        for label, features in data:
            label_count[label] += 1
        label_count = label_count.items()
        label_count.sort(key=lambda x: -x[1])  # sort by count
        print 'label count'
        for k,v in label_count:
            print '%20s => %s' % (k, v)
        sys.exit(0)

    K = len(L)
    M = len(F)

    if progress:
        from arsenal.iterextras import iterview
    else:
        iterview = lambda x, *a, **kw: x

    if progress:
        print >> sys.stderr, '\nTally'

    # label-feature tally (note: we ignore dulicate features)
    counts = zeros((K,M))
    for y, fv in iterview(data, every=5000):
        counts[y, fv] += 1

    feature_counts = counts.sum(axis=0)

    if feature_count_cuttoff > 0:
        cut = feature_counts < feature_count_cuttoff

        #if verbose:
        print >> sys.stderr, '%s of %s below cutoff of %s' \
            % (cut.sum(), len(feature_counts), feature_count_cuttoff)

        if progress:
            print >> sys.stderr, '%s / %s (%.2f%%) features below cuttoff' % \
                (cut.sum(), M, cut.sum()*100.0/M)

        # zero-out features below cuttoff
        counts[:, cut] = 0

    if feature_label_cuttoff:
        cut = counts < feature_label_cuttoff

        if progress:
            print >> sys.stderr, '%s / %s (%.2f%%) feature-label pairs below cuttoff' % \
                (cut.sum(), K*M, cut.sum()*100.0/(K*M))

        # zero-out features below cuffoff
        counts[cut] = 0

    label_prior = normalize(counts.sum(axis=1))

    # compute KL
    if progress:
        print >> sys.stderr, '\nKL'

    KL = zeros(M)
    for f in iterview(xrange(M), every=5000):
        label_given_f = lidstone(counts[:,f], 0.0001)   # avoids divide-by-zero
        KL[f] = -kl_divergence(label_prior, label_given_f)

    # print KL-feature, most-informative first
    for i in KL.argsort():

        z = counts[:,i].sum()

        if z == 0:
            continue

        p = counts[:,i] * 1.0 / z

        l = [(v, k) for k,v in zip(L, p) if v > 0]
        l.sort()

        z = (-KL[i], F.lookup(i), l)

        if verbose:
            print >> out, '%8.6f\t%s\t%s' % (-KL[i], int(counts[:,i].sum()), F.lookup(i)), '\t\033[32m', ' '.join('%s(%s)' % (k,v) for v, k in l), '\033[0m'

        yield z
예제 #2
0
def kl_filter(data,
              verbose=True,
              progress=False,
              out=sys.stdout,
              feature_label_cuttoff=0,
              feature_count_cuttoff=0,
              do_label_count=False):
    """
    data = (label, [features ...])

    KL is a synonym for Information Gain

    KL( p(label) || p(label|feature) )
    """
    (L, F, data) = integerize(data)

    if do_label_count:
        label_count = defaultdict(int)
        for label, features in data:
            label_count[label] += 1
        label_count = label_count.items()
        label_count.sort(key=lambda x: -x[1])  # sort by count
        print 'label count'
        for k, v in label_count:
            print '%20s => %s' % (k, v)
        sys.exit(0)

    K = len(L)
    M = len(F)

    if progress:
        from arsenal.iterextras import iterview
    else:
        iterview = lambda x, *a, **kw: x

    if progress:
        print >> sys.stderr, '\nTally'

    # label-feature tally (note: we ignore dulicate features)
    counts = zeros((K, M))
    for y, fv in iterview(data, every=5000):
        counts[y, fv] += 1

    feature_counts = counts.sum(axis=0)

    if feature_count_cuttoff > 0:
        cut = feature_counts < feature_count_cuttoff

        #if verbose:
        print >> sys.stderr, '%s of %s below cutoff of %s' \
            % (cut.sum(), len(feature_counts), feature_count_cuttoff)

        if progress:
            print >> sys.stderr, '%s / %s (%.2f%%) features below cuttoff' % \
                (cut.sum(), M, cut.sum()*100.0/M)

        # zero-out features below cuttoff
        counts[:, cut] = 0

    if feature_label_cuttoff:
        cut = counts < feature_label_cuttoff

        if progress:
            print >> sys.stderr, '%s / %s (%.2f%%) feature-label pairs below cuttoff' % \
                (cut.sum(), K*M, cut.sum()*100.0/(K*M))

        # zero-out features below cuffoff
        counts[cut] = 0

    label_prior = normalize(counts.sum(axis=1))

    # compute KL
    if progress:
        print >> sys.stderr, '\nKL'

    KL = zeros(M)
    for f in iterview(xrange(M), every=5000):
        label_given_f = lidstone(counts[:, f], 0.0001)  # avoids divide-by-zero
        KL[f] = -kl_divergence(label_prior, label_given_f)

    # print KL-feature, most-informative first
    for i in KL.argsort():

        z = counts[:, i].sum()

        if z == 0:
            continue

        p = counts[:, i] * 1.0 / z

        l = [(v, k) for k, v in zip(L, p) if v > 0]
        l.sort()

        z = (-KL[i], F.lookup(i), l)

        if verbose:
            print >> out, '%8.6f\t%s\t%s' % (-KL[i], int(
                counts[:, i].sum()), F.lookup(i)), '\t\033[32m', ' '.join(
                    '%s(%s)' % (k, v) for v, k in l), '\033[0m'

        yield z