def kl_filter(data, verbose=True, progress=False, out=sys.stdout, feature_label_cuttoff=0, feature_count_cuttoff=0, do_label_count=False): """ data = (label, [features ...]) KL is a synonym for Information Gain KL( p(label) || p(label|feature) ) """ (L, F, data) = integerize(data) if do_label_count: label_count = defaultdict(int) for label, features in data: label_count[label] += 1 label_count = label_count.items() label_count.sort(key=lambda x: -x[1]) # sort by count print 'label count' for k,v in label_count: print '%20s => %s' % (k, v) sys.exit(0) K = len(L) M = len(F) if progress: from arsenal.iterextras import iterview else: iterview = lambda x, *a, **kw: x if progress: print >> sys.stderr, '\nTally' # label-feature tally (note: we ignore dulicate features) counts = zeros((K,M)) for y, fv in iterview(data, every=5000): counts[y, fv] += 1 feature_counts = counts.sum(axis=0) if feature_count_cuttoff > 0: cut = feature_counts < feature_count_cuttoff #if verbose: print >> sys.stderr, '%s of %s below cutoff of %s' \ % (cut.sum(), len(feature_counts), feature_count_cuttoff) if progress: print >> sys.stderr, '%s / %s (%.2f%%) features below cuttoff' % \ (cut.sum(), M, cut.sum()*100.0/M) # zero-out features below cuttoff counts[:, cut] = 0 if feature_label_cuttoff: cut = counts < feature_label_cuttoff if progress: print >> sys.stderr, '%s / %s (%.2f%%) feature-label pairs below cuttoff' % \ (cut.sum(), K*M, cut.sum()*100.0/(K*M)) # zero-out features below cuffoff counts[cut] = 0 label_prior = normalize(counts.sum(axis=1)) # compute KL if progress: print >> sys.stderr, '\nKL' KL = zeros(M) for f in iterview(xrange(M), every=5000): label_given_f = lidstone(counts[:,f], 0.0001) # avoids divide-by-zero KL[f] = -kl_divergence(label_prior, label_given_f) # print KL-feature, most-informative first for i in KL.argsort(): z = counts[:,i].sum() if z == 0: continue p = counts[:,i] * 1.0 / z l = [(v, k) for k,v in zip(L, p) if v > 0] l.sort() z = (-KL[i], F.lookup(i), l) if verbose: print >> out, '%8.6f\t%s\t%s' % (-KL[i], int(counts[:,i].sum()), F.lookup(i)), '\t\033[32m', ' '.join('%s(%s)' % (k,v) for v, k in l), '\033[0m' yield z
def kl_filter(data, verbose=True, progress=False, out=sys.stdout, feature_label_cuttoff=0, feature_count_cuttoff=0, do_label_count=False): """ data = (label, [features ...]) KL is a synonym for Information Gain KL( p(label) || p(label|feature) ) """ (L, F, data) = integerize(data) if do_label_count: label_count = defaultdict(int) for label, features in data: label_count[label] += 1 label_count = label_count.items() label_count.sort(key=lambda x: -x[1]) # sort by count print 'label count' for k, v in label_count: print '%20s => %s' % (k, v) sys.exit(0) K = len(L) M = len(F) if progress: from arsenal.iterextras import iterview else: iterview = lambda x, *a, **kw: x if progress: print >> sys.stderr, '\nTally' # label-feature tally (note: we ignore dulicate features) counts = zeros((K, M)) for y, fv in iterview(data, every=5000): counts[y, fv] += 1 feature_counts = counts.sum(axis=0) if feature_count_cuttoff > 0: cut = feature_counts < feature_count_cuttoff #if verbose: print >> sys.stderr, '%s of %s below cutoff of %s' \ % (cut.sum(), len(feature_counts), feature_count_cuttoff) if progress: print >> sys.stderr, '%s / %s (%.2f%%) features below cuttoff' % \ (cut.sum(), M, cut.sum()*100.0/M) # zero-out features below cuttoff counts[:, cut] = 0 if feature_label_cuttoff: cut = counts < feature_label_cuttoff if progress: print >> sys.stderr, '%s / %s (%.2f%%) feature-label pairs below cuttoff' % \ (cut.sum(), K*M, cut.sum()*100.0/(K*M)) # zero-out features below cuffoff counts[cut] = 0 label_prior = normalize(counts.sum(axis=1)) # compute KL if progress: print >> sys.stderr, '\nKL' KL = zeros(M) for f in iterview(xrange(M), every=5000): label_given_f = lidstone(counts[:, f], 0.0001) # avoids divide-by-zero KL[f] = -kl_divergence(label_prior, label_given_f) # print KL-feature, most-informative first for i in KL.argsort(): z = counts[:, i].sum() if z == 0: continue p = counts[:, i] * 1.0 / z l = [(v, k) for k, v in zip(L, p) if v > 0] l.sort() z = (-KL[i], F.lookup(i), l) if verbose: print >> out, '%8.6f\t%s\t%s' % (-KL[i], int( counts[:, i].sum()), F.lookup(i)), '\t\033[32m', ' '.join( '%s(%s)' % (k, v) for v, k in l), '\033[0m' yield z