def _load_labelled_reviews(path, limit):
    """Read up to `limit` review files from each of `path`pos and `path`neg.

    Returns a list of (text, label) pairs, where label is True for files
    found under "pos" and False for files found under "neg".
    """
    documents = []
    for subdir, label in (("pos", True), ("neg", False)):
        for name in os.listdir(path + subdir)[:limit]:
            with open(path + subdir + "/" + name) as handle:
                documents.append((handle.read(), label))
    return documents


def feature_selection_trials(retrain=True):
    """
    Select top k features. Vary k and plot data

    Every word appearing in the global `pos` / `neg` tables is ranked by
    MI(word), highest first. The global `features` set is then grown in
    steps of 500 words, from 20500 up to 40000, and after each step
    `classify` is scored on up to 500 files from each of
    ./aclImdb/test/pos and ./aclImdb/test/neg.

    Args:
        retrain: when False and FDATA_FILE exists, `pos`, `neg` and
            `totals` are loaded from that file and nothing else is done.
            Defaults to True (always re-run the trials).

    Side effects:
        * adds words to, and finally rebinds, the global `features`
          (the final value is the best-scoring prefix of the ranking);
        * pickles get_relevant_features() into FDATA_FILE;
        * prints progress and shows an accuracy-vs-feature-count plot.
    """
    global pos, neg, totals, features

    if not retrain and os.path.isfile(FDATA_FILE):
        # NOTE: unpickling can execute arbitrary code; FDATA_FILE must be a
        # trusted, locally generated cache.
        with open(FDATA_FILE) as cache:
            pos, neg, totals = cPickle.load(cache)
        return

    words = list(set(pos) | set(neg))
    print("Total no of features: %s" % len(words))
    words.sort(key=MI, reverse=True)  # most informative words first

    limit = 500
    path = "./aclImdb/test/"
    step = 500
    start = 20000
    stop = 40000

    # The evaluation documents do not depend on k, so read them only once
    # instead of re-listing and re-reading every file on each iteration.
    documents = _load_labelled_reviews(path, limit)

    num_features, accuracy = [], []
    bestk = 0
    best_accuracy = 0.0

    features.update(words[:start])
    for k in xrange(start, stop, step):
        features.update(words[k:k + step])

        correct = 0
        for text, label in documents:
            correct += classify(text) == label

        # float() keeps this a true ratio even if the module does not
        # enable true division.
        score = float(correct) / len(documents)

        num_features.append(k + step)
        accuracy.append(score)
        if score > best_accuracy:
            # Track the running maximum; at this point `features` holds
            # words[:k + step], so that is the size to remember.
            best_accuracy = score
            bestk = k + step
        print("%s %s" % (k + step, score))

    features = set(words[:bestk])
    with open(FDATA_FILE, 'w') as cache:
        cPickle.dump(get_relevant_features(), cache)

    pylab.plot(num_features, accuracy)
    pylab.show()
def feature_selection_experiment(test_set):
    """
    Select top k features. Vary k from 1000 to 50000 and plot data

    The distinct words of the global `positive` / `negative` tables are
    ranked by mutual_info(word), highest first. The feature set is grown
    1000 words at a time and, after each step, `classify` is scored
    against `test_set`.

    Args:
        test_set: a sized collection of (text, label) pairs; `label` is
            compared for equality with the result of classify(text).

    Side effects:
        Prints the 100 lowest-ranked words and two sample transformations
        of a fixed sentence, then shows an accuracy-vs-feature-count plot.
    """
    # De-duplicate: a word present in both tables must be ranked once,
    # otherwise each 1000-wide slice holds fewer distinct features than
    # the count reported on the plot.
    vocabulary = set(positive) | set(negative)
    # Sort descending by mutual info. A key function is used because a
    # comparator returning only True/False never signals "less than" and
    # therefore does not define a valid ordering.
    sorted_keys = sorted(vocabulary, key=mutual_info, reverse=True)
    features = set()
    num_features, accuracy = [], []
    print(sorted_keys[-100:])

    for k in xrange(0, 50000, 1000):
        features |= set(sorted_keys[k:k + 1000])
        # NOTE(review): `preprocessor` is built but never handed to
        # classify(), so the growing feature set may not influence the
        # measured accuracy at all -- confirm how classify() is meant to
        # receive it.
        preprocessor = partial(reduce_features, features)
        correct = 0
        for text, label in test_set:
            correct += classify(text) == label
        num_features.append(k + 1000)
        # float() keeps this a true ratio even if the module does not
        # enable true division.
        accuracy.append(float(correct) / len(test_set))
    print(negate_sequence("Is this a good idea"))
    print(reduce_features(features, "Is this a good idea"))

    pylab.plot(num_features, accuracy)
    pylab.show()