def draw_plot(metric):
    """Draw the two result figures stored for one metric.

    Three objects are loaded through ``dump.load_object`` from
    ``metric + '/svm/...'``: the coefficient values, the F1 values and the
    feature counts.  The first figure plots F1 against the coefficients and
    overlays the outside-defined ``score`` as a constant red line; the second
    figure plots the feature counts against the coefficients.  The function
    finishes by calling ``pt.show()`` and returns nothing.
    """
    coefs = dump.load_object(metric + '/svm/coefs.dump')
    f1_values = dump.load_object(metric + '/svm/f1.dump')
    feature_counts = dump.load_object(metric + '/svm/feat.dump')

    # Figure 1: F1 per coefficient, with `score` repeated as a flat red line.
    pt.title(metric + ': F1')
    pt.plot(coefs, f1_values)
    reference_line = [score] * len(coefs)
    pt.plot(coefs, reference_line, color='red')

    # Figure 2: number of features per coefficient.
    pt.figure()
    pt.title(metric + ': N Features')
    pt.plot(coefs, feature_counts)

    pt.show()
def information_gain(data, labels, dumped=False):
    """Compute the information gain of every feature with respect to labels.

    The gain of feature ``j`` is ``H(labels) - H(labels | feature_j)``.

    Args:
        data: sequence of samples, each an equal-length sequence of
            non-negative integer feature values.
        labels: sequence of non-negative integer labels (``np.bincount`` is
            applied to them), one per sample.
        dumped: when true, the conditional entropies are loaded from
            ``'ig/hyx.dump'`` instead of being recomputed, and ``data`` is
            not read.  When false they are computed and written there.

    Returns:
        A list of ``(gain, feature_index)`` tuples in feature order.
    """
    import numpy as np
    import util.dump as dump

    def get_features(data_set):
        # Transpose samples-by-features into one list of values per feature.
        if len(data_set) == 0:
            return []
        n = len(data_set[0])
        return [[row[j] for row in data_set] for j in range(n)]

    def possibilities(feature):
        # Empirical probabilities of the values that actually occur.
        counts = np.bincount(feature)
        return np.asarray(counts[np.nonzero(counts)] / float(len(feature)))

    def entropy(feature):
        p = possibilities(feature)
        return -np.sum(p * np.log2(p))

    def cond_entropy(x, y):
        # Weight each conditional entropy by the probability of the value
        # that was really observed.  Indexing the zero-stripped probability
        # vector by position (as if positions were values) is wrong whenever
        # the observed values are not exactly 0..k-1.
        x = np.asarray(x)
        values, counts = np.unique(x, return_counts=True)
        weights = counts / float(len(x))
        return sum(w * entropy(y[x == v]) for v, w in zip(values, weights))

    def cond_entropy_full(x, y):
        from util.frame import progress
        print('Information gain: computing conditional entropy:')
        feat_len = len(x)
        y = np.asarray(y)
        result = []
        for i, feature in enumerate(x):
            result.append(cond_entropy(feature, y))
            if i % 10 == 0:
                progress((i + 1) / feat_len)
        progress(1)
        return np.asarray(result)

    if not dumped:
        h_y_x = cond_entropy_full(get_features(data), labels)
        dump.dump_object(h_y_x, 'ig/hyx.dump')
    else:
        h_y_x = np.asarray(dump.load_object('ig/hyx.dump'))

    info_gain = entropy(labels) - h_y_x
    return [(gain, index) for index, gain in enumerate(info_gain)]
def pearson(data, labels, dumped=False):
    """Return the two-sided p-value of each feature's Pearson correlation.

    Args:
        data: sequence of samples, each an equal-length sequence of numbers.
        labels: sequence of numeric labels, one per sample.
        dumped: when true, the correlation coefficients are loaded from
            ``'pearson/ro.dump'`` instead of being recomputed, and ``data``
            is not read.  When false they are computed and written there.

    Returns:
        A list of ``(p_value, feature_index)`` tuples in feature order.  The
        p-value comes from a Student t distribution with ``len(labels) - 2``
        degrees of freedom.  A feature (or label vector) with zero variance
        yields NaN.
    """
    import math
    import warnings

    import numpy as np
    import scipy.stats as stats
    import util.dump as dump

    # NOTE(review): this silences every warning for the whole process, not
    # only inside this function.  Kept as-is because callers may rely on it.
    warnings.filterwarnings('ignore')

    def get_features(data_set):
        # Transpose samples-by-features into one list of values per feature.
        if len(data_set) == 0:
            return []
        n = len(data_set[0])
        return [[row[j] for row in data_set] for j in range(n)]

    def feature_correlation(x, y):
        x = np.asarray(x, dtype=float)
        y = np.asarray(y, dtype=float)
        x_dev = x - x.mean()
        y_dev = y - y.mean()
        denom = math.sqrt(np.sum(x_dev ** 2)) * math.sqrt(np.sum(y_dev ** 2))
        if denom == 0:
            # Constant input: the coefficient is undefined.  Return NaN
            # instead of raising ZeroDivisionError on plain Python numbers.
            return float('nan')
        return np.sum(x_dev * y_dev) / denom

    def correlation(x, y):
        from util.frame import progress
        print('Pearson: computing corellation coefficients:')
        feat_len = len(x)
        result = []
        for i, feature in enumerate(x):
            result.append(feature_correlation(feature, y))
            if i % 10 == 0:
                progress((i + 1) / feat_len)
        progress(1)
        return np.asarray(result)

    if not dumped:
        ro = correlation(get_features(data), labels)
        dump.dump_object(ro, 'pearson/ro.dump')
    else:
        ro = dump.load_object('pearson/ro.dump')

    # Rounding can push |ro| marginally above 1, which would make the square
    # root below fail; clip first (NaN entries stay NaN).
    ro = np.clip(np.asarray(ro, dtype=float), -1.0, 1.0)
    if ro.size == 0:
        return []

    v = len(labels) - 2
    scale = math.sqrt(v)
    with np.errstate(divide='ignore', invalid='ignore'):
        # |ro| == 1 gives t = +/-inf, i.e. a p-value of 0.
        t = ro * scale / np.sqrt(1 - ro ** 2)
    p_values = stats.t.sf(np.abs(t), v) * 2
    return [(p_value, index) for index, p_value in enumerate(p_values)]
def spearman(data, labels, dumped=False):
    """Return the two-sided p-value of each feature's Spearman correlation.

    The coefficient is computed as the Pearson correlation of average ranks,
    which stays valid when values are tied.  The shortcut
    ``1 - 6 * sum(d^2) / (n * (n^2 - 1))`` holds only without ties, and
    labels normally contain many ties.

    Args:
        data: sequence of samples, each an equal-length sequence of numbers.
        labels: sequence of labels, one per sample.
        dumped: when true, the correlation coefficients are loaded from
            ``'spearman/ro.dump'`` instead of being recomputed, and ``data``
            is not read.  When false they are computed and written there.

    Returns:
        A list of ``(p_value, feature_index)`` tuples in feature order.  The
        p-value comes from a Student t distribution with ``len(labels) - 2``
        degrees of freedom.  A constant feature (or label vector) yields NaN.
    """
    import math
    import warnings

    import numpy as np
    import scipy.stats as stats
    import util.dump as dump

    # NOTE(review): this silences every warning for the whole process, not
    # only inside this function.  Kept as-is because callers may rely on it.
    warnings.filterwarnings('ignore')

    def get_features(data_set):
        # Transpose samples-by-features into one list of values per feature.
        if len(data_set) == 0:
            return []
        n = len(data_set[0])
        return [[row[j] for row in data_set] for j in range(n)]

    def centered_ranks(values):
        # Average ranks (rankdata's default tie handling), minus their mean.
        ranks = stats.rankdata(values)
        return ranks - ranks.mean()

    def feature_correlation(x, y_dev, y_norm):
        x_dev = centered_ranks(x)
        denom = math.sqrt(np.sum(x_dev ** 2)) * y_norm
        if denom == 0:
            # All ranks equal on one side: the coefficient is undefined.
            return float('nan')
        return np.sum(x_dev * y_dev) / denom

    def correlation(x, y):
        from util.frame import progress
        print('Spearman: computing corellation coefficients:')
        feat_len = len(x)
        # The label ranks do not depend on the feature; compute them once.
        y_dev = centered_ranks(y)
        y_norm = math.sqrt(np.sum(y_dev ** 2))
        result = []
        for i, feature in enumerate(x):
            result.append(feature_correlation(feature, y_dev, y_norm))
            if i % 10 == 0:
                progress((i + 1) / feat_len)
        progress(1)
        return np.asarray(result)

    if not dumped:
        ro = correlation(get_features(data), labels)
        dump.dump_object(ro, 'spearman/ro.dump')
    else:
        ro = dump.load_object('spearman/ro.dump')

    # Coefficients outside [-1, 1] (rounding, or dumps produced with the
    # no-ties shortcut) would make the square root below fail; clip first.
    ro = np.clip(np.asarray(ro, dtype=float), -1.0, 1.0)
    if ro.size == 0:
        return []

    v = len(labels) - 2
    scale = math.sqrt(v)
    with np.errstate(divide='ignore', invalid='ignore'):
        # |ro| == 1 gives t = +/-inf, i.e. a p-value of 0.
        t = ro * scale / np.sqrt(1 - ro ** 2)
    p_values = stats.t.sf(np.abs(t), v) * 2
    return [(p_value, index) for index, p_value in enumerate(p_values)]
import util.dump as dump
import matplotlib.pyplot as pt

# Boolean switches; the code that reads them is outside this chunk.
INFO_GAIN = True
PEARSON = True
SPEARMAN = True
IG_NBEST = True
PEARSON_NBEST = True
SPEARMAN_NBEST = True
VENN = True
VENN_NBEST = True

# Value drawn as the red reference line by draw_plot().
score = dump.load_object('score.dump')


def draw_plot(metric):
    """Draw the two result figures stored for one metric.

    Three objects are loaded through ``dump.load_object`` from
    ``metric + '/svm/...'``: the coefficient values, the F1 values and the
    feature counts.  The first figure plots F1 against the coefficients and
    overlays the module-level ``score`` as a constant red line; the second
    figure plots the feature counts against the coefficients.  The function
    finishes by calling ``pt.show()`` and returns nothing.
    """
    coefs = dump.load_object(metric + '/svm/coefs.dump')
    f1_values = dump.load_object(metric + '/svm/f1.dump')
    feature_counts = dump.load_object(metric + '/svm/feat.dump')

    # Figure 1: F1 per coefficient, with `score` repeated as a flat red line.
    pt.title(metric + ': F1')
    pt.plot(coefs, f1_values)
    reference_line = [score] * len(coefs)
    pt.plot(coefs, reference_line, color='red')

    # Figure 2: number of features per coefficient.
    pt.figure()
    pt.title(metric + ': N Features')
    pt.plot(coefs, feature_counts)

    pt.show()
def run_classifier(train_data, train_labels, test_data, classifier):
    """Fit ``classifier`` on the training set and predict the test set.

    Args:
        train_data: 2-D training matrix (``.shape`` is read).
        train_labels: labels passed to ``classifier.fit``.
        test_data: samples passed to ``classifier.predict``.
        classifier: object exposing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        The classifier's predictions for ``test_data``.  When the training
        matrix has no feature columns nothing is fitted and an all-zero
        prediction is returned, one entry per *test* sample.
    """
    if train_data.shape[1] == 0:
        # The fallback must match the number of test rows, otherwise it
        # cannot be compared with the test labels.
        return np.zeros(np.shape(test_data)[0], dtype=int)
    classifier.fit(train_data, train_labels)
    return classifier.predict(test_data)


def classify(x, x_val, y):
    """Train a ``LinearSVC`` on ``(x, y)`` and return predictions for ``x_val``."""
    import sklearn.svm as svm
    predict = run_classifier(x, y, x_val, svm.LinearSVC())
    return predict


data = dump.load_object('data.dump')
data_valid = dump.load_object('data_valid.dump')
labels = dump.load_object('labels.dump')
labels_valid = dump.load_object('labels_valid.dump')

# F1 obtained with every feature; stored in 'score.dump'.
score = metrics.f1_score(labels_valid, classify(data, data_valid, labels))
print(score)
print()
dump.dump_object(score, 'score.dump')

# INFO GAIN
# NOTE(review): the original indentation was lost and this chunk ends here;
# the four assignments are assumed to belong to the `if` body - confirm.
if INFO_GAIN:
    ig = dump.load_object('ig/ig.dump')
    ig_coefs = np.arange(0.1, 0.91, 0.01)
    ig_f1 = []
    ig_n_feat = []
def labels_to_np_array(labels_data):
    """Return a ``(len(labels_data), 10)`` array with a single 1 per row.

    Row ``i`` is all zeros except column ``labels_data[i]``, which is set
    to 1.  The width of 10 is fixed here.
    """
    x = np.zeros((len(labels_data), 10))
    for i in range(len(labels_data)):
        x[i][labels_data[i]] = 1
    return x


def get_predicted(predict_data):
    """Return, for each row of ``predict_data``, the index of its largest entry.

    On ties the lowest index wins, since ``max`` keeps the first maximum.
    """
    return [max(range(len(i)), key=lambda x: i[x]) for i in predict_data]


# Statistics accumulators; stats_x / stats_y are reloaded from disk below.
stats_x, stats_y, stats_y2, stats_y3 = [], [], [], []

if CONTINUE or DUMPED:
    stats_x, stats_y = load_object('stoch-n-images-stat.dump')

# Equivalent to `not DUMPED or CONTINUE`.
# NOTE(review): the original indentation was lost and this chunk is cut off
# below; every following statement is assumed to be inside this branch -
# confirm against the full file.
if not DUMPED or (DUMPED and CONTINUE):
    # Placeholders, overwritten by the reader calls below.
    train_labels = []
    train_images = []
    image_size = (28, 28)

    timer = Timer()

    stdout.write('Loading Train data...')
    timer.set_new()
    train_labels = reader.read_labels('mnist/train-labels-idx1-ubyte')
    train_images = reader.read_images('mnist/train-images-idx3-ubyte')
    print('DONE in ' + timer.get_diff_str())
    # Element 1 of the read_images() result replaces the default size.
    image_size = train_images[1]

    stdout.write('Loading Test data...')
    timer.set_new()
    test_labels = reader.read_labels('mnist/t10k-labels-idx1-ubyte')
# NOTE(review): this chunk starts mid-script; `timer`, `reader`, `np`,
# `train_data`, `train_labels`, `images_to_np_array` and `load_object` are
# defined outside it.
print('DONE in ' + timer.get_diff_str())

stdout.write('Loading Test data...')
timer.set_new()
test_labels_file = reader.read_labels('mnist/t10k-labels-idx1-ubyte')
test_images_file = reader.read_images('mnist/t10k-images-idx3-ubyte')
# Element 2 of the images result and element 1 of the labels result are
# the parts used as data.
test_data = images_to_np_array(test_images_file[2])
test_labels = np.asarray(test_labels_file[1])
print('DONE in ' + timer.get_diff_str())

# Disabled code, kept from the original:
# timer.set_new()
# coef = information_gain(train_data, train_labels)
# print(' DONE in ' + timer.get_diff_str())
# dump_object(coef, 'spearman.dump')

import pylab as pt

# Sort the stored entries ascending and keep element 1 of each; presumably
# these are (score, feature_index) pairs - verify against the dump's producer.
ig = [x[1] for x in sorted(load_object('ig.dump'))]

# Black 28x28 RGB image in which the selected features are painted white.
y = np.zeros((28, 28, 3))
n = 100
# The last n entries of the ascending order.
features = ig[-n:]
for i in features:
    # A flat index i maps to row i // 28 and column i % 28.
    y[i // 28][i % 28] = [1, 1, 1]
pt.imshow(y)
pt.show()

# Keep only the selected feature columns in both data sets.
fs_data = train_data.T[features].T
fs_labels = train_labels
fs_test_data = test_data.T[features].T
fs_test_labels = test_labels
def labels_to_np_array(labels_data):
    """Return a ``(len(labels_data), 10)`` array with a single 1 per row.

    Row ``i`` is all zeros except column ``labels_data[i]``, which is set
    to 1.  The width of 10 is fixed here.
    """
    x = np.zeros((len(labels_data), 10))
    for i in range(len(labels_data)):
        x[i][labels_data[i]] = 1
    return x


def get_predicted(predict_data):
    """Return, for each row of ``predict_data``, the index of its largest entry.

    On ties the lowest index wins, since ``max`` keeps the first maximum.
    """
    return [max(range(len(i)), key=lambda x: i[x]) for i in predict_data]


# Statistics accumulators; stats_x / stats_y are reloaded from disk below.
stats_x, stats_y, stats_y2, stats_y3 = [], [], [], []

if CONTINUE or DUMPED:
    stats_x, stats_y = load_object('stoch-hidden-stat.dump')

# Equivalent to `not DUMPED or CONTINUE`.
# NOTE(review): the original indentation was lost and this chunk is cut off
# below; every following statement is assumed to be inside this branch -
# confirm against the full file.
if not DUMPED or (DUMPED and CONTINUE):
    # Placeholders, overwritten by the reader calls below.
    train_labels = []
    train_images = []
    image_size = (28, 28)

    timer = Timer()

    stdout.write('Loading Train data...')
    timer.set_new()
    train_labels = reader.read_labels('mnist/train-labels-idx1-ubyte')
    train_images = reader.read_images('mnist/train-images-idx3-ubyte')
    print('DONE in ' + timer.get_diff_str())
    # Element 1 of the read_images() result replaces the default size.
    image_size = train_images[1]

    stdout.write('Loading Test data...')
    timer.set_new()
    test_labels = reader.read_labels('mnist/t10k-labels-idx1-ubyte')
# NOTE(review): this chunk starts mid-script; `timer`, `test_images`,
# `test_labels`, `train_images`, `train_labels` and the helper functions are
# defined outside it.
print('DONE in ' + timer.get_diff_str())
image_size = test_images[1]

# Test set converted once; labels_to_np_array / images_to_np_array come
# from outside this chunk.
images_test = images_to_np_array(test_images[2])
labels_test = labels_to_np_array(test_labels[1])
rang_test = len(images_test)


def classify():
    """Predict the test images with the global ``network`` and return the F1 score.

    The score is computed by ``f1_score`` against ``test_labels[1]``.
    """
    predicted = network.predict(images_test)
    predicted = get_predicted(predicted)
    return f1_score(test_labels[1], predicted)


# Placeholder instance, replaced in both branches below.
network = NeuralNetwork(1, 1, 1)
if NETWORK_DUMPED:
    # Reuse a stored network and only report its score.
    network = load_object('network.dump')
    print(classify())
else:
    # NOTE(review): the original indentation was lost and the chunk is cut
    # off below; the statements after the inner if/else are assumed to stay
    # inside this `else` branch - confirm.
    images_train = images_to_np_array(train_images[2])
    labels_train = labels_to_np_array(train_labels[1])
    stats = []
    if NETWORK_CONTINUE:
        # Resume from the stored network and statistics.
        network = load_object('network.dump')
        stats = load_object('stats.dump')
    else:
        network = NeuralNetwork(image_size[0] * image_size[1], 10, 10)
    rang_train = len(images_train)
    print('Training...')
    cycles = 0
    timer = Timer()
    progress(0)
def classify():
    """Predict the test images with the global ``network`` and return the F1 score.

    The score is computed by ``f1_score`` against ``test_labels[1]``.
    """
    predicted = network.predict(images_test)
    predicted = get_predicted(predicted)
    return f1_score(test_labels[1], predicted)


def classify_print():
    """Predict the test images and print a ``classification_report``.

    The report is printed here; the function returns ``None``.
    """
    predicted = network.predict(images_test)
    predicted = get_predicted(predicted)
    print(classification_report(test_labels[1], predicted))


# Placeholder instance, replaced in both branches below.
network = NeuralNetwork(1, 1, 1)
if NETWORK_DUMPED:
    network = load_object('stoch-network.dump')
    # classify_print() already prints the report and returns None, so it is
    # called directly; wrapping it in print() emitted a stray "None" line.
    classify_print()
else:
    # NOTE(review): the original indentation was lost and the chunk is cut
    # off below; the statements after the inner if/else are assumed to stay
    # inside this `else` branch - confirm.
    images_train = images_to_np_array(train_images[2])
    labels_train = labels_to_np_array(train_labels[1])
    stats = []
    if NETWORK_CONTINUE:
        # Resume from the stored network and statistics.
        network = load_object('stoch-network.dump')
        stats = load_object('stoch-stats.dump')
    else:
        network = NeuralNetwork(image_size[0] * image_size[1], 300, 10, layers=1)
    rang_train = len(images_train)
    print('Training...')
    cycles = 100
    num = 240