def _bow_features(nnuf, X, indices, num_candidates=0):
    """Build one bag-of-words feature vector per sample listed in ``indices``.

    For every element ``xi`` of ``X[i]`` the index returned by
    ``nnuf.index(xi, detail=True)`` is collected; the collected indices for
    one sample are then turned into a feature vector by
    ``util.bow(nbrs, D_atoms)``.

    Args:
        nnuf: object exposing ``index(xi, detail=True)`` that returns a
            3-tuple ``(idx, mag, num_c)``.
        X: indexable collection of samples; each ``X[i]`` is iterable.
        indices: iterable of sample positions in ``X`` to featurize.
        num_candidates: running total to keep accumulating into.

    Returns:
        ``(features, num_candidates)`` where ``features`` is a list with one
        entry per index and ``num_candidates`` is the incoming total plus
        every ``num_c`` reported during the lookups.
    """
    features = []
    for i in indices:
        nbrs = []
        for xi in X[i]:
            # The second tuple element (``mag`` in the original code) is
            # not needed here.
            idx, _mag, num_c = nnuf.index(xi, detail=True)
            num_candidates += num_c
            nbrs.append(idx)
        # D_atoms is a module-level name defined outside this block.
        features.append(util.bow(nbrs, D_atoms))
    return features, num_candidates


def classify(nnuf, X, Y, train_index, test_index):
    """Train a LinearSVC on bag-of-words features and score it on a test split.

    Args:
        nnuf: lookup object passed through to the featurization step
            (must provide ``index(xi, detail=True)``).
        X: indexable collection of samples.
        Y: labels, indexed with ``train_index`` / ``test_index`` directly
            (presumably a NumPy array, since it is indexed by an index
            collection -- confirm against the caller).
        train_index: positions of the training samples.
        test_index: positions of the test samples.

    Returns:
        ``(accuracy, num_candidates)``: the ``accuracy_score`` of the
        predictions on the test split, and the sum of the candidate counts
        reported by ``nnuf.index`` over both the training and test samples.
    """
    train_features, num_candidates = _bow_features(nnuf, X, train_index)

    svm = LinearSVC()
    svm.fit(train_features, Y[train_index])

    # Keep accumulating into the same counter so the total covers both splits.
    test_features, num_candidates = _bow_features(
        nnuf, X, test_index, num_candidates)
    Y_pred = svm.predict(test_features)

    return accuracy_score(Y[test_index], Y_pred), num_candidates
storage (e.g. 'mini' or 'half')
output_path
chunk_size (size in bytes, -1 for no chunks)
'''
# NOTE(review): line structure below was reconstructed from
# whitespace-collapsed source -- confirm loop extents against the original.

# All script arguments arrive as one JSON object in the first CLI argument.
args = json.loads(sys.argv[1])
storage = name_to_storage(args['storage'])
# Maximum number of stacked rows of X used to fit KMeans further down.
KMeans_tr_size = 200000
X, Y, X_normal = read_dataset(args['tr_folder_path'], args['dtype'])

# NOTE(review): 100 and 12 are positional arguments whose meaning is defined
# by pscgen.Pipeline -- consider naming them.
pipe = pscgen.Pipeline(100, 12)
pipe.fit(X, Y, args['D_atoms'], args['alpha'], args['beta'], storage)

# Predict every sample through three different routes so the results can be
# compared: the fitted SVM directly (cl1), a standalone classify() fed the
# SVM's coefficients and intercept (cl2), and the pipeline's own classify()
# on X_normal (cl3).
cl1, cl2, cl3 = [], [], []
for i in xrange(len(X)):
    x = util.bow(pipe.nnu.index(X[i]), args['D_atoms'])
    cl1.append(pipe.svm.predict(x)[0])
    # NOTE(review): classify() is called with four arguments here; confirm
    # which classify is in scope and what the trailing 13 stands for.
    cl2.append(pipe.svm.classes_[classify(x, pipe.svm.coef_, pipe.svm.intercept_, 13)])
    cl3.append(pipe.classify(X_normal[i]))

# Number of samples on which cl1 and cl3 agree (cl2 is collected but not
# compared here).
print len([i for i, j in zip(cl1, cl3) if i == j])
# NOTE(review): unconditional assertion -- raises AssertionError here (unless
# Python runs with -O), so the KMeans training below is not reached. Looks
# like a leftover debugging stop; confirm before removing.
assert False

# Stack all samples row-wise and keep at most KMeans_tr_size rows.
X_Kmeans = np.vstack(X)[:KMeans_tr_size]

# Train D using KMeans
# NOTE(review): init_size is a MiniBatchKMeans parameter in scikit-learn;
# presumably KMeans is an alias for it -- verify the import.
D = KMeans(n_clusters=args['D_atoms'], init_size=args['D_atoms']*3)
D.fit(X_Kmeans)
#nns print 'Nearest Neighbor' for i, N in enumerate(Ns): D = KMeans(n_clusters=N, init_size=N*3) D.fit(X_tr_Kmeans) D = D.cluster_centers_ D = util.normalize(D) D_mean = np.mean(D, axis=0) D = D - D_mean svm_nns_xs_tr, svm_nns_xs_t = [], [] for x in X_tr: x = util.normalize(x) x = x - D_mean nbrs = np.argmax(np.abs(np.dot(D, x.T)), axis=0) svm_nns_xs_tr.append(util.bow(nbrs, N)) for x in X_t: x = util.normalize(x) x = x - D_mean nbrs = np.argmax(np.abs(np.dot(D, x.T)), axis=0) svm_nns_xs_t.append(util.bow(nbrs, N)) acc = util.predict_chi2(svm_nns_xs_tr, Y_tr, svm_nns_xs_t, Y_t) print N, acc nns_dists.append(acc) D = KMeans(n_clusters=NNU_N, init_size=NNU_N*3) D.fit(X_tr_Kmeans) D = D.cluster_centers_