def randproj(tx, ty, rx, ry):
    """Randomly project the train and test feature sets into the same
    subspace, then run the EM, k-means, and NN experiments on them.

    Parameters: tx/ty are the training features/labels, rx/ry the test
    features/labels.
    """
    compressor = RandomProjection(tx[1].size)
    newtx = compressor.fit_transform(tx)
    # BUG FIX: the original fit a *second*, independent projection on rx,
    # putting train and test in different random subspaces. Reuse the
    # projection fitted on the training data.
    # NOTE(review): assumes RandomProjection follows the sklearn
    # fit_transform/transform convention — confirm against its definition.
    newrx = compressor.transform(rx)
    em(newtx, ty, newrx, ry, add="wRPtr", times=10)
    km(newtx, ty, newrx, ry, add="wRPtr", times=10)
    nn(newtx, ty, newrx, ry, add="wRPtr")
def best_rp_nba(self):
    """Project the scaled NBA data with a Gaussian random projection,
    keep the two highest-kurtosis components, and save the reduced
    train/test splits (and labels) as CSV files under ``self.save_dir``.
    """
    dh = data_helper()
    X_train, X_test, y_train, y_test = dh.get_nba_data()

    # Robust scaling fitted on train only, applied to both splits.
    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    rp = GaussianRandomProjection(n_components=X_train_scl.shape[1])
    X_train_transformed = rp.fit_transform(X_train_scl, y_train)
    X_test_transformed = rp.transform(X_test_scl)

    ## top 2
    # BUG FIX: rank components by the *training* kurtosis and apply the
    # same column ordering to both splits. The original ranked train and
    # test independently, so the two splits could end up keeping
    # different components, making the saved feature sets incomparable.
    kurt = kurtosis(X_train_transformed)
    i = kurt.argsort()[::-1]
    X_train_transformed = X_train_transformed[:, i][:, 0:2]
    X_test_transformed = X_test_transformed[:, i][:, 0:2]

    # save
    filename = './' + self.save_dir + '/nba_rp_x_train.txt'
    pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/nba_rp_x_test.txt'
    pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/nba_rp_y_train.txt'
    pd.DataFrame(y_train).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/nba_rp_y_test.txt'
    pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
def project_points(points, dim=None):
    """Project *points* into a lower-dimensional space.

    Uses a Gaussian random projection; when *dim* is not given the
    output dimensionality defaults to 5.
    """
    # Alternative data-driven heuristic considered previously:
    # min(max(int(np.log(len(points))), 2), 15)
    target_dim = 5 if dim is None else dim
    return GaussianRandomProjection(n_components=target_dim).fit_transform(points)
def test_fixed_state_transformer():
    """FixedStateTransformer must (1) reproduce a transformer seeded via
    random_state when given the same random_seed, (2) ignore the wrapped
    estimator's own random_state, and (3) pass data through unchanged for
    estimators with no randomness."""
    rng = check_random_state(0)
    X = rng.rand(500, 100)

    # Check that setting the random_seed is equivalent to set the
    # random_state
    seeded = GaussianRandomProjection(n_components=5, random_state=0)
    fixed = FixedStateTransformer(GaussianRandomProjection(n_components=5),
                                  random_seed=0)
    assert_array_almost_equal(fixed.fit_transform(X), seeded.fit_transform(X))

    # Check that set_params doesn't modify the results
    fixed_default = FixedStateTransformer(
        GaussianRandomProjection(n_components=5, random_state=None))
    fixed_other = FixedStateTransformer(
        GaussianRandomProjection(random_state=1, n_components=5))
    assert_array_almost_equal(fixed_default.fit_transform(X),
                              fixed_other.fit_transform(X))

    # Check that it work when there is no random_state
    identity = FixedStateTransformer(IdentityProjection())
    assert_array_almost_equal(identity.fit_transform(X), X)
# Rebuild clean 0..n-1 indices so the decomposition outputs (plain
# positional arrays) align row-wise with the frames on concat below.
X.reset_index(inplace=True, drop=True)
Y.reset_index(inplace=True, drop=True)
# Handling outliers
# Y[Y > 150] = Y.quantile(0.99)
# Five 5-component reducers fitted on the training features X.
pca = PCA(n_components=5)
ica = FastICA(n_components=5, max_iter=1000)
tsvd = TruncatedSVD(n_components=5)
gp = GaussianRandomProjection(n_components=5)
sp = SparseRandomProjection(n_components=5, dense_output=True)
x_pca = pd.DataFrame(pca.fit_transform(X))
x_ica = pd.DataFrame(ica.fit_transform(X))
x_tsvd = pd.DataFrame(tsvd.fit_transform(X))
x_gp = pd.DataFrame(gp.fit_transform(X))
x_sp = pd.DataFrame(sp.fit_transform(X))
# Prefix the generated columns so they remain distinguishable after concat.
x_pca.columns = ["pca_{}".format(i) for i in x_pca.columns]
x_ica.columns = ["ica_{}".format(i) for i in x_ica.columns]
x_tsvd.columns = ["tsvd_{}".format(i) for i in x_tsvd.columns]
x_gp.columns = ["gp_{}".format(i) for i in x_gp.columns]
x_sp.columns = ["sp_{}".format(i) for i in x_sp.columns]
# Append the component columns to the original feature matrix.
X = pd.concat((X, x_pca), axis=1)
X = pd.concat((X, x_ica), axis=1)
X = pd.concat((X, x_tsvd), axis=1)
X = pd.concat((X, x_gp), axis=1)
X = pd.concat((X, x_sp), axis=1)
# Apply the already-fitted PCA to the test features (no refit).
x_test_pca = pd.DataFrame(pca.transform(X_Test))
def em_rp(X, y, n, dataset):
    """Reduce X to n dimensions with a Gaussian random projection, fit an
    EM (Gaussian mixture) clustering for k = 2..14, print internal and
    external cluster metrics per k, and save a metrics-vs-k plot.

    Parameters
    ----------
    X : array-like feature matrix.
    y : ground-truth labels used for the external metrics.
    n : number of random-projection components.
    dataset : name suffix for the saved plot file ('EM_RP_<dataset>.png').
    """
    print("---- EM + RP ----")
    rp = GaussianRandomProjection(n_components=n)
    X_new = rp.fit_transform(X)
    em_silhouette = []
    vmeasure_score = []
    adjusted_rand = []
    mutual_in_score = []
    homogenity = []
    completeness = []
    list_k = list(range(2, 15))
    start = time.time()
    for i in list_k:
        i_start = time.time()
        print("CLUSTER :", i)
        em = GaussianMixture(n_components=i, n_init=10, max_iter=500,
                             random_state=0).fit(X_new)
        preds = em.predict(X_new)
        silhouette = silhouette_score(X_new, preds)
        em_silhouette.append(silhouette)
        print("Silhouette score : {}".format(silhouette))
        ad_rand = adjusted_rand_score(y, preds)
        adjusted_rand.append(ad_rand)
        print("Adjusted random score : {}".format(ad_rand))
        mutual_info = mutual_info_score(y, preds)
        mutual_in_score.append(mutual_info)
        print("Adjusted mutual info score : {}".format(mutual_info))
        # BUG FIX: this variable was the invalid identifier "h**o"
        # (an assignment to an expression — a SyntaxError); renamed to a
        # legal name.
        homo_score = homogeneity_score(y, preds)
        homogenity.append(homo_score)
        print("Homogeneity score: {}".format(homo_score))
        comp = completeness_score(y, preds)
        completeness.append(comp)
        print("Completeness score : {}".format(comp))
        v_measure = v_measure_score(y, preds)
        vmeasure_score.append(v_measure)
        print("V-measure score : {}".format(v_measure))
        print("BIC : {}".format(em.bic(X_new)))
        print("Log-likelihood score : {}".format(em.score(X_new)))
        i_end = time.time()
        print("Time for this iteration :", (i_end - i_start))
        print("-" * 100)
    end = time.time()
    print("TOTAL TIME", (end - start))
    # Plot every metric against the number of clusters and save to disk.
    plt.style.use('seaborn')
    plt.title('EM Clustering on RP', fontsize=16, y=1.03)
    plt.plot(list_k, em_silhouette, '-o', label='Silhouette score')
    plt.plot(list_k, adjusted_rand, '-o', label='Adjusted Random score')
    plt.plot(list_k, mutual_in_score, '-o', label='Mutual Info score')
    plt.plot(list_k, homogenity, '-o', label='Homogenity score')
    plt.plot(list_k, completeness, '-o', label='Completeness score')
    plt.plot(list_k, vmeasure_score, '-o', label='V-measure score')
    plt.xlabel('Number of clusters')
    plt.ylabel('Metrics score')
    plt.legend()
    filename = 'EM_RP_' + dataset + '.png'
    plt.savefig(filename)
    plt.clf()
# Dump the ICA kurtosis values for dataset 2 as one semicolon-separated row.
file_2.write("ICA_kurt2")
for i in range(0, len(kurt2)):
    file_2.write(";")
    file_2.write("%1.9f" % kurt2[i])
file_2.write("\n")
############################## RP ##############################
# Sweep the number of random-projection components; score each setting by
# training-set accuracy of a decision tree fit on the projected data.
grp = GaussianRandomProjection(random_state=5)
error_rate_1 = np.zeros(np.shape(data1_X)[1])
for i in range(0, np.shape(data1_X)[1]):
    grp.set_params(n_components=i + 1)
    DT1 = tree.DecisionTreeClassifier(criterion='gini', min_samples_leaf=0.005)
    # NOTE(review): fit_transform is called twice; with a fixed
    # random_state both calls should yield the same projection — confirm,
    # otherwise the tree is evaluated on a different projection than it
    # was trained on.
    error_rate_1[i] = sum(
        DT1.fit(grp.fit_transform(data1_X), data1_Y).predict(
            grp.fit_transform(data1_X)) == data1_Y) * 1.0 / n1
    print i + 1
# Keep the component count with the best (training) accuracy.
i1 = np.argmax(error_rate_1) + 1
grp.set_params(n_components=i1)
recon1 = range(0, 2)
#pairwiseDistCorr(grp.fit_transform(data1_X), data1_X)
# Same sweep for dataset 2.
error_rate_2 = np.zeros(np.shape(data2_X)[1])
for i in range(0, np.shape(data2_X)[1]):
    grp.set_params(n_components=i + 1)
    DT2 = tree.DecisionTreeClassifier(criterion='gini', min_samples_leaf=0.005)
    error_rate_2[i] = sum(
        DT2.fit(grp.fit_transform(data2_X), data2_Y).predict(
            grp.fit_transform(data2_X)) == data2_Y) * 1.0 / n2
import numpy as np
from sklearn.random_projection import GaussianRandomProjection
from sklearn import datasets, svm

# Demonstrate that GaussianRandomProjection preserves a float32 input dtype.
random_state = np.random.RandomState(0)
X = random_state.randn(10, 10000)
print(X.dtype)
X = np.array(X, dtype='float32')
print(X.dtype)
transformer = GaussianRandomProjection()
X_new = transformer.fit_transform(X)
print(X_new.dtype)
print(X.shape, X_new.shape)

# Quick SVC sanity check on iris: fit/predict with integer targets,
# then again with the string class names as targets.
iris = datasets.load_iris()
clf = svm.SVC(gamma='auto')
clf.fit(iris.data, iris.target)
print(clf.predict(iris.data[:3]))
clf.fit(iris.data, iris.target_names[iris.target])
print(clf.predict(iris.data[:3]))
# NOTE(review): the argument below closes a plotting call whose opening
# lies outside this chunk.
markersize=3)
plt.xticks(())
plt.yticks(())
plt.title('Labels mapped on the ICA-reduced 2D graph')
fig.savefig('figures/gender_em_ICA_rankings.png')
plt.close(fig)
# #
##########################################################################
print("RP - kmeans")
# For each projection size n: project, run k-means with 10 clusters, and
# count, per true label, the most common cluster as "correct" matches.
for n in nrange:
    transformer = GaussianRandomProjection(n_components=n)
    reduced_data = transformer.fit_transform(df_x)
    print("N:", n)
    kmeans = KMeans(init='k-means++', n_clusters=10, n_init=10)
    kmeans.fit(reduced_data)
    correct = 0
    for i in range(10):
        d = defaultdict(int)
        for index, row in df.iterrows():
            if row[label] == float(i):
                lab = kmeans.predict([reduced_data[index]])
                d[lab[0]] += 1
        if d:
            correct += max(d.values())
def random(X, K):
    """Project X to K dimensions with a Gaussian random projection and
    normalize the result with the module-level ``normalizer``."""
    projector = GaussianRandomProjection(n_components=K)
    reduced = projector.fit_transform(X)
    return normalizer.fit_transform(reduced)
alpha = 20.
# Rényi-DP to (epsilon, delta)-DP conversion:
# epsilon = alpha / sigma^2 + log(1/delta) / (alpha - 1)
epsilon1 = alpha / (noise_std * noise_std) + np.log(1 / delta) / (alpha - 1)
print(epsilon1)
# Perturb the adjacency matrix directly with Gaussian noise, then project.
gaussian_graph_noise = np.random.normal(0, noise_std, citeseer_graph_copy2.shape)
citeseer_graph_copy2 += gaussian_graph_noise
graph_proj_2 = clf_proj.fit_transform(citeseer_graph_copy2)
##### baseline 4: gradient descent + gradient perturbation
##### the proposed method: random projection + add Gaussian noise to the graph adjacency matrix directly
#d = 10
d = 30
fraction = 8.
# Project the (unperturbed) graph to d dimensions, then add noise in the
# projected space; noise scale shrinks with the projection dimension.
rand_proj = GaussianRandomProjection(n_components=d)
graph_randn_proj = rand_proj.fit_transform(citeseer_graph)
noise_std = fraction * np.sqrt(1 / d)
graph_randn_proj += np.random.normal(0.0, noise_std, size=graph_randn_proj.shape)
# Orthonormal basis of the noisy projection (currently unused below).
quad_base, r = qr(graph_randn_proj)
#graph_randn_svd = quad_base[:,:3*no_labels]
graph_randn_svd = graph_randn_proj
#graph_randn_svd = singlepass_evd(citeseer_graph,d)
#alpha = 2.5
#epsilon_renyi = np.max([2*(d/2*np.log((3+fraction)/(2+fraction)) + d/(2*(alpha-1))*np.log((3+fraction)/(alpha*(3+fraction) - (alpha-1)*(2+fraction)))),2*(d/2*np.log((2+fraction)/(3+fraction)) + d/(2*(alpha-1))*np.log((2+fraction)/(alpha*(2+fraction) - (alpha-1)*(3+fraction))))])
#epsilon1 = epsilon_renyi + np.log(1/delta)/(alpha-1)
#print(epsilon1)
# NOTE(review): the triple-quoted block opened here continues past this
# chunk.
'''
d = 10
# Keep a copy of the ICA projection computed above.
X_ica = X_r
plt.figure()
colors = ["b", "g", "r", "c", "m", "y", "k"]
lw = 2
# Scatter the first two components for wine-quality classes 4 and 8.
for color, i in zip(colors, [4, 8]):
    plt.scatter(X_r[y == i, 0],
                X_r[y == i, 1],
                color=color,
                alpha=.8,
                lw=lw,
                label=i)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('ICA of Wine Quality dataset')
#################################################
# Random Projection feature transformation
rca = GaussianRandomProjection(n_components=11, random_state=10)
X_r = rca.fit_transform(X)
X_rca = X_r
plt.figure()
colors = ["b", "g", "r", "c", "m", "y", "k"]
lw = 2
for color, i in zip(colors, [4, 8]):
    plt.scatter(X_r[y == i, 0],
                X_r[y == i, 1],
                color=color,
                alpha=.8,
                lw=lw,
                label=i)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('Random Projection of Wine Quality dataset')
#################################################
# Univariate feature selection (K best)
from sklearn.feature_selection import chi2
def get_rp(data, tdata, num_classes):
    """Project the train and test data into the same random subspace.

    Parameters
    ----------
    data : training feature matrix (the projection is fitted on it).
    tdata : test feature matrix (projected with the fitted projection).
    num_classes : number of output components.

    Returns
    -------
    (projected_train, projected_test)
    """
    rp = GaussianRandomProjection(n_components=num_classes)
    pdata = rp.fit_transform(data)
    # BUG FIX: the original called fit_transform(tdata), which refits a
    # *new* random projection on the test data, so train and test ended
    # up in different, incomparable subspaces. Apply the projection
    # fitted on the training data instead.
    ptdata = rp.transform(tdata)
    return pdata, ptdata
def run_random_projection(data, num_components):
    """Fit a Gaussian random projection with *num_components* output
    dimensions (seeded by the module-level ``seed``) and return the
    projected data."""
    projector = GaussianRandomProjection(n_components=num_components,
                                         random_state=seed)
    return projector.fit_transform(data)
# NOTE(review): the tuple below closes a print/format call whose opening
# lies outside this chunk.
((maxAccuracy * 100), treesNo))

# # Random Projection

# In[6]:

from sklearn.ensemble import RandomForestClassifier

n_comp = 12

# GaussianRandomProjection: project to 12 components, then score a
# random forest on the projected train/test data.
from sklearn.random_projection import GaussianRandomProjection
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(df_train)
grp_results_test = grp.transform(testData)
treesNo = 141
model = RandomForestClassifier(n_estimators=treesNo)
model.fit(grp_results_train, y_train)
score = model.score(grp_results_test, y_test)
print("GaussianRandomProjection with random forest accuracy = %.2f%%" %
      (score * 100))

# SparseRandomProjection
from sklearn.random_projection import SparseRandomProjection
srp = SparseRandomProjection(n_components=n_comp, dense_output=True,
def nn2(xs, ys, xs_test, ys_test, n_components, clf_constructor):
    """Measure NN accuracy when feeding reduced features plus cluster
    one-hots, sweeping the cluster count k, and bar-plot the results.

    Slot layout of ``ks``/``cataccs`` (index i in {0, 1} selects the
    adult / wine dataset respectively): 0-1 raw-feature baseline,
    2-3 PCA, 4-5 ICA, 6-7 RP (wine only), 8-9 VAE.

    Parameters
    ----------
    xs, ys / xs_test, ys_test : two-element lists of train / test
        features and labels (adult, wine).
    n_components : slot-indexed list of reduced dimensionalities;
        slots 0-1 are overwritten with the raw feature counts.
    clf_constructor : clusterer factory; tried with ``n_clusters=k``
        first (KMeans-style), falling back to ``n_components=k``
        (mixture-style).
    """
    ks = [0 for _ in range(10)]
    cataccs = [0 for _ in range(10)]
    ys = [to_categorical(ys[0]), to_categorical(ys[1])]
    ys_test = [to_categorical(ys_test[0]), to_categorical(ys_test[1])]
    # Baseline: train each dataset's model on the raw features.
    for i in range(2):
        shape = np.shape(xs[i])[1]
        n_components[i] = shape
        model = utils.create_adult_model(
            shape, 2) if i == 0 else utils.create_wine_model(shape, 5)
        model.fit(xs[i][:10000],
                  ys[i][:10000],
                  batch_size=50,
                  epochs=10,
                  verbose=False)
        cataccs[i] = model.evaluate(xs_test[i], ys_test[i],
                                    verbose=False)[1] * 100
    # Sweep the cluster count; keep the best accuracy per (method, dataset).
    for k in range(2, 11):
        try:
            clf = clf_constructor(n_clusters=k)
        except:
            clf = clf_constructor(n_components=k)
        for i in range(2):
            # --- PCA features + cluster one-hots ---
            pca = PCA(n_components=n_components[2 + i])
            transformed = pca.fit_transform(xs[i])
            transformed_test = pca.transform(xs_test[i])
            predict = to_categorical(clf.fit_predict(transformed[:10000]))
            predict_test = to_categorical(clf.predict(
                transformed_test[:10000]))
            input_dims = [n_components[2 + i], k]
            model = utils.create_mi_adult_model(
                input_dims, 2) if i == 0 else utils.create_mi_wine_model(
                    input_dims, 5)
            model.fit([transformed[:10000], predict],
                      ys[i][:10000],
                      batch_size=50,
                      epochs=10,
                      verbose=False)
            catacc = model.evaluate([transformed_test, predict_test],
                                    ys_test[i],
                                    verbose=False)[1] * 100
            if catacc > cataccs[2 + i]:
                ks[2 + i] = k
                cataccs[2 + i] = catacc
            # --- ICA features + cluster one-hots ---
            ica = FastICA(n_components=n_components[4 + i])
            transformed = ica.fit_transform(xs[i])
            transformed_test = ica.transform(xs_test[i])
            predict = to_categorical(clf.fit_predict(transformed[:10000]))
            predict_test = to_categorical(clf.predict(
                transformed_test[:10000]))
            input_dims = [n_components[4 + i], k]
            model = utils.create_mi_adult_model(
                input_dims, 2) if i == 0 else utils.create_mi_wine_model(
                    input_dims, 5)
            model.fit([transformed[:10000], predict],
                      ys[i][:10000],
                      batch_size=50,
                      epochs=10,
                      verbose=False)
            catacc = model.evaluate([transformed_test, predict_test],
                                    ys_test[i],
                                    verbose=False)[1] * 100
            if catacc > cataccs[4 + i]:
                ks[4 + i] = k
                cataccs[4 + i] = catacc
            # --- Random projection: run for the wine dataset only;
            # eps=0.95 lets the JL lemma pick the output dimension ---
            if i == 1:
                rp = GaussianRandomProjection(eps=0.95)
                transformed = rp.fit_transform(xs[i])
                transformed_test = rp.transform(xs_test[i])
                predict = to_categorical(clf.fit_predict(transformed[:10000]))
                predict_test = to_categorical(
                    clf.predict(transformed_test[:10000]))
                input_dims = [np.shape(transformed)[1], k]
                model = utils.create_mi_wine_model(input_dims, 5)
                model.fit([transformed[:10000], predict],
                          ys[i][:10000],
                          batch_size=50,
                          epochs=10,
                          verbose=False)
                catacc = model.evaluate([transformed_test, predict_test],
                                        ys_test[i],
                                        verbose=False)[1] * 100
                if catacc > cataccs[6 + i]:
                    ks[6 + i] = k
                    cataccs[6 + i] = catacc
            # --- VAE encoder features + cluster one-hots ---
            encoder, vae = utils.create_vae(
                np.shape(xs[i])[1], n_components[8 + i])
            vae.fit(xs[i], batch_size=50, epochs=10, verbose=False)
            transformed = encoder.predict(xs[i], verbose=False)
            transformed_test = encoder.predict(xs_test[i], verbose=False)
            predict = to_categorical(clf.fit_predict(transformed[:10000]))
            predict_test = to_categorical(clf.predict(
                transformed_test[:10000]))
            input_dims = [n_components[8 + i], k]
            model = utils.create_mi_adult_model(
                input_dims, 2) if i == 0 else utils.create_mi_wine_model(
                    input_dims, 5)
            model.fit([transformed[:10000], predict],
                      ys[i][:10000],
                      batch_size=50,
                      epochs=10,
                      verbose=False)
            catacc = model.evaluate([transformed_test, predict_test],
                                    ys_test[i],
                                    verbose=False)[1] * 100
            if catacc > cataccs[8 + i]:
                ks[8 + i] = k
                cataccs[8 + i] = catacc
    # Paired bar chart: blue = adult, orange = wine, one pair per method.
    plot.style.use('seaborn-darkgrid')
    plot.title(f'Influence of feature transformation on the NN accuracy')
    color = []
    for _ in range(5):
        color.append('tab:blue')
        color.append('tab:orange')
    x = []
    count = 1
    for _ in range(5):
        x.append(count)
        count += 0.5
        x.append(count)
        count += 1
    plot.bar(x, cataccs, color=color, width=0.75)
    x = []
    count = 1.25
    for _ in range(5):
        x.append(count)
        count += 1.5
    plot.xticks(x, ['None', 'PCA', 'ICA', 'RP', 'VAE'])
    plot.xlabel('Feature transformation method')
    plot.ylabel('Categorical accuracy (%)')
    plot.show()
def nn_benchmark(xs, ys, n_components):
    """Benchmark NN training time on raw features vs. PCA/ICA/RP/VAE
    reduced features, averaged over several trials, and bar-plot the
    mean times with error bars.

    Index i in {0, 1} selects the adult / wine dataset; RP is measured
    for wine only (its adult slot in the plot stays 0). ``n_components``
    is slot-indexed like in ``nn2``: 2-3 PCA, 4-5 ICA, 8-9 VAE; slots
    0-1 are overwritten with the raw feature counts.
    """
    ys = [to_categorical(ys[0]), to_categorical(ys[1])]
    none_samples = [[], []]
    pca_samples = [[], []]
    ica_samples = [[], []]
    rp_samples = [[], []]
    vae_samples = [[], []]
    trials = 7
    for _ in range(trials):
        # Baseline: time the fit on the raw features.
        for i in range(2):
            shape = np.shape(xs[i])[1]
            n_components[i] = shape
            model = utils.create_adult_model(
                shape, 2) if i == 0 else utils.create_wine_model(shape, 5)
            start = time.time()
            model.fit(xs[i][:10000],
                      ys[i][:10000],
                      batch_size=50,
                      epochs=10,
                      verbose=False)
            none_samples[i].append(time.time() - start)
        for i in range(2):
            # PCA-reduced features (reduction itself is not timed).
            dim = n_components[2 + i]
            pca = PCA(n_components=dim)
            transformed = pca.fit_transform(xs[i])
            model = utils.create_adult_model(
                dim, 2) if i == 0 else utils.create_wine_model(dim, 5)
            start = time.time()
            model.fit(transformed[:10000],
                      ys[i][:10000],
                      batch_size=50,
                      epochs=10,
                      verbose=False)
            pca_samples[i].append(time.time() - start)
            # ICA-reduced features.
            dim = n_components[4 + i]
            ica = FastICA(n_components=dim)
            transformed = ica.fit_transform(xs[i])
            model = utils.create_adult_model(
                dim, 2) if i == 0 else utils.create_wine_model(dim, 5)
            start = time.time()
            model.fit(transformed[:10000],
                      ys[i][:10000],
                      batch_size=50,
                      epochs=10,
                      verbose=False)
            ica_samples[i].append(time.time() - start)
            # Random projection: wine only; eps=0.95 lets the JL lemma
            # choose the output dimensionality.
            if i == 1:
                rp = GaussianRandomProjection(eps=0.95)
                transformed = rp.fit_transform(xs[i])
                dim = np.shape(transformed)[1]
                model = utils.create_wine_model(dim, 5)
                start = time.time()
                model.fit(transformed[:10000],
                          ys[i][:10000],
                          batch_size=50,
                          epochs=10,
                          verbose=False)
                rp_samples[i].append(time.time() - start)
            # VAE encoder features.
            dim = n_components[8 + i]
            encoder, vae = utils.create_vae(np.shape(xs[i])[1], dim)
            vae.fit(xs[i], batch_size=50, epochs=10, verbose=False)
            transformed = encoder.predict(xs[i], verbose=False)
            model = utils.create_adult_model(
                dim, 2) if i == 0 else utils.create_wine_model(dim, 5)
            start = time.time()
            model.fit(transformed[:10000],
                      ys[i][:10000],
                      batch_size=50,
                      epochs=10,
                      verbose=False)
            vae_samples[i].append(time.time() - start)
    # Mean training times per (method, dataset); the adult RP slot is 0
    # because RP was only run for wine.
    times = [
        np.mean(none_samples[0]),
        np.mean(none_samples[1]),
        np.mean(pca_samples[0]),
        np.mean(pca_samples[1]),
        np.mean(ica_samples[0]),
        np.mean(ica_samples[1]), 0,
        np.mean(rp_samples[1]),
        np.mean(vae_samples[0]),
        np.mean(vae_samples[1])
    ]
    # Half standard deviations used as error bars.
    times_err = [
        np.std(none_samples[0]) / 2,
        np.std(none_samples[1]) / 2,
        np.std(pca_samples[0]) / 2,
        np.std(pca_samples[1]) / 2,
        np.std(ica_samples[0]) / 2,
        np.std(ica_samples[1]) / 2, 0,
        np.std(rp_samples[1]) / 2,
        np.std(vae_samples[0]) / 2,
        np.std(vae_samples[1]) / 2
    ]
    # Paired bar chart: blue = adult, orange = wine, one pair per method.
    plot.style.use('seaborn-darkgrid')
    plot.title(f'Influence of feature transformation on the NN training time')
    color = []
    for _ in range(5):
        color.append('tab:blue')
        color.append('tab:orange')
    x = []
    count = 1
    for _ in range(5):
        x.append(count)
        count += 0.5
        x.append(count)
        count += 1
    plot.bar(x, times, color=color, width=0.75, yerr=times_err)
    x = []
    count = 1.25
    for _ in range(5):
        x.append(count)
        count += 1.5
    plot.xticks(x, ['None', 'PCA', 'ICA', 'RP', 'VAE'])
    plot.xlabel('Feature transformation method')
    plot.ylabel('Average training time (s)')
    plot.show()
def gen_features(train, val, test):
    """Append tSVD/PCA/ICA/GRP/SRP decomposition components (n_comp each)
    as new columns to the train, validation, and test frames.

    Each reducer is fitted on the training frame only and applied to the
    validation and test frames. Returns the augmented
    (train, val, test) tuple.
    """
    train = pd.DataFrame(train)
    val = pd.DataFrame(val)
    test = pd.DataFrame(test)

    # cat_cols = ['city', 'bd', 'gender', 'registered_via', 'registration_init_year',
    #             'registration_init_month', 'registration_init_date', 'payment_method_id', 'payment_plan_days',
    #             'plan_list_price', 'actual_amount_paid', 'is_auto_renew', 'is_cancel',
    #             'transaction_date_year', 'transaction_date_month', 'transaction_date_date',
    #             'membership_expire_date_year',
    #             'membership_expire_date_month', 'membership_expire_date_date', 'membership_transaction_gap',
    #             'cancel_times',
    #             'auto_renew_count', 'plan_net_worth', 'user_date_year', 'user_date_month',
    #             'user_date_date']
    # con_cols = [x for x in train.columns if x not in cat_cols and x not in ['msno', 'is_churn']]
    # train[cat_cols] = train[cat_cols].astype('object')
    # test[cat_cols] = test[cat_cols].astype('object')
    # val[cat_cols] = val[cat_cols].astype('object')
    #
    # for col in cat_cols:
    #     train[col].fillna(value=train[col].mode()[0], inplace=True)
    #     test[col].fillna(value=test[col].mode()[0], inplace=True)
    #     val[col].fillna(value=val[col].mode()[0], inplace=True)
    # for col in con_cols:
    #     train[col].fillna(value=train[col].mean(), inplace=True)
    #     test[col].fillna(value=test[col].mean(), inplace=True)
    #     val[col].fillna(value=val[col].mean(), inplace=True)
    #
    # for c in train.columns:
    #     if train[c].dtype == 'object':
    #         lbl = LabelEncoder()
    #         lbl.fit(list(train[c].values) + list(test[c].values))
    #         train[c] = lbl.transform(list(train[c].values))
    #         test[c] = lbl.transform(list(test[c].values))

    n_comp = 15
    # Columns to exclude from the decompositions (currently none).
    drop_list = []
    test_drop_list = []
    print(train.drop(drop_list, axis=1).shape,
          test.drop(test_drop_list, axis=1).shape)

    print('tSVD', datetime.now() - start)
    # tSVD
    tsvd = TruncatedSVD(n_components=n_comp)
    tsvd_results_train = tsvd.fit_transform(train.drop(drop_list, axis=1))
    tsvd_results_val = tsvd.transform(val.drop(test_drop_list, axis=1))
    tsvd_results_test = tsvd.transform(test.drop(test_drop_list, axis=1))

    print('PCA', datetime.now() - start)
    # PCA
    pca = PCA(n_components=n_comp)
    pca2_results_train = pca.fit_transform(train.drop(drop_list, axis=1))
    pca2_results_val = pca.transform(val.drop(test_drop_list, axis=1))
    pca2_results_test = pca.transform(test.drop(test_drop_list, axis=1))

    print('ICA', datetime.now() - start)
    # ICA
    ica = FastICA(n_components=n_comp, max_iter=10000)
    ica2_results_train = ica.fit_transform(train.drop(drop_list, axis=1))
    ica2_results_val = ica.transform(val.drop(test_drop_list, axis=1))
    ica2_results_test = ica.transform(test.drop(test_drop_list, axis=1))

    print('GRP', datetime.now() - start)
    # GRP
    grp = GaussianRandomProjection(n_components=n_comp, eps=0.1)
    grp_results_train = grp.fit_transform(train.drop(drop_list, axis=1))
    grp_results_val = grp.transform(val.drop(test_drop_list, axis=1))
    grp_results_test = grp.transform(test.drop(test_drop_list, axis=1))

    print('SRP', datetime.now() - start)
    # SRP
    srp = SparseRandomProjection(n_components=n_comp, dense_output=True)
    srp_results_train = srp.fit_transform(train.drop(drop_list, axis=1))
    srp_results_val = srp.transform(val.drop(test_drop_list, axis=1))
    srp_results_test = srp.transform(test.drop(test_drop_list, axis=1))

    # MCA
    # res_mca = MCA(train, ncp=n_comp, graph = FALSE)

    # save columns list before adding the decomposition components
    usable_columns = list(set(train.columns) - set(drop_list))

    # Append decomposition components to datasets
    for i in range(1, n_comp + 1):
        train['pca_' + str(i)] = pca2_results_train[:, i - 1]
        val['pca_' + str(i)] = pca2_results_val[:, i - 1]
        test['pca_' + str(i)] = pca2_results_test[:, i - 1]

        train['ica_' + str(i)] = ica2_results_train[:, i - 1]
        val['ica_' + str(i)] = ica2_results_val[:, i - 1]
        test['ica_' + str(i)] = ica2_results_test[:, i - 1]

        train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
        val['tsvd_' + str(i)] = tsvd_results_val[:, i - 1]
        test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]

        train['grp_' + str(i)] = grp_results_train[:, i - 1]
        val['grp_' + str(i)] = grp_results_val[:, i - 1]
        test['grp_' + str(i)] = grp_results_test[:, i - 1]

        train['srp_' + str(i)] = srp_results_train[:, i - 1]
        val['srp_' + str(i)] = srp_results_val[:, i - 1]
        test['srp_' + str(i)] = srp_results_test[:, i - 1]
    return train, val, test
import numpy as np
from sklearn.random_projection import GaussianRandomProjection
import pandas as pd
from sklearn import metrics

# Load the seeds dataset (no header row).
balance_data = pd.read_csv('seeds.csv', sep=',', header=None)
# Columns 0-5 are the features; column 7 is the label.
# NOTE(review): column 6 is skipped — confirm that is intentional.
X = balance_data.values[:, 0:6]
Y = balance_data.values[:, 7]

# Generate six independent 6-component Gaussian random projections of X
# and save each one to its own CSV file.
for i in range(1, 7):
    gaussian = GaussianRandomProjection(n_components=6)
    new_X = gaussian.fit_transform(X)
    np.savetxt('GAUSSIAN' + str(i) + '.csv', new_X)
def gaussianRP(data, orig_dimension, new_dimension):
    """Project *data* down to *new_dimension* features with a Gaussian
    random projection.

    ``orig_dimension`` is accepted for interface compatibility but is
    not used by the projection.
    """
    projector = GaussianRandomProjection(n_components=new_dimension)
    return projector.fit_transform(data)
# PCA pca = PCA(n_components=n_comp) # random_state=400) pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1)) pca2_results_test = pca.transform(test) # ICA ica = FastICA(n_components=n_comp) #, random_state=400) ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1)) ica2_results_test = ica.transform(test) # GRP grp = GaussianRandomProjection(n_components=n_comp, eps=0.007, random_state=400) grp_results_train = grp.fit_transform(train.drop(["y"], axis=1)) grp_results_test = grp.transform(test) # SRP srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=400) srp_results_train = srp.fit_transform(train.drop(["y"], axis=1)) srp_results_test = srp.transform(test) # Append decomposition components to datasets for i in range(1, n_comp + 1): train['pca_' + str(i)] = pca2_results_train[:, i - 1] test['pca_' + str(i)] = pca2_results_test[:, i - 1] train['ica_' + str(i)] = ica2_results_train[:, i - 1]
# NOTE(review): the statements down to O[...].append(fm) are the tail of
# an enclosing collect() helper whose def lies outside this chunk.
# Build a one-hot sparse row for the (label, parent, grandparent) context.
s = node.label() + ' ' + node.parent().label() + ' ' + node.parent(
).parent().label()
col.append(mapo[s])
fm = sparse.csr_matrix(([1] * len(col), ([0] * len(col), col)),
                       shape=(1, len(mapo)))
O[node.label()].append(fm)

for tree in tqdm(config.train, desc='NAACL collect'):
    for node in tree.postorder():
        collect(node)

# Reduce the stacked inside/outside one-hot matrices to 500 dimensions.
# NOTE(review): fit_transform is called per nonterminal, so each key gets
# its own (seeded) projection — confirm a shared projection is not needed.
rp = GaussianRandomProjection(n_components=500, random_state=42)
newI, newO = dict(), dict()
for k, v in tqdm(I.items(), desc='PCA/RP inside'):
    newI[config.nonterminal_map[k]] = rp.fit_transform(sparse.vstack(v))
for k, v in tqdm(O.items(), desc='PCA/RP outside'):
    newO[config.nonterminal_map[k]] = rp.fit_transform(sparse.vstack(v))
config.I = newI
config.O = newO
# Free the large intermediate structures before the next pass.
del M, counti, counto, mapi, mapo, I, O

transform_trees(config.train)
# Hand each tree node its own row of the reduced inside/outside matrices,
# consuming rows per label in postorder.
cnt = Counter()
for tree in config.train:
    for node in tree.postorder():
        Inode[node] = config.I[node.label()][cnt[node.label()]]
        Onode[node] = config.O[node.label()][cnt[node.label()]]
        cnt[node.label()] += 1
def gen_feature(train, test):
    """Append tSVD/PCA/ICA/GRP/SRP decomposition components (n_comp each)
    as new columns to the train and test frames.

    Each reducer is fitted on the training frame only and applied to the
    test frame. Returns the augmented (train, test) tuple.
    """
    train = pd.DataFrame(train)
    test = pd.DataFrame(test)
    n_comp = 15
    # Columns to exclude from the decompositions (currently none).
    drop_list = []
    test_drop_list = []
    print(train.drop(drop_list, axis=1).shape,
          test.drop(test_drop_list, axis=1).shape)

    print('tSVD', datetime.now() - start)
    # tSVD
    tsvd = TruncatedSVD(n_components=n_comp)
    tsvd_results_train = tsvd.fit_transform(train.drop(drop_list, axis=1))
    tsvd_results_test = tsvd.transform(test.drop(test_drop_list, axis=1))

    print('PCA', datetime.now() - start)
    # PCA
    pca = PCA(n_components=n_comp)
    pca2_results_train = pca.fit_transform(train.drop(drop_list, axis=1))
    pca2_results_test = pca.transform(test.drop(test_drop_list, axis=1))

    print('ICA', datetime.now() - start)
    # ICA
    ica = FastICA(n_components=n_comp, max_iter=10000)
    ica2_results_train = ica.fit_transform(train.drop(drop_list, axis=1))
    ica2_results_test = ica.transform(test.drop(test_drop_list, axis=1))

    print('GRP', datetime.now() - start)
    # GRP
    grp = GaussianRandomProjection(n_components=n_comp, eps=0.1)
    grp_results_train = grp.fit_transform(train.drop(drop_list, axis=1))
    grp_results_test = grp.transform(test.drop(test_drop_list, axis=1))

    print('SRP', datetime.now() - start)
    # SRP
    srp = SparseRandomProjection(n_components=n_comp, dense_output=True)
    srp_results_train = srp.fit_transform(train.drop(drop_list, axis=1))
    srp_results_test = srp.transform(test.drop(test_drop_list, axis=1))

    # MCA
    # res_mca = MCA(train, ncp=n_comp, graph = FALSE)

    # save columns list before adding the decomposition components
    usable_columns = list(set(train.columns) - set(drop_list))

    # Append decomposition components to datasets
    for i in range(1, n_comp + 1):
        train['pca_' + str(i)] = pca2_results_train[:, i - 1]
        test['pca_' + str(i)] = pca2_results_test[:, i - 1]

        train['ica_' + str(i)] = ica2_results_train[:, i - 1]
        test['ica_' + str(i)] = ica2_results_test[:, i - 1]

        train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
        test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]

        train['grp_' + str(i)] = grp_results_train[:, i - 1]
        test['grp_' + str(i)] = grp_results_test[:, i - 1]

        train['srp_' + str(i)] = srp_results_train[:, i - 1]
        test['srp_' + str(i)] = srp_results_test[:, i - 1]
    return train, test
def perform_feature_engineering(train, test, config):
    """Drop near-constant binary columns, optionally append tSVD/PCA/ICA/
    GRP/SRP components, and optionally add the 'magic' target-mean-by-X0
    feature, all driven by flags in *config*.

    config keys used: 'SparseThreshold', 'ID', 'n_comp', 'tSVD', 'PCA',
    'ICA', 'GRP', 'SRP', 'magic'. Returns the transformed (train, test).
    """
    # Remove binary columns whose positive rate is below the threshold.
    for c in train.columns:
        if (len(train[c].value_counts()) == 2):
            if (train[c].mean() < config['SparseThreshold']):
                del train[c]
                del test[c]

    col = list(test.columns)
    if config['ID'] != True:
        col.remove('ID')

    # tSVD
    if (config['tSVD'] == True):
        tsvd = TruncatedSVD(n_components=config['n_comp'])
        tsvd_results_train = tsvd.fit_transform(train[col])
        tsvd_results_test = tsvd.transform(test[col])
        for i in range(1, config['n_comp'] + 1):
            train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
            test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]

    # PCA
    if (config['PCA'] == True):
        pca = PCA(n_components=config['n_comp'])
        pca2_results_train = pca.fit_transform(train[col])
        pca2_results_test = pca.transform(test[col])
        for i in range(1, config['n_comp'] + 1):
            train['pca_' + str(i)] = pca2_results_train[:, i - 1]
            test['pca_' + str(i)] = pca2_results_test[:, i - 1]

    # ICA
    if (config['ICA'] == True):
        ica = FastICA(n_components=config['n_comp'])
        ica2_results_train = ica.fit_transform(train[col])
        ica2_results_test = ica.transform(test[col])
        for i in range(1, config['n_comp'] + 1):
            train['ica_' + str(i)] = ica2_results_train[:, i - 1]
            test['ica_' + str(i)] = ica2_results_test[:, i - 1]

    # GRP
    if (config['GRP'] == True):
        grp = GaussianRandomProjection(n_components=config['n_comp'], eps=0.1)
        grp_results_train = grp.fit_transform(train[col])
        grp_results_test = grp.transform(test[col])
        for i in range(1, config['n_comp'] + 1):
            train['grp_' + str(i)] = grp_results_train[:, i - 1]
            test['grp_' + str(i)] = grp_results_test[:, i - 1]

    # SRP
    if (config['SRP'] == True):
        srp = SparseRandomProjection(n_components=config['n_comp'],
                                     dense_output=True,
                                     random_state=420)
        srp_results_train = srp.fit_transform(train[col])
        srp_results_test = srp.transform(test[col])
        for i in range(1, config['n_comp'] + 1):
            train['srp_' + str(i)] = srp_results_train[:, i - 1]
            test['srp_' + str(i)] = srp_results_test[:, i - 1]

    # 'Magic' feature: mean target per X0 category, merged onto both
    # frames; unseen test categories fall back to the global mean.
    if config['magic'] == True:
        magic_mat = train[['ID', 'X0', 'y']]
        magic_mat = magic_mat.groupby(['X0'])['y'].mean()
        magic_mat = pd.DataFrame({
            'X0': magic_mat.index,
            'magic': list(magic_mat)
        })
        mean_magic = magic_mat['magic'].mean()
        train = train.merge(magic_mat, on='X0', how='left')
        test = test.merge(magic_mat, on='X0', how='left')
        test['magic'] = test['magic'].fillna(mean_magic)
    return train, test
import sys
from sklearn.random_projection import SparseRandomProjection, GaussianRandomProjection
import numpy as np

# Usage: python <script> <in_dim> <out_dim> <out_file>
# Fits a Gaussian random projection on dummy data only to materialize its
# (out_dim, in_dim) projection matrix, then saves the transposed matrix.
in_dim = int(sys.argv[1])
out_dim = int(sys.argv[2])
out_file = sys.argv[3]

# dummy data
X = np.zeros((2, in_dim), dtype=float)

g = GaussianRandomProjection(out_dim)
g.fit_transform(X)

# random mat, transpose() from (out_d, int_d) to (in_d, out_d)
random_mat = g.components_.transpose()
# NOTE: ndarray.dump pickles the array; reload with
# np.load(out_file, allow_pickle=True).
random_mat.dump(out_file)
def retdata():
    """Load the mood dataset ('ds.csv'), build per-user rolling-window
    features plus per-user PCA/tSVD/GRP/SRP components, and return
    (normalized-whole-dataset, normalized-per-user, raw) DataFrames."""
    # load data into dataframe
    window_size = 7
    df = pd.read_csv('ds.csv')
    df = df.drop('Unnamed: 0', 1)  # positional axis=1 (old pandas API)
    # add data part to dataframe
    df['datepart'] = pd.DatetimeIndex(pd.to_datetime(df['time'])).date
    # split dataframe up into the variable where we want the sum and where we want the mean
    means = ['mood', 'circumplex.valence', 'circumplex.arousal']
    df2 = df[df['variable'].isin(means)]
    df = df[~df['variable'].isin(means)]
    # create 2 different dataframes with different aggfunc and merge them
    pt1 = pd.pivot_table(df, values='value', index=['id', 'datepart'], aggfunc='sum', columns='variable').reset_index()  # 1973 rows
    pt2 = pd.pivot_table(df2, values='value', index=['id', 'datepart'], aggfunc='mean', columns='variable').reset_index()  # 1973 rows
    pt = pd.merge(pt1, pt2, how='left', left_on=['id', 'datepart'], right_on=['id', 'datepart'])
    # remove rows with no mood, valence, and arousal value--1268 rows
    ptcl1 = pt[np.isfinite(pt.mood)]  # 1268 rows
    ptcl2 = ptcl1[np.isfinite(ptcl1['circumplex.valence'])]  # 1266 rows
    ptcl3 = ptcl2[np.isfinite(ptcl2['circumplex.arousal'])]  # 1266 rows
    # NOTE(review): the assignments below write into a slice of `pt` (chained
    # assignment); pandas emits SettingWithCopyWarning here.
    ptcl3['weekday'] = pd.to_datetime(ptcl3['datepart']).dt.weekday_name  # .weekday_name is removed in modern pandas (use .day_name())
    le = sklearn.preprocessing.LabelEncoder()
    ptcl3['weekday'] = le.fit_transform(ptcl3['weekday'])
    ptcl3 = ptcl3.sort_values(by=['id', 'datepart'])
    # Names of the engineered columns (built for reference; not used below).
    new_feature_list = ['id', 'datepart', 'weekday', 'mood_']
    for feature_name in ptcl3.columns:
        if feature_name not in ['id', 'datepart', 'sms', 'call', 'weekday']:
            # new_feature_list.append((feature_name + 'PrevDay'))
            new_feature_list.append((feature_name + 'MeanPrevDays'))
            new_feature_list.append((feature_name + 'Gradient'))
            new_feature_list.append((feature_name + 'Log'))
        elif feature_name not in ['id', 'datepart', 'weekday']:
            # new_feature_list.append((feature_name + 'PrevDay'))
            new_feature_list.append((feature_name + 'SumPrevDays'))
            new_feature_list.append((feature_name + 'Gradient'))
    the_df = pd.DataFrame()
    # the_df.columns = the_df.loc[0,:]
    # the_df = the_df.drop(0)
    #
    the_df = the_df.fillna(0)
    ptcl3 = ptcl3.fillna(0)
    # add previous day's mood
    id_set = list(OrderedDict.fromkeys(ptcl3['id']))  # unique ids, insertion order preserved
    for person in id_set:
        persondf = ptcl3[ptcl3['id'] == person]
        for feature_name in persondf.columns:
            if feature_name == 'mood':
                persondf['mood_'] = persondf['mood']  # all original feature names will be removed, hence the new name
            if feature_name not in ['id', 'datepart', 'call', 'sms', 'weekday']:
                # persondf[str(feature_name)+'PrevDay'] = persondf[feature_name].shift(1)
                # persondf[str(feature_name)+'PrevDay'] = persondf[str(feature_name)+'PrevDay'].fillna(0)
                persondf[str(feature_name) + 'MeanPrevDays'] = persondf[str(feature_name)].rolling(window_size).mean()
                persondf[str(feature_name) + 'Gradient'] = np.gradient(persondf[str(feature_name)].rolling(window_size).mean())
                persondf[str(feature_name) + 'Log'] = np.log(persondf[str(feature_name)])
                # zero out the -inf values produced by log(0)
                persondf[str(feature_name) + 'Log'][np.isneginf(persondf[str(feature_name) + 'Log'])] = 0
                persondf = persondf.drop(feature_name, 1)
            elif feature_name not in ['id', 'datepart', 'weekday']:
                # looking at the sum instead of the mean of the previous days for sms and call
                # persondf[str(feature_name)+'PrevDay'] = persondf[feature_name].shift(1)
                # persondf[str(feature_name)+'PrevDay'] = persondf[str(feature_name)+'PrevDay'].fillna(0)
                persondf[str(feature_name) + 'SumPrevDays'] = persondf[str(feature_name)].rolling(window_size).sum()
                persondf[str(feature_name) + 'Gradient'] = np.gradient(persondf[str(feature_name)].rolling(window_size).mean())
                persondf = persondf.drop(feature_name, 1)
        persondf = persondf[persondf['activityGradient'].notnull()]  # arbritrary feature to remove the first 6 days
        persondf = persondf.fillna(0)
        # Per-person 5-component decompositions, appended as extra features.
        pca = PCA(n_components=5)
        tsvd = TruncatedSVD(n_components=5)
        gp = GaussianRandomProjection(n_components=5)
        sp = SparseRandomProjection(n_components=5, dense_output=True)
        x_pca = pd.DataFrame(pca.fit_transform(persondf.drop(['mood_', 'id', 'datepart', 'weekday'], axis=1)))
        x_tsvd = pd.DataFrame(tsvd.fit_transform(persondf.drop(['mood_', 'id', 'datepart', 'weekday'], axis=1)))
        x_gp = pd.DataFrame(gp.fit_transform(persondf.drop(['mood_', 'id', 'datepart', 'weekday'], axis=1)))
        x_sp = pd.DataFrame(sp.fit_transform(persondf.drop(['mood_', 'id', 'datepart', 'weekday'], axis=1)))
        x_pca.columns = ["pca_{}".format(i) for i in x_pca.columns]
        x_tsvd.columns = ["tsvd_{}".format(i) for i in x_tsvd.columns]
        x_gp.columns = ["gp_{}".format(i) for i in x_gp.columns]
        x_sp.columns = ["sp_{}".format(i) for i in x_sp.columns]
        x_pca = x_pca.reset_index()
        x_tsvd = x_tsvd.reset_index()
        x_gp = x_gp.reset_index()
        x_sp = x_sp.reset_index()
        persondf = persondf.reset_index()
        persondf = pd.concat((persondf, x_pca), axis=1)
        persondf = pd.concat((persondf, x_tsvd), axis=1)
        persondf = pd.concat((persondf, x_gp), axis=1)
        persondf = pd.concat((persondf, x_sp), axis=1)
        the_df = the_df.append(persondf)  # DataFrame.append is removed in pandas >= 2 (use pd.concat)
    the_df = the_df.fillna(0)
    # replace null with 0 and reindex
    cleandata = the_df.fillna(0)
    cleandata.index = range(len(cleandata.values))
    # clean up
    del ptcl1, ptcl2, ptcl3, pt, pt1, pt2, df, df2, means
    # get normalized datasets
    # NOTE(review): positions 60/66/72/78 presumably drop the duplicated
    # 'index' columns created by the reset_index calls above — verify.
    cleandata = cleandata.drop(cleandata.columns[[60, 66, 72, 78]], axis=1)
    normalizedwholeds = normalize(cleandata)
    normalizedperuser = normalizeperuser(cleandata)
    return normalizedwholeds, normalizedperuser, cleandata
n_outputs = 500 X = 3 + 5 * random_state.normal(size=(n_samples, n_outputs)) # Let's compute the sum of the variance in the orignal output space var_origin = np.var(X, axis=0).sum() # Let's compute the variance on a random subspace all_n_components = np.array([1, 50, 100, 200, 400, 500]) n_repetitions = 10 distortion = np.empty((len(all_n_components), n_repetitions)) for i, n_components in enumerate(all_n_components): for j in range(n_repetitions): transformer = GaussianRandomProjection(n_components=n_components, random_state=random_state) X_subspace = transformer.fit_transform(X) distortion[i, j] = np.var(X_subspace, axis=0).sum() / var_origin # Let's plot the distortion as a function of the compression ratio distortion_mean = distortion.mean(axis=1) distortion_std = distortion.std(axis=1) plt.figure() plt.plot(all_n_components / n_outputs, distortion_mean, "o-", color="g") plt.plot(all_n_components / n_outputs, np.ones_like(distortion_mean), "--", color="r") plt.fill_between(all_n_components / n_outputs, distortion_mean - distortion_std, distortion_mean + distortion_std, alpha=0.25, color="g") plt.xlabel("n_components / n_outputs") plt.ylabel('Distortion of the variance on a Gaussian subspace')
def ft_red_select(x, y, choice, no_normalize, dis_kept_features, num_features=30): """ :param 'full_file_name', which is the full path name to the file in question that we wish to do dimensionality reduction on :return: the new reduced 'x' and 'y' components of the file to be later written to a new file """ #Normalize the data if not no_normalize: x = normalize(x) #Given the argument choice of feature selection/reduction, creates the relevant object, fits the 'x' data to it, #and reduces/transforms it to a lower dimensionality new_x = [] print("Original 'x' shape:", np.shape(x)) if choice == "pca": pca = PCA(n_components=num_features) new_x = pca.fit_transform(x) print("Explained variance = " + str(round(sum(pca.explained_variance_) * 100, 2)) + "%") elif choice == "grp": grp = GaussianRandomProjection(n_components=num_features) new_x = grp.fit_transform(x) elif choice == "agglom": agg = FeatureAgglomeration(n_clusters=num_features) new_x = agg.fit_transform(x) elif choice == "thresh": #Below threshold gives ~26 components upon application vt = VarianceThreshold(threshold=0.00015) new_x = vt.fit_transform(x) print("Explained variance = " + str(round(sum(vt.variances_) * 100, 2)) + "%") kept_features = list(vt.get_support(indices=True)) if dis_kept_features: print("Kept features: ") for i in kept_features: print(col_names[i]) elif choice == "rf": y_labels = [1 if s == "D" else 0 for s in y[:, 1]] clf = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1) print("Fitting RF model....") clf.fit(x, y_labels) sfm = SelectFromModel(clf, threshold=-np.inf, max_features=num_features) print("Selecting best features from model...") sfm.fit(x, y_labels) kept_features = list(sfm.get_support(indices=True)) if dis_kept_features: print("Kept features: ") for i in kept_features: print(col_names[i]) new_x = x[:, kept_features] print("Reduced 'x' shape:", np.shape(new_x)) return new_x, y
# tSVD tsvd = TruncatedSVD(n_components=n_comp, random_state=420) tsvd_results = tsvd.fit_transform(train_test_p) # PCA pca = PCA(n_components=n_comp, random_state=420) pca_results = pca.fit_transform(train_test_p) # ICA ica = FastICA(n_components=n_comp, random_state=420) ica_results = ica.fit_transform(train_test_p) # GRP grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420) grp_results = grp.fit_transform(train_test_p) # SRP srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420) srp_results = srp.fit_transform(train_test_p) # save columns list before adding the decomposition components usable_columns = train_test_p.columns print train_test_p.shape print tsvd_results.shape, type(tsvd_results) # # Append decomposition components to datasets for i in range(1, n_comp + 1):
plt.close() Cancer_X, Cancer_y = ReadData.ReadCancerData() Cancer_X = Cancer_X.iloc[:, :-1].values Cancer_X = preprocessing.scale(Cancer_X) Cancer_X_train, Cancer_X_test, Cancer_y_train, Cancer_y_test = train_test_split(Cancer_X, Cancer_y,random_state= 7641, test_size=0.2) dims = range(2,30) aa = defaultdict(dict) for i,dim in product(range(10),dims): rp = GaussianRandomProjection(random_state=i, n_components=dim) aa[dim][i] = DistCorr(rp.fit_transform(Cancer_X_train), Cancer_X_train) aa = pd.DataFrame(aa).T mean_recon = aa.mean(axis=1).tolist() fig, ax1 = plt.subplots() ax1.plot(dims,mean_recon) ax1.set_xlabel('Components') ax1.set_ylabel('Pair Wise DisCorr') plt.grid(linestyle='-', linewidth=1, axis = "x") plt.title("Random Components Pair Wise DisCorr Cancer") plt.savefig('Cancer_RP.png') plt.show() plt.close() Cancer_RP = GaussianRandomProjection(n_components=13,random_state=7641).fit_transform(Cancer_X)
X_agg.head() # In[14]: from sklearn.decomposition import FactorAnalysis fa = FactorAnalysis(n_components=50, random_state=42) X_fa = fa.fit_transform(X) # In[15]: from sklearn.random_projection import GaussianRandomProjection grp = GaussianRandomProjection(n_components=50, random_state=42, eps=0.1) X_grp = grp.fit_transform(X) # In[16]: # from sklearn.decomposition import PCA # pca = PCA(n_components=100, random_state=42) # X_pca = pca.fit_transform(X) # In[17]: # from sklearn.decomposition import FastICA # ica = FastICA(n_components=15, random_state=42) # X_ica = ica.fit_transform(X)
def get_dc_feature(df_train, df_test, n_comp=12, id_column=None, label_column=None): """ 构造分解特征 """ train = df_train.copy() test = df_test.copy() if id_column: train_id = train[id_column] test_id = test[id_column] train = drop_columns(train, [id_column]) test = drop_columns(test, [id_column]) if label_column: train_y = train[label_column] train = drop_columns(train, [label_column]) # tSVD tsvd = TruncatedSVD(n_components=n_comp, random_state=420) tsvd_results_train = tsvd.fit_transform(train) tsvd_results_test = tsvd.transform(test) # PCA pca = PCA(n_components=n_comp, random_state=420) pca2_results_train = pca.fit_transform(train) pca2_results_test = pca.transform(test) # ICA ica = FastICA(n_components=n_comp, random_state=420) ica2_results_train = ica.fit_transform(train) ica2_results_test = ica.transform(test) # GRP grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420) grp_results_train = grp.fit_transform(train) grp_results_test = grp.transform(test) # SRP srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420) srp_results_train = srp.fit_transform(train) srp_results_test = srp.transform(test) # Append decomposition components to datasets for i in range(1, n_comp + 1): train['pca_' + str(i)] = pca2_results_train[:, i - 1] test['pca_' + str(i)] = pca2_results_test[:, i - 1] train['ica_' + str(i)] = ica2_results_train[:, i - 1] test['ica_' + str(i)] = ica2_results_test[:, i - 1] train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1] test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1] train['grp_' + str(i)] = grp_results_train[:, i - 1] test['grp_' + str(i)] = grp_results_test[:, i - 1] train['srp_' + str(i)] = srp_results_train[:, i - 1] test['srp_' + str(i)] = srp_results_test[:, i - 1] if id_column: train[id_column] = train_id test[id_column] = test_id if label_column: train[label_column] = train_y return train, test
# In[155]: n_random_comp = 7 # In[156]: random_proj = GaussianRandomProjection(n_components=n_random_comp) # In[157]: X_random_proj = random_proj.fit_transform(X_scaled) # In[158]: df_random_proj=pd.DataFrame(data=X_random_proj,columns=['Random_projection'+str(i) for i in range(1,n_random_comp+1)]) # ### Running k-means on random projections # In[159]: km_sse= [] km_silhouette = []
tsvd_results_train = tsvd.fit_transform(train_df) tsvd_results_test = tsvd.transform(test_df) # PCA pca = PCA(n_components=n_comp, random_state=420) pca2_results_train = pca.fit_transform(train_df) pca2_results_test = pca.transform(test_df) # ICA ica = FastICA(n_components=n_comp, random_state=420) ica2_results_train = ica.fit_transform(train_df) ica2_results_test = ica.transform(test_df) # GRP grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420) grp_results_train = grp.fit_transform(train_df) grp_results_test = grp.transform(test_df) # SRP srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420) srp_results_train = srp.fit_transform(train_df) srp_results_test = srp.transform(test_df) ############ # write output ############# print("Writing output...") outputTrain = pd.DataFrame() outputTest = pd.DataFrame()
# power_t : double, optional, default 0.5 # max_iter : int, optional, default 200 # shuffle : bool, optional, default True # random_state : int, RandomState instance or None, optional, default None # tol : float, optional, default 1e-4 # early_stopping : bool, default False # validation_fraction : float, optional, default 0.1; only used if early_stopping True clf = GaussianRandomProjection( random_state=0, n_components=20, ) print(clf) X_train = clf.fit_transform(X_train) X_test = clf.fit_transform(X_test) train_results = [] test_results = [] clf = MLPClassifier( random_state=0, hidden_layer_sizes=(100), activation='relu', solver='adam', batch_size=100, early_stopping=True, beta_1=.001, beta_2=.999, )
tsvd_results_train = tsvd.fit_transform(train.drop(["y"], axis=1)) tsvd_results_test = tsvd.transform(test) # PCA pca = PCA(n_components=n_comp, random_state=420) pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1)) pca2_results_test = pca.transform(test) # ICA ica = FastICA(n_components=n_comp, random_state=420) ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1)) ica2_results_test = ica.transform(test) # GRP grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420) grp_results_train = grp.fit_transform(train.drop(["y"], axis=1)) grp_results_test = grp.transform(test) # SRP srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420) srp_results_train = srp.fit_transform(train.drop(["y"], axis=1)) srp_results_test = srp.transform(test) # Append decomposition components to datasets for i in range(1, n_comp+1): train['pca_' + str(i)] = pca2_results_train[:,i-1] test['pca_' + str(i)] = pca2_results_test[:, i-1] train['ica_' + str(i)] = ica2_results_train[:,i-1] test['ica_' + str(i)] = ica2_results_test[:, i-1]
start_time = time.time() # Load the data from income_data import X, y, X_train, X_test, y_train, y_test # Scale the data scaler = StandardScaler() scaler.fit(X) X_train_std = scaler.transform(X) X_test_std = scaler.transform(X) X_toCluster = X_train_std y_inputs = y # Reduce Dimensionality (Randomized Projections) projection = ProjectionAlgorithm(n_components=22) X_toCluster = projection.fit_transform(X_toCluster) ###### # Run k-means clustering with 1:n clusters determine scores for each ###### scores = [] silhouette_avg = [] BIC = [] maxClusters = 100 minClusters = 1 for i in range(minClusters, maxClusters): kmeans = KMeans(n_clusters=i + 1, random_state=0) cluster_labels = kmeans.fit_predict(X_toCluster) scores.append(kmeans.score(X_toCluster)) silhouette_avg.append(silhouette_score(X, cluster_labels)) BIC.append(compute_bic(kmeans, X_toCluster))
from load_mydata import LoadData import math mushroom = LoadData("mushroom") data = scale(mushroom.data) labels = np.array(mushroom.labels) n_samples, n_features = data.shape n_digits = len(np.unique(labels)) n_iter = 1000 print("n_digits: %d, \t n_samples %d, \t n_features %d" % (n_digits, n_samples, n_features)) t0 = time() rp = GaussianRandomProjection(n_components=20) reduced_data = rp.fit_transform(data) print("time spent: %0.3fs" % (time()-t0)) #reduced_data = data # Plot the data fig=plt.figure() #plt.clf() n_plots=9 h = 0.02 x_min, x_max = reduced_data[:, 0].min(), reduced_data[:, 0].max() y_min, y_max = reduced_data[:, 1].min(), reduced_data[:, 1].max() for index in range(1,n_plots+1): vert=math.floor(math.sqrt(n_plots)) hori=n_plots/vert fig.add_subplot(vert,hori,index) i,j = 2*index-2, 2*index-1
# Perform Truncated Singular Value Decomposition (SVD) from sklearn.decomposition import TruncatedSVD as TruncSVD tsvd = TruncSVD(n_components=num_components, algorithm='randomized', random_state=0) tsvd_transformed_data_train = tsvd.fit_transform(sparse_trainData) tsvd_transformed_data_valid = tsvd.transform(sparse_validData) # Perform Randomized Principal Components Analysis (PCA) from sklearn.decomposition import RandomizedPCA as RPCA rpca = RPCA(n_components=num_components) rpca_transformed_data_train = rpca.fit_transform(dense_trainData) rpca_transformed_data_valid = rpca.transform(dense_validData) # Perform Gaussian Random Projection from sklearn.random_projection import GaussianRandomProjection as GaussRan grp = GaussRan(n_components=num_components) grp_transformed_data_train = grp.fit_transform(dense_trainData) grp_transformed_data_valid = grp.transform(dense_validData) # Perform Sparse Random Projection from sklearn.random_projection import SparseRandomProjection as SparseRan srp = SparseRan(n_components=num_components, random_state=0) srp_transformed_data_train = srp.fit_transform(dense_trainData) srp_transformed_data_valid = srp.transform(dense_validData) # Perform classification using 1-Nearest Neighbor Classifier from sklearn.neighbors import KNeighborsClassifier # Create a subset grid to plot performance against numbers of components tsvd_max = tsvd_transformed_data_train.shape[1] plot_subset = [] length_of_plot_subset = len(plot_subset)
import itertools from scipy import linalg import matplotlib as mpl from sklearn import mixture df = pd.read_csv('tic-tac-toe.data', sep=",", skiprows=0) df = np.array(df) print(df.shape, df.dtype) dat = df[:, 0:9] tar1 = df[:, 9] X = dat y = tar1 rp1 = GaussianRandomProjection(n_components=2) X1 = rp1.fit_transform(X) plt.figure() for i in range(len(y)): if y[i] == 0: plt.scatter(X1[i, 0], X1[i, 1], color='r') elif y[i] == 1: plt.scatter(X1[i, 0], X1[i, 1], color='b') plt.title('visualization of data in 2D (rp)-> tic-tac toe dataset') plt.show() r = np.array([7, 17, 37, 57, 77]) plt.figure() for m in range(5): x1 = [] # e1=[]
X = data.iloc[:, :41] y = data.iloc[:, 41] scaler = MinMaxScaler(feature_range=[0, 100]) from sklearn.preprocessing import StandardScaler X_norm = StandardScaler().fit_transform(X) ### pca = PCA(n_components=10, random_state=10) X_r = pca.fit(X).transform(X) X_pca = X_r #### ica = FastICA(n_components=10, random_state=10) X_r = ica.fit(X).transform(X) X_ica = X_r #### rca = GaussianRandomProjection(n_components=10, random_state=10) X_r = rca.fit_transform(X_norm) X_rca = X_r #### svd = SVD(n_components=2) X_r = svd.fit_transform(X_norm) X_svd = X_r clf = MLPClassifier(hidden_layer_sizes=(82, 82, 82), alpha=0.316227766, learning_rate_init=0.016, random_state=0, solver="lbfgs") clusterer = KMeans(n_clusters=10, random_state=10).fit(X_pca) y_kmeans = clusterer.labels_ X_df = pd.DataFrame(X_pca)
def Random_Projection(M, new_dim, prng): proj = GaussianRandomProjection(n_components=new_dim, eps=0.1, random_state=None) return proj.fit_transform(M)
int10 = int10 / max(int10) df_non_obj_feats['binSum'] = df_non_obj_feats.apply(sum, 1) df_non_obj_feats['binDec'] = int10 all_data_proc = pd.concat((df_obj_feats_freq, df_non_obj_feats), axis=1) #%% from sklearn.decomposition import PCA, FastICA from sklearn.random_projection import GaussianRandomProjection from sklearn.random_projection import SparseRandomProjection n_comp = 12 # GRP grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420) grp_results = grp.fit_transform(all_data_proc) # SRP srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420) srp_results = srp.fit_transform(all_data_proc) # PCA pca = PCA(n_components=n_comp, random_state=420) pca_results = pca.fit_transform(all_data_proc) # ICA ica = FastICA(n_components=n_comp, random_state=420) ica_results = ica.fit_transform(all_data_proc) for i in range(1, n_comp+1): all_data_proc['pca_' + str(i)] = pca_results[:,i-1] all_data_proc['ica_' + str(i)] = ica_results[:, i-1]
def gaussianRP(data): rp = GaussianRandomProjection(n_components=new_dimension) return rp.fit_transform(data)