def project(file_name, dimensions):
    # Load an svmlight file, project its features down to `dimensions` with a
    # sparse random projection (density 1/3), and dump the projected data.
    X, y = load_svmlight_file(file_name)
    projector = SparseRandomProjection(n_components=dimensions,
                                       density=1 / 3.0,
                                       dense_output=True)
    projected = projector.fit_transform(X)
    new_file_name = file_name[:-4] + '-' + str(dimensions) + '.mat'
    # close the output file when done
    with open(new_file_name, 'wb') as new_file:
        dump_svmlight_file(projected, y, new_file)
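# A small companion sketch (not part of the function above): the
# Johnson-Lindenstrauss bound can be used to sanity-check the `dimensions`
# argument before projecting. The sample count below is an assumption for
# illustration only.
from sklearn.random_projection import johnson_lindenstrauss_min_dim

# minimum number of components needed to keep pairwise distances of
# 100000 samples within ~10% distortion
print(johnson_lindenstrauss_min_dim(n_samples=100000, eps=0.1))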
def plotProjection(data, n_samples, n_features):
    n_components_range = np.array([300, 1000, 10000])
    dists = euclidean_distances(data, squared=True).ravel()
    # select only non-identical samples pairs
    nonzero = dists != 0
    dists = dists[nonzero]
    for n_components in n_components_range:
        t0 = time()
        rp = SparseRandomProjection(n_components=n_components)
        projected_data = rp.fit_transform(data)
        print("Projected %d samples from %d to %d in %.3fs"
              % (n_samples, n_features, n_components, time() - t0))
        if hasattr(rp, 'components_'):
            n_bytes = rp.components_.data.nbytes
            n_bytes += rp.components_.indices.nbytes
            print("Random matrix with size: %.3fMB" % (n_bytes / 1e6))
        projected_dists = euclidean_distances(projected_data, squared=True)
        projected_dists = projected_dists.ravel()[nonzero]
        rates = projected_dists / dists
        print("Mean distances rate: %.2f (%.2f)"
              % (np.mean(rates), np.std(rates)))
        plotHexbin(dists, projected_dists, n_components)
        plotHist(rates, n_components)
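# plotHexbin and plotHist are not defined in this snippet; a minimal sketch of
# what such helpers might look like (names and signatures are assumptions
# matching the calls above).
import numpy as np
import matplotlib.pyplot as plt


def plotHexbin(dists, projected_dists, n_components):
    # original vs. projected pairwise squared distances
    plt.figure()
    plt.hexbin(dists, projected_dists, gridsize=100, cmap=plt.cm.PuBu)
    plt.xlabel("Pairwise squared distances in original space")
    plt.ylabel("Pairwise squared distances in projected space")
    plt.title("Pairwise distances projection with n_components=%d" % n_components)


def plotHist(rates, n_components):
    # distribution of projected/original squared-distance ratios
    plt.figure()
    plt.hist(rates, bins=50, range=(0., 2.), edgecolor='k', density=True)
    plt.xlabel("Squared distances rate: projected / original")
    plt.ylabel("Distribution of samples pairs")
    plt.title("Histogram of distance rates for n_components=%d" % n_components)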
def test_SparseRandomProjection_output_representation():
    for SparseRandomProjection in all_SparseRandomProjection:
        # when using sparse input, the projected data can be forced to be a
        # dense numpy array
        rp = SparseRandomProjection(n_components=10, dense_output=True,
                                    random_state=0)
        rp.fit(data)
        assert isinstance(rp.transform(data), np.ndarray)
        sparse_data = sp.csr_matrix(data)
        assert isinstance(rp.transform(sparse_data), np.ndarray)

        # the output can be left as a sparse matrix instead
        rp = SparseRandomProjection(n_components=10, dense_output=False,
                                    random_state=0)
        rp = rp.fit(data)
        # output for dense input will stay dense:
        assert isinstance(rp.transform(data), np.ndarray)
        # output for sparse input will be sparse:
        assert sp.issparse(rp.transform(sparse_data))
def create_sector_subset(sample_n, X_output_path, Y_output_path):
    X_path = "/cs/puls/Experiments/hxiao-test/feature-data.mat"
    Y_path = "/cs/puls/Experiments/hxiao-test/label-data.mat"
    X = loadmat(X_path)["featureData"]
    Y = loadmat(Y_path)["labelData"]

    print "Applying random projection to reduce dimension"
    print "Shape before: %r" % (X.shape,)
    transformer = SparseRandomProjection(random_state=0)
    X = transformer.fit_transform(X)
    print "Shape after: %r" % (X.shape,)
    print "Random projection: ON"

    rng = np.random.RandomState(0)
    print "Sample size: %d" % sample_n
    rows = rng.permutation(X.shape[0])[:sample_n]
    X = X[rows, :]
    Y = Y[rows, :]

    # pickle in binary mode ('wb' rather than 'w')
    dump(X, open(X_output_path, "wb"))
    dump(Y, open(Y_output_path, "wb"))
def main(): global global_gen_data global total_length with open('feature_select_list.pkl', 'r') as f: feature_select_list = pickle.load(f) #pdb.set_trace() cores = multiprocessing.cpu_count() #21 for file_number in xrange(1): with open('../order_100_data/order_data_chunk_' + str(file_number), 'r') as f: file_list = f.readlines() print('read done:' + str(file_number)) get_all_label(file_list) # cores = multiprocessing.cpu_count() # pool = multiprocessing.Pool(processes=(cores-2)) #pdb.set_trace() #print('length: ',len(all_label_result['usercategories'])) cut_num = 2000 control_feature_length(cut_num) #save_pickle(all_label_result,'all_label.pkl') #pdb.set_trace() for feature in total_list: enc, one_hot = get_all_onehot(feature, list(all_label_result[feature])) all_label_encoder[feature].extend([enc, one_hot]) # rewards = [] # items_id = [] # uin = [] # for file_number in range(2,16): # with open('../order_100_event_data/order_data_id_label_chunk_' + str(file_number), 'r') as f: # file_list = f.readlines() # #pdb.set_trace() # for line in file_list: # line_list = line.split('\t') # #if len(line_list) < 3: # #print(line_list) # rewards.append(line_list[1]) # items_id.append(line_list[0]) # uin.append(line_list[2].strip('\n')) for line in cross_lines: cross_feat = line.strip().split() feat_a = cross_feat[0] feat_b = cross_feat[1] total_length += (feature_length_result[feat_a] * feature_length_result[feat_b]) srp = SparseRandomProjection(n_components=1000) print('total_d_length', total_length) for file_number in xrange(0, 4): rewards = [] items_id = [] uin = [] with open( '../order_new_pool_data/order_data_id_label_chunk_' + str(file_number), 'r') as f: file_list = f.readlines() #pdb.set_trace() for line in file_list: line_list = line.split('\t') #if len(line_list) < 3: #print(line_list) rewards.append(line_list[1]) items_id.append(line_list[0]) uin.append(line_list[2].strip('\n')) with open( '../order_new_pool_data/order_data_chunk_' + str(file_number), 'r') as f: file_list = f.readlines() #pdb.set_trace() gen_data = generate_key_value_data(file_list) with open('../order_new_pool_data/length_chunk_' + str(file_number), 'r') as f: cut_pool_list = pickle.load(f) #gen_data = gen_data[0:100] print('start file: ' + str(file_number)) print('number chunk', len(cut_pool_list) / 4000) chunk_file_number = len(cut_pool_list) / 4000 pdb.set_trace() cut_start_flag = 0 for block_num in range(chunk_file_number): print('-------------------------------') print('strat block: ' + str(block_num + 1)) cut_pool = cut_pool_list[block_num * 4000:(block_num + 1) * 4000] cut_end = sum(cut_pool) print('chunk_range: ', cut_start_flag, cut_end + cut_start_flag) data_todeal = gen_data[cut_start_flag:(cut_end + cut_start_flag)] rewards_todeal = rewards[cut_start_flag:(cut_end + cut_start_flag)] items_todeal = items_id[cut_start_flag:(cut_end + cut_start_flag)] uin_todeal = uin[cut_start_flag:(cut_end + cut_start_flag)] cut_start_flag += cut_end pdb.set_trace()
def DecomposedFeatures(train, test, total, addtrain, addtest,
                       use_pca=0.0, use_tsvd=0.0, use_ica=0.0, use_fa=0.0,
                       use_grp=0.0, use_srp=0.0, use_pls=0.0):
    print("\nStart decomposition process...")
    train_decomposed = [addtrain]
    test_decomposed = [addtest]

    if use_pca > 0.0:
        print("PCA")
        N_COMP = int(use_pca * train.shape[1]) + 1
        pca = PCA(n_components=N_COMP, whiten=True, svd_solver="full",
                  random_state=42)
        pca.fit(total)
        pca_results_train = pca.transform(train)
        pca_results_test = pca.transform(test)
        # list.append returns None, so do not reassign the list
        train_decomposed.append(pca_results_train)
        test_decomposed.append(pca_results_test)

    if use_tsvd > 0.0:
        print("tSVD")
        N_COMP = int(use_tsvd * train.shape[1]) + 1
        tsvd = TruncatedSVD(n_components=N_COMP, random_state=42)
        tsvd.fit(total)
        tsvd_results_train = tsvd.transform(train)
        tsvd_results_test = tsvd.transform(test)
        train_decomposed.append(tsvd_results_train)
        test_decomposed.append(tsvd_results_test)

    if use_ica > 0.0:
        print("ICA")
        N_COMP = int(use_ica * train.shape[1]) + 1
        ica = FastICA(n_components=N_COMP, random_state=42)
        ica.fit(total)
        ica_results_train = ica.transform(train)
        ica_results_test = ica.transform(test)
        # append the ICA features (not the list itself)
        train_decomposed.append(ica_results_train)
        test_decomposed.append(ica_results_test)

    if use_fa > 0.0:
        print("FA")
        N_COMP = int(use_fa * train.shape[1]) + 1
        fa = FactorAnalysis(n_components=N_COMP, random_state=42)
        fa.fit(total)
        fa_results_train = fa.transform(train)
        fa_results_test = fa.transform(test)
        train_decomposed.append(fa_results_train)
        test_decomposed.append(fa_results_test)

    if use_grp > 0.0:
        print("GRP")
        N_COMP = int(use_grp * train.shape[1]) + 1
        grp = GaussianRandomProjection(n_components=N_COMP, eps=0.1,
                                       random_state=42)
        grp.fit(total)
        grp_results_train = grp.transform(train)
        grp_results_test = grp.transform(test)
        train_decomposed.append(grp_results_train)
        test_decomposed.append(grp_results_test)

    if use_srp > 0.0:
        print("SRP")
        N_COMP = int(use_srp * train.shape[1]) + 1
        srp = SparseRandomProjection(n_components=N_COMP, dense_output=True,
                                     random_state=42)
        srp.fit(total)
        srp_results_train = srp.transform(train)
        srp_results_test = srp.transform(test)
        train_decomposed.append(srp_results_train)
        test_decomposed.append(srp_results_test)

    if use_pls > 0.0:
        print("PLS")
        #N_COMP = int(use_pls * train.shape[1]) + 1
        #pls = PLSCanonical(n_components=N_COMP)
        #pls.fit(total)
        #pls_results_train = pls.transform(train)
        #pls_results_test = pls.transform(test)
        #train_decomposed = np.concatenate([pls_results_train, train_decomposed], axis=1)
        #test_decomposed = np.concatenate([pls_results_test, test_decomposed], axis=1)

    print("Append decomposition components together...")
    train_decomposed = np.concatenate(train_decomposed, axis=1)
    test_decomposed = np.concatenate(test_decomposed, axis=1)

    train_with_only_decomposed_features = pd.DataFrame(train_decomposed)
    test_with_only_decomposed_features = pd.DataFrame(test_decomposed)

    #for agg_col in ['sum', 'var', 'mean', 'median', 'std', 'weight_count',
    #                'count_non_0', 'num_different', 'max', 'min']:
    #    train_with_only_decomposed_features[agg_col] = train[agg_col]
    #    test_with_only_decomposed_features[agg_col] = test[agg_col]

    # Remove any NA
    train_with_only_decomposed_features = train_with_only_decomposed_features.fillna(0)
    test_with_only_decomposed_features = test_with_only_decomposed_features.fillna(0)
    return train_with_only_decomposed_features, test_with_only_decomposed_features
n_features=256, sep=',', header=None) wineX = StandardScaler().fit_transform(wineX) digitX = StandardScaler().fit_transform(digitX) clusters = [2, 5, 10, 15, 20, 25, 30, 35, 40] dims = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60] dims_wine = [i for i in range(2, 12)] # data for 1 tmp = defaultdict(dict) for i, dim in product(range(10), dims_wine): rp = SparseRandomProjection(random_state=i, n_components=dim) tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(wineX), wineX) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'wine scree1.csv') tmp = defaultdict(dict) for i, dim in product(range(10), dims): rp = SparseRandomProjection(random_state=i, n_components=dim) tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(digitX), digitX) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'digit scree1.csv') tmp = defaultdict(dict) for i, dim in product(range(10), dims_wine): rp = SparseRandomProjection(random_state=i, n_components=dim) rp.fit(wineX)
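# pairwiseDistCorr is assumed to come from a local helpers module; a minimal
# sketch consistent with how it is called above: correlation between the
# flattened pairwise-distance matrices of the projected and original data.
import numpy as np
from sklearn.metrics import pairwise_distances


def pairwiseDistCorr(X1, X2):
    assert X1.shape[0] == X2.shape[0]
    d1 = pairwise_distances(X1)
    d2 = pairwise_distances(X2)
    return np.corrcoef(d1.ravel(), d2.ravel())[0, 1]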
def main(): runit = 1 if runit: run = assignment4() run.read_data_voice('voice.csv') run.dataSetName = 'Voice' run.split_data_to_train_test(testSize=0.3) dataX = StandardScaler().fit_transform(run.allFeatures) ''' run.PCA() run.ICA() run.RP() ''' run.TSVD() run.k_mean_cluster() run.expectation_maximization() pcaCom = 15 icaCom = 15 rpCom = 15 tsvdCom = 15 k = 2 reducedDataPCA = PCA(n_components=pcaCom, random_state=5).fit_transform(dataX) run.k_mean_cluster_reduced(k, reducedDataPCA, 'PCA') run.expectation_maximization_reduced(k, reducedDataPCA, 'PCA') reducedDataICA = FastICA(n_components=icaCom, random_state=5).fit_transform(dataX) run.k_mean_cluster_reduced(k, reducedDataICA, 'ICA') run.expectation_maximization_reduced(k, reducedDataICA, 'ICA') reducedDataRP = SparseRandomProjection( n_components=rpCom, random_state=5).fit_transform(dataX) run.k_mean_cluster_reduced(k, reducedDataRP, 'RP') run.expectation_maximization_reduced(k, reducedDataRP, 'RP') reducedDataTSVD = TruncatedSVD( random_state=5, n_components=tsvdCom).fit_transform(dataX) run.k_mean_cluster_reduced(k, reducedDataTSVD, 'TSVD') run.expectation_maximization_reduced(k, reducedDataTSVD, 'TSVD') run_hapt = assignment4() run_hapt.read_data_haptX('HAPT_X.csv') run_hapt.read_data_haptY('HAPT_Y.csv') run_hapt.dataSetName = 'HAPT' dataX = StandardScaler().fit_transform(run_hapt.allFeatures) run_hapt.kNum = range(1, 20, 5) run_hapt.pcaDims = range(1, 561, 25) run_hapt.icaDims = range(1, 561, 25) run_hapt.rpDims = range(1, 561, 25) run_hapt.tvsdDims = range(1, 561, 25) #run_hapt.k_mean_cluster() run_hapt.expectation_maximization() run_hapt.PCA() run_hapt.ICA() run_hapt.RP() run_hapt.TSVD() pcaCom = 15 icaCom = 15 rpCom = 15 tsvdCom = 15 k = 2 reducedDataPCA = PCA(n_components=pcaCom, random_state=5).fit_transform(dataX) run_hapt.k_mean_cluster_reduced(k, reducedDataPCA, 'PCA') run_hapt.expectation_maximization_reduced(k, reducedDataPCA, 'PCA') reducedDataICA = FastICA(n_components=icaCom, random_state=5).fit_transform(dataX) run_hapt.k_mean_cluster_reduced(k, reducedDataICA, 'ICA') run_hapt.expectation_maximization_reduced(k, reducedDataICA, 'ICA') reducedDataRP = SparseRandomProjection(n_components=rpCom, random_state=5).fit_transform(dataX) run_hapt.k_mean_cluster_reduced(k, reducedDataRP, 'RP') run_hapt.expectation_maximization_reduced(k, reducedDataRP, 'RP') reducedDataTSVD = TruncatedSVD(random_state=5, n_components=tsvdCom).fit_transform(dataX) run_hapt.k_mean_cluster_reduced(k, reducedDataTSVD, 'TSVD') run_hapt.expectation_maximization_reduced(k, reducedDataTSVD, 'TSVD') print("All done") plt.show()
def sparseRP(data):
    rp = SparseRandomProjection(n_components=new_dimension)
    return rp.fit_transform(data)
def _get_projection(n_samples, n_features, density='auto', eps=0.1):
    p = SparseRandomProjection(density=density, eps=eps)
    mat = csr_matrix((n_samples, n_features))
    return p.fit(mat)
random_state=42, max_iter=1000, tol=.008) ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1)) ica2_results_test = ica.transform(test) # GRP grp = GaussianRandomProjection(n_components=n_grp, eps=0.1, random_state=42) grp_results_train = grp.fit_transform(train.drop(["y"], axis=1)) grp_results_test = grp.transform(test) # SRP srp = SparseRandomProjection(n_components=n_srp, dense_output=True, random_state=42) srp_results_train = srp.fit_transform(train.drop(["y"], axis=1)) srp_results_test = srp.transform(test) # save columns list before adding the decomposition components usable_columns = list(set(train.columns) - set(['y'])) # Append decomposition components to datasets print("Append PCA components to datasets...") for i in range(1, n_pca + 1): train['pca_' + str(i)] = pca2_results_train[:, i - 1] test['pca_' + str(i)] = pca2_results_test[:, i - 1] print("Append ICA components to datasets...") for i in range(1, n_ica + 1):
def gen_features(train, val, test): train = pd.DataFrame(train) val = pd.DataFrame(val) test = pd.DataFrame(test) # cat_cols = ['city', 'bd', 'gender', 'registered_via', 'registration_init_year', # 'registration_init_month', 'registration_init_date', 'payment_method_id', 'payment_plan_days', # 'plan_list_price', 'actual_amount_paid', 'is_auto_renew', 'is_cancel', # 'transaction_date_year', 'transaction_date_month', 'transaction_date_date', # 'membership_expire_date_year', # 'membership_expire_date_month', 'membership_expire_date_date', 'membership_transaction_gap', # 'cancel_times', # 'auto_renew_count', 'plan_net_worth', 'user_date_year', 'user_date_month', # 'user_date_date'] # con_cols = [x for x in train.columns if x not in cat_cols and x not in ['msno', 'is_churn']] # train[cat_cols] = train[cat_cols].astype('object') # test[cat_cols] = test[cat_cols].astype('object') # val[cat_cols] = val[cat_cols].astype('object') # # for col in cat_cols: # train[col].fillna(value=train[col].mode()[0], inplace=True) # test[col].fillna(value=test[col].mode()[0], inplace=True) # val[col].fillna(value=val[col].mode()[0], inplace=True) # for col in con_cols: # train[col].fillna(value=train[col].mean(), inplace=True) # test[col].fillna(value=test[col].mean(), inplace=True) # val[col].fillna(value=val[col].mean(), inplace=True) # # for c in train.columns: # if train[c].dtype == 'object': # lbl = LabelEncoder() # lbl.fit(list(train[c].values) + list(test[c].values)) # train[c] = lbl.transform(list(train[c].values)) # test[c] = lbl.transform(list(test[c].values)) n_comp = 15 drop_list = [] test_drop_list = [] print(train.drop(drop_list, axis=1).shape, test.drop(test_drop_list, axis=1).shape) print('tSVD', datetime.now() - start) # tSVD tsvd = TruncatedSVD(n_components=n_comp) tsvd_results_train = tsvd.fit_transform(train.drop(drop_list, axis=1)) tsvd_results_val= tsvd.transform(val.drop(test_drop_list, axis=1)) tsvd_results_test = tsvd.transform(test.drop(test_drop_list, axis=1)) print('PCA', datetime.now() - start) # PCA pca = PCA(n_components=n_comp) pca2_results_train = pca.fit_transform(train.drop(drop_list, axis=1)) pca2_results_val = pca.transform(val.drop(test_drop_list, axis=1)) pca2_results_test = pca.transform(test.drop(test_drop_list, axis=1)) print('ICA', datetime.now() - start) # ICA ica = FastICA(n_components=n_comp, max_iter=10000) ica2_results_train = ica.fit_transform(train.drop(drop_list, axis=1)) ica2_results_val = ica.transform(val.drop(test_drop_list, axis=1)) ica2_results_test = ica.transform(test.drop(test_drop_list, axis=1)) print('GRP', datetime.now() - start) # GRP grp = GaussianRandomProjection(n_components=n_comp, eps=0.1) grp_results_train = grp.fit_transform(train.drop(drop_list, axis=1)) grp_results_val = grp.transform(val.drop(test_drop_list, axis=1)) grp_results_test = grp.transform(test.drop(test_drop_list, axis=1)) print('SRP', datetime.now() - start) # SRP srp = SparseRandomProjection(n_components=n_comp, dense_output=True) srp_results_train = srp.fit_transform(train.drop(drop_list, axis=1)) srp_results_val = srp.transform(val.drop(test_drop_list, axis=1)) srp_results_test = srp.transform(test.drop(test_drop_list, axis=1)) # MCA # res_mca = MCA(train, ncp=n_comp, graph = FALSE) # save columns list before adding the decomposition components usable_columns = list(set(train.columns) - set(drop_list)) # Append decomposition components to datasets for i in range(1, n_comp + 1): train['pca_' + str(i)] = pca2_results_train[:, i - 1] val['pca_' + str(i)] = pca2_results_val[:, i - 
1] test['pca_' + str(i)] = pca2_results_test[:, i - 1] train['ica_' + str(i)] = ica2_results_train[:, i - 1] val['ica_' + str(i)] = ica2_results_val[:, i - 1] test['ica_' + str(i)] = ica2_results_test[:, i - 1] train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1] val['tsvd_' + str(i)] = tsvd_results_val[:, i - 1] test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1] train['grp_' + str(i)] = grp_results_train[:, i - 1] val['grp_' + str(i)] = grp_results_val[:, i - 1] test['grp_' + str(i)] = grp_results_test[:, i - 1] train['srp_' + str(i)] = srp_results_train[:, i - 1] val['srp_' + str(i)] = srp_results_val[:, i - 1] test['srp_' + str(i)] = srp_results_test[:, i - 1] return train, val, test
def gen_feature(train, test): train = pd.DataFrame(train) test = pd.DataFrame(test) n_comp = 15 drop_list = [] test_drop_list = [] print(train.drop(drop_list, axis=1).shape, test.drop(test_drop_list, axis=1).shape) print('tSVD', datetime.now() - start) # tSVD tsvd = TruncatedSVD(n_components=n_comp) tsvd_results_train = tsvd.fit_transform(train.drop(drop_list, axis=1)) tsvd_results_test = tsvd.transform(test.drop(test_drop_list, axis=1)) print('PCA', datetime.now() - start) # PCA pca = PCA(n_components=n_comp) pca2_results_train = pca.fit_transform(train.drop(drop_list, axis=1)) pca2_results_test = pca.transform(test.drop(test_drop_list, axis=1)) print('ICA', datetime.now() - start) # ICA ica = FastICA(n_components=n_comp, max_iter=10000) ica2_results_train = ica.fit_transform(train.drop(drop_list, axis=1)) ica2_results_test = ica.transform(test.drop(test_drop_list, axis=1)) print('GRP', datetime.now() - start) # GRP grp = GaussianRandomProjection(n_components=n_comp, eps=0.1) grp_results_train = grp.fit_transform(train.drop(drop_list, axis=1)) grp_results_test = grp.transform(test.drop(test_drop_list, axis=1)) print('SRP', datetime.now() - start) # SRP srp = SparseRandomProjection(n_components=n_comp, dense_output=True) srp_results_train = srp.fit_transform(train.drop(drop_list, axis=1)) srp_results_test = srp.transform(test.drop(test_drop_list, axis=1)) # MCA # res_mca = MCA(train, ncp=n_comp, graph = FALSE) # save columns list before adding the decomposition components usable_columns = list(set(train.columns) - set(drop_list)) # Append decomposition components to datasets for i in range(1, n_comp + 1): train['pca_' + str(i)] = pca2_results_train[:, i - 1] test['pca_' + str(i)] = pca2_results_test[:, i - 1] train['ica_' + str(i)] = ica2_results_train[:, i - 1] test['ica_' + str(i)] = ica2_results_test[:, i - 1] train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1] test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1] train['grp_' + str(i)] = grp_results_train[:, i - 1] test['grp_' + str(i)] = grp_results_test[:, i - 1] train['srp_' + str(i)] = srp_results_train[:, i - 1] test['srp_' + str(i)] = srp_results_test[:, i - 1] return train, test
def _get_projection(n_samples, n_features, density="auto", eps=0.1): p = SparseRandomProjection() mat = lil_matrix((n_samples, n_features)) return p.fit(mat)
# Perform Randomized Principal Components Analysis (PCA)
# (RandomizedPCA was removed from sklearn; PCA with the randomized solver is
# the equivalent replacement)
from sklearn.decomposition import PCA
rpca = PCA(n_components=num_components, svd_solver='randomized')
rpca_transformed_data_train = rpca.fit_transform(dense_trainData)
rpca_transformed_data_valid = rpca.transform(dense_validData)

# Perform Gaussian Random Projection
from sklearn.random_projection import GaussianRandomProjection as GaussRan
grp = GaussRan(n_components=num_components)
grp_transformed_data_train = grp.fit_transform(dense_trainData)
grp_transformed_data_valid = grp.transform(dense_validData)

# Perform Sparse Random Projection
from sklearn.random_projection import SparseRandomProjection as SparseRan
srp = SparseRan(n_components=num_components, random_state=0)
srp_transformed_data_train = srp.fit_transform(dense_trainData)
srp_transformed_data_valid = srp.transform(dense_validData)

# Perform classification using 1-Nearest Neighbor Classifier
from sklearn.neighbors import KNeighborsClassifier

# Create a subset grid to plot performance against numbers of components
tsvd_max = tsvd_transformed_data_train.shape[1]
plot_subset = []
length_of_plot_subset = len(plot_subset)
if tsvd_max < 101:
    spacing = super_fine_spacing
    plot_subset = []
    for j in arange(1, spacing - 1):
        plot_subset.append(j)
import matplotlib.pyplot as plt import numpy as np from sklearn.cluster import KMeans breast_cancer = pd.read_csv("./breast-cancer-wisconsin.csv") li = list(breast_cancer) breast_cancer = pd.DataFrame(breast_cancer.values, columns=li) Class = li[-1] arr = breast_cancer.values y = arr[:, -1] X = arr[:, 0:-1] clusters = range(2, 15) sp = SparseRandomProjection(n_components=4) output = sp.fit_transform(X) tester = em.ExpectationMaximizationTestCluster(output, y, clusters=range(2, 15), plot=False, stats=True) silhouette_EM, vmeasure_scores = tester.run() tester = kmtc.KMeansTestCluster(output, y, clusters=range(2, 15), plot=False, stats=True) silhouette_kmeans, V_measure = tester.run()
#tmp.to_csv(out+'diamonds scree3.csv') #tmp = defaultdict(dict) #for i,dim in product(range(10),dims1): # rp = GaussianRandomProjection(random_state=i, n_components=dim) # rp.fit(diamondsX) # tmp[dim][i] = reconstructionError(rp, diamondsX) # print (dim, "scree4") #tmp =pd.DataFrame(tmp).T #tmp.to_csv(out+'diamonds scree4.csv') #%% task 2 tmp = defaultdict(dict) for i, dim in product(range(10), dims1): rp = SparseRandomProjection(random_state=i, n_components=dim) tmp[dim][i] = pairwiseDistCorr_chunked(rp.fit_transform(diamondsX), diamondsX) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'diamonds scree1.csv') tmp = defaultdict(dict) for i, dim in product(range(10), dims2): rp = SparseRandomProjection(random_state=i, n_components=dim) tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(digitsX), digitsX) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'digits scree1.csv') tmp = defaultdict(dict) for i, dim in product(range(10), dims1): rp = SparseRandomProjection(random_state=i, n_components=dim)
def select_features_SparseRandomProjections(train_X, train_y, test_X, k):
    selector = SparseRandomProjection(n_components=k, random_state=42)
    selector.fit(train_X)
    train_X = selector.transform(train_X)
    test_X = selector.transform(test_X)
    return train_X, test_X
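# Hypothetical usage of the helper above (the shapes and the unused train_y
# argument are assumptions for illustration):
import numpy as np

rng = np.random.RandomState(0)
train_X_demo = rng.rand(200, 100)
test_X_demo = rng.rand(50, 100)
train_red, test_red = select_features_SparseRandomProjections(
    train_X_demo, None, test_X_demo, k=20)
print(train_red.shape, test_red.shape)  # (200, 20) (50, 20)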
# In[ ]: def distance_correlation(X1, X2): assert X1.shape[0] == X2.shape[0] return np.corrcoef( pairwise_distances(X1).ravel(), pairwise_distances(X2).ravel())[0, 1] # In[ ]: tmp = defaultdict(dict) for i, dim in product(range(10), dimensions): rp = SparseRandomProjection(random_state=i, n_components=dim) tmp[dim][i] = distance_correlation(rp.fit_transform(X_train), X_train) tmp = pd.DataFrame(tmp).T tmp.to_csv('./P2/IncomeRP_DistanceCorrelation.csv') # In[ ]: # Run Neural Networks rp = SparseRandomProjection(random_state=5) nn_results, clf = run_ann(dimensions, rp, X_train, Y_train) nn_results.to_csv('./P2/IncomeRP_ANN.csv') ## test score test_score = clf.score(X_test, Y_test) print("Test Accuracy = ", test_score)
X_path = '/cs/puls/Experiments/hxiao-test/feature-data.mat' Y_path = '/cs/puls/Experiments/hxiao-test/label-data.mat' X = loadmat(X_path)['featureData'] y = loadmat(Y_path)['labelData'] RANDOM_PROJECTION_FLAG = True if RANDOM_PROJECTION_FLAG: from sklearn.random_projection import SparseRandomProjection print "Applying random projection to reduce dimension" print "Shape before: %r" % (X.shape, ) transformer = SparseRandomProjection() X = transformer.fit_transform(X) print "Shape after: %r" % (X.shape, ) # sample subset of all the data rng = np.random.RandomState(0) sample_n = 10000 rows = rng.permutation(X.shape[0])[:sample_n] X = X[rows, :] y = y[rows, :] # sample train and test train_ratio = 0.8 train_n = int(sample_n*train_ratio)
df_non_obj_feats['binDec'] = int10 all_data_proc = pd.concat((df_obj_feats_freq, df_non_obj_feats), axis=1) #%% from sklearn.decomposition import PCA, FastICA from sklearn.random_projection import GaussianRandomProjection from sklearn.random_projection import SparseRandomProjection n_comp = 12 # GRP grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420) grp_results = grp.fit_transform(all_data_proc) # SRP srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420) srp_results = srp.fit_transform(all_data_proc) # PCA pca = PCA(n_components=n_comp, random_state=420) pca_results = pca.fit_transform(all_data_proc) # ICA ica = FastICA(n_components=n_comp, random_state=420) ica_results = ica.fit_transform(all_data_proc) for i in range(1, n_comp+1): all_data_proc['pca_' + str(i)] = pca_results[:,i-1] all_data_proc['ica_' + str(i)] = ica_results[:, i-1] all_data_proc['grp_' + str(i)] = grp_results[:,i-1] all_data_proc['srp_' + str(i)] = srp_results[:, i-1]
# In[40]: X_agg.head() # In[41]: from sklearn.decomposition import FactorAnalysis fa = FactorAnalysis(n_components=50, random_state=42) X_fa = fa.fit_transform(X) # In[42]: from sklearn.random_projection import SparseRandomProjection srp = SparseRandomProjection(n_components=50, random_state=42) X_srp = srp.fit_transform(X) # In[43]: from sklearn.random_projection import GaussianRandomProjection grp = GaussianRandomProjection(n_components=50, random_state=42, eps=0.1) X_grp = grp.fit_transform(X) # In[60]: from sklearn.decomposition import PCA pca = PCA(n_components=100, random_state=42) X_pca = pca.fit_transform(X)
class assignment4: def __init__(self): # data processing self.dataSetPath = './data_set/' self.dataSetName = "" self.csv_delimiter = ',' self.data = None self.allFeatures = [] self.allTarget = [] # not used self.XTrain = None self.XTest = None self.YTrain = None self.YTest = None # k-mean clustering self.kNum = range(1, 21) self.kmean = None self.kmeanRD = None # expectation maximization self.em = None self.emRD = None # PCA self.pca = None self.pcaDims = range(1, 21) # ICA self.icaDims = range(1, 21) self.ica = None # RP self.rp = None self.rpDims = range(1, 21) # TSVD self.tsvd = None self.tsvdDims = range(1, 10) def read_data_voice(self, dataName): with open(self.dataSetPath + dataName, 'r', encoding="utf8") as file: reader = csv.reader(file, delimiter=self.csv_delimiter) self.data = list(reader) print("Reading data set: '{}'".format(self.dataSetPath + dataName)) print('Number of instances: {}'.format(len(self.data))) print('Number of attributes: {}'.format(len(self.data[0]) - 1)) def read_data_haptX(self, dataName): self.data = None with open(self.dataSetPath + dataName, 'r', encoding="utf8") as file: reader = csv.reader(file, delimiter=',') self.data = list(reader) print(len(self.data)) for elim in self.data: feature = [] for i in elim: feature.append(i) self.allFeatures.append(feature) print("Reading data set: '{}'".format(self.dataSetPath + dataName)) print('Number of instances: {}'.format(len(self.allFeatures))) print('Number of attributes: {}'.format(len(self.allFeatures[0]))) def read_data_haptY(self, dataName): self.data = None with open(self.dataSetPath + dataName, 'r', encoding="utf8") as file: reader = csv.reader(file, delimiter=',') self.data = list(reader) for elim in self.data: self.allTarget.append(elim) print("Reading data set: '{}'".format(self.dataSetPath + dataName)) print('Number of instances: {}'.format(len(self.allTarget))) print('Number of attributes: {}'.format(len(self.allTarget[0]))) self.allFeatures = np.asarray(self.allFeatures, dtype=np.float32) self.allTarget = np.asarray(self.allTarget, dtype=np.float32) self.allTarget = self.allTarget.ravel() def split_data_to_train_test(self, testSize=0.3): # in case the data set are very different in format sample_len = len(self.data[0]) for elem in self.data: feature = elem[0:sample_len - 1] feature_vector = [] for f in feature: feature_vector.append(float(f)) self.allFeatures.append(feature_vector) if elem[-1] == '0': val = 0 else: val = 1 self.allTarget.append((float(val))) self.allFeatures = np.asarray(self.allFeatures, dtype=np.float32) self.allTarget = np.asarray(self.allTarget, dtype=np.float32) self.XTrain, self.XTest, self.YTrain, self.YTest = train_test_split( self.allFeatures, self.allTarget, test_size=testSize, random_state=42) print( 'Total X train data -> {}%'.format( int((len(self.XTrain) / len(self.data)) * 100)), 'Size:', len(self.XTrain)) print( 'Total X test data -> {}%'.format( int((len(self.XTest) / len(self.data)) * 100)), 'Size:', len(self.XTest)) print( 'Total Y train data -> {}%'.format( int((len(self.YTrain) / len(self.data)) * 100)), 'Size:', len(self.YTrain)) print( 'Total Y test data -> {}%'.format( int((len(self.YTest) / len(self.data)) * 100)), 'Size', len(self.YTest)) def get_max_idx(self, input): maxVal = input[0] maxIdx = 0 for i in range(1, len(input)): if input[i] > maxVal: maxIdx = i maxVal = input[i] return maxIdx def pairwiseDistCorr(self, X1, X2): assert X1.shape[0] == X2.shape[0] d1 = pairwise_distances(X1) d2 = pairwise_distances(X2) return np.corrcoef(d1.ravel(), d2.ravel())[0, 
1] def k_mean_cluster(self): print("-" * 50) print('{}: K-mean clustering'.format(self.dataSetName)) dataX = StandardScaler().fit_transform(self.allFeatures) scores = [] confusionMatrix = [] self.kmean = KMeans(random_state=5, max_iter=1000) for i in self.kNum: self.kmean.set_params(n_clusters=i) self.kmean.fit(dataX) scores.append(sm.accuracy_score(self.allTarget, self.kmean.labels_)) confusionMatrix.append( sm.confusion_matrix(self.allTarget, self.kmean.labels_)) bestScoreIdx = self.get_max_idx(scores) print("Accuracy score:{0:.2f}".format(scores[bestScoreIdx])) print("Confusion Matrix:", confusionMatrix[bestScoreIdx]) plt.figure() plt.ylabel('Accuracy') plt.xlabel('# of Clusters') plt.title('K-mean Cluster ({})'.format(self.dataSetName)) plt.style.context('seaborn-whitegrid') plt.xticks(self.kNum) plt.plot(self.kNum, scores) plt.grid() plt.draw() plt.savefig('./{}_KMEAN.png'.format(self.dataSetName)) print("-" * 50) def k_mean_cluster_reduced(self, n_clusters, reduced_data, name): print("-" * 50) print('{}: K-mean clustering {}'.format(self.dataSetName, name)) dataX = StandardScaler().fit_transform(self.allFeatures) self.kmeanRD = KMeans(n_clusters=n_clusters, random_state=5, max_iter=1000) self.kmeanRD.fit(reduced_data) print("Accuracy score:{0:.2f}".format( sm.accuracy_score(self.allTarget, self.kmeanRD.labels_))) print("Confusion Matrix:") print(sm.confusion_matrix(self.allTarget, self.kmeanRD.labels_)) print("-" * 50) def expectation_maximization_reduced(self, n_components, reduced_data, name): print("-" * 50) print('{}: Expectation maximization {}'.format(self.dataSetName, name)) self.emRD = GaussianMixture(n_components=n_components, random_state=5) self.emRD.fit(reduced_data) y_predict = self.emRD.predict(reduced_data) print("Accuracy score:{0:.2f}".format( sm.accuracy_score(self.allTarget, y_predict))) print("Confusion Matrix:") print(sm.confusion_matrix(self.allTarget, y_predict)) print("-" * 50) def expectation_maximization(self): print("-" * 50) print('{}: Expectation maximization'.format(self.dataSetName)) dataX = StandardScaler().fit_transform(self.allFeatures) scores = [] confusionMatrix = [] self.em = GaussianMixture(random_state=5) for i in self.kNum: self.em.set_params(n_components=i) self.em.fit(dataX) y_predict = self.em.predict(dataX) scores.append(sm.accuracy_score(self.allTarget, y_predict)) confusionMatrix.append( sm.confusion_matrix(self.allTarget, y_predict)) bestScoreIdx = self.get_max_idx(scores) print("Accuracy score:{0:.2f}".format(scores[bestScoreIdx])) print("Confusion Matrix:") print(confusionMatrix[bestScoreIdx]) plt.figure() plt.ylabel('Accuracy') plt.xlabel('# of Clusters') plt.title('Expectation Maximum Cluster ({})'.format(self.dataSetName)) plt.style.context('seaborn-whitegrid') plt.xticks(self.kNum) plt.plot(self.kNum, scores) plt.grid() plt.draw() plt.savefig('./{}_EM.png'.format(self.dataSetName)) print("-" * 50) def PCA(self): print("-" * 50) print('{}: Principal component analysis '.format(self.dataSetName)) dataX = StandardScaler().fit_transform(self.allFeatures) self.pca = PCA(random_state=5) grid = {'pca__n_components': self.pcaDims} mlp = MLPClassifier(max_iter=2000, alpha=1e-5, early_stopping=False, random_state=5, hidden_layer_sizes=[17] * 11) pipe = Pipeline([('pca', self.pca), ('NN', mlp)]) search = GridSearchCV(pipe, grid, verbose=2, cv=5) search.fit(dataX, self.allTarget) print("Best number PCA components:", search.best_params_) self.pca.fit(dataX) var = np.cumsum( np.round(self.pca.explained_variance_ratio_, decimals=3) * 100) 
plt.figure() plt.ylabel('% Variance Explained') plt.xlabel('# of Features') plt.title('PCA Analysis ({})'.format(self.dataSetName)) plt.xticks(self.pcaDims) plt.style.context('seaborn-whitegrid') plt.plot(var) plt.grid() plt.draw() plt.savefig('./{}_PCA_VA.png'.format(self.dataSetName)) plt.figure() plt.ylabel('Score') plt.xlabel('# of Features') plt.title('PCA Analysis Grid Search ({})'.format(self.dataSetName)) plt.xticks(self.pcaDims) plt.ylim([0, 1]) plt.style.context('seaborn-whitegrid') plt.plot(self.pcaDims, search.cv_results_['mean_test_score']) plt.grid() plt.draw() plt.savefig('./{}_PCA_GS.png'.format(self.dataSetName)) print("-" * 50) def ICA(self): print("-" * 50) print('{}: Independent component analysis '.format(self.dataSetName)) dataX = StandardScaler().fit_transform(self.allFeatures) self.ica = FastICA(random_state=5, max_iter=6000) # kurtosis kurt = [] for dim in self.icaDims: self.ica.set_params(n_components=dim) tmp = self.ica.fit_transform(dataX) tmp = pd.DataFrame(tmp) tmp = tmp.kurt(axis=0) kurt.append(tmp.abs().mean()) # grid search grid = {'ica__n_components': self.icaDims} mlp = MLPClassifier(max_iter=2000, alpha=1e-5, early_stopping=False, random_state=5, hidden_layer_sizes=[17] * 11) pipe = Pipeline([('ica', self.ica), ('NN', mlp)]) search = GridSearchCV(pipe, grid, verbose=2, cv=5) search.fit(dataX, self.allTarget) print("Best number ICA components:", search.best_params_) plt.figure() plt.ylabel('Kurtosis') plt.xlabel('# of Features') plt.title('ICA Analysis ({})'.format(self.dataSetName)) plt.xticks(self.icaDims) plt.style.context('seaborn-whitegrid') plt.plot(kurt) plt.grid() plt.draw() plt.savefig('./{}_kurtosis.png'.format(self.dataSetName)) plt.figure() plt.ylabel('Score') plt.xlabel('# of Features') plt.title('ICA Analysis Grid Search ({})'.format(self.dataSetName)) plt.xticks(self.icaDims) plt.style.context('seaborn-whitegrid') plt.plot(self.icaDims, search.cv_results_['mean_test_score']) plt.grid() plt.draw() plt.savefig('./{}_ICA_GS.png'.format(self.dataSetName)) print("-" * 50) def RP(self): print("-" * 50) print('{}: Random Projection'.format(self.dataSetName)) dataX = StandardScaler().fit_transform(self.allFeatures) disCorr = [] self.rp = SparseRandomProjection(random_state=5) for dim in self.rpDims: self.rp.set_params(n_components=dim) disCorr.append( self.pairwiseDistCorr(self.rp.fit_transform(dataX), dataX)) print(disCorr) # grid search grid = {'rp__n_components': self.rpDims} mlp = MLPClassifier(max_iter=2000, alpha=1e-5, early_stopping=False, random_state=5, hidden_layer_sizes=[17] * 11) pipe = Pipeline([('rp', self.rp), ('NN', mlp)]) search = GridSearchCV(pipe, grid, verbose=2, cv=5) search.fit(dataX, self.allTarget) print("Best number RP components:", search.best_params_) plt.figure() plt.ylabel('Distance') plt.xlabel('# of Features') plt.title('RP Analysis ({})'.format(self.dataSetName)) plt.xticks(self.rpDims) plt.style.context('seaborn-whitegrid') plt.plot(disCorr) plt.grid() plt.draw() plt.savefig('./{}_distance.png'.format(self.dataSetName)) plt.figure() plt.ylabel('Score') plt.xlabel('# of Features') plt.title('RP Analysis Grid Search ({})'.format(self.dataSetName)) plt.xticks(self.rpDims) plt.style.context('seaborn-whitegrid') plt.plot(search.cv_results_['mean_test_score']) plt.grid() plt.draw() plt.savefig('./{}_RP_GS.png'.format(self.dataSetName)) print("-" * 50) def TSVD(self): print("-" * 50) print('{}: TruncatedSVD'.format(self.dataSetName)) dataX = StandardScaler().fit_transform(self.allFeatures) self.tsvd = 
TruncatedSVD(random_state=5) # grid search grid = {'tsvd__n_components': self.tsvdDims} mlp = MLPClassifier(max_iter=2000, alpha=1e-5, early_stopping=False, random_state=5, hidden_layer_sizes=[17] * 11) pipe = Pipeline([('tsvd', self.tsvd), ('NN', mlp)]) search = GridSearchCV(pipe, grid, verbose=2, cv=5) search.fit(dataX, self.allTarget) print("Best number TSVD components:", search.best_params_) self.tsvd.fit(dataX) var = np.cumsum( np.round(self.tsvd.explained_variance_ratio_, decimals=3) * 100) plt.figure() plt.ylabel('% Variance Explained') plt.xlabel('# of Features') plt.title('TSVD Analysis ({})'.format(self.dataSetName)) plt.xticks(self.tsvdDims) plt.style.context('seaborn-whitegrid') plt.plot(var) plt.grid() plt.draw() plt.savefig('./{}_TSD_VA.png'.format(self.dataSetName)) plt.figure() plt.ylabel('Score') plt.xlabel('# of Features') plt.title('TSVD Analysis Grid Search ({})'.format(self.dataSetName)) plt.xticks(self.tsvdDims) plt.style.context('seaborn-whitegrid') plt.plot(search.cv_results_['mean_test_score']) plt.grid() plt.draw() plt.savefig('./{}_TSVD_GS.png'.format(self.dataSetName)) print("-" * 50)
def create_rca(k, r_state):
    return SparseRandomProjection(n_components=k, random_state=r_state)
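# Hypothetical usage of create_rca: sweep k and check how well pairwise
# distances are preserved (pairwiseDistCorr as sketched earlier; X is assumed
# to be a feature matrix already in scope).
# for k in (2, 5, 10, 20):
#     rca = create_rca(k, r_state=0)
#     print(k, pairwiseDistCorr(rca.fit_transform(X), X))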
from sklearn.neural_network import MLPClassifier from dataTransformer import * from sklearn.preprocessing import StandardScaler from sklearn.pipeline import Pipeline from data import MNIST from sklearn.metrics import accuracy_score from time import time from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection if __name__=="__main__": mnist = MNIST(10000) start = time() pipeline = Pipeline([('Scale', StandardScaler()), ('PCA', SparseRandomProjection(random_state=0, n_components=160)), ('MLP', MLPClassifier(hidden_layer_sizes=(512, 256), alpha=0.01, verbose=1))]) pipeline.fit(mnist.X_train, mnist.y_train) y_pred = pipeline.predict(mnist.X_test) end = time() print ("time used: {}s".format(end - start)) print (accuracy_score(y_pred, mnist.y_test)) # MLPClassifier(hidden_layer_sizes=(512, 256), alpha=0.01)
tmp1_ =pd.DataFrame(tmp).T tmp1_.to_csv('Diamond_RF_error.csv') tmp = defaultdict(dict) dims = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23] for i,dim in product(range(10),dims): rp = SparseRandomProjection(random_state=i, n_components=dim) rp.fit(X_train2) tmp[dim][i] = reconstructionError(rp, X_test2) tmp2_ =pd.DataFrame(tmp).T tmp2_.to_csv('CreditCard_RF_error.csv') ''' #3.RP transformation #3.1Diamond data dim = 9 rp = SparseRandomProjection(n_components=dim, random_state=6) #3.1.1 Training data DiamondX2_train = rp.fit_transform(X_train) Diamond2_train = pd.DataFrame( np.hstack((DiamondX2_train, np.atleast_2d(Y_train).T))) cols1 = list(range(Diamond2_train.shape[1])) cols1[-1] = 'Class' Diamond2_train.columns = cols1 Diamond2_train.to_csv('Diamond_RP_train.csv') #3.1.2 test data DiamondX2_test = rp.fit_transform(X_test) Diamond2_test = pd.DataFrame( np.hstack((DiamondX2_test, np.atleast_2d(Y_test).T))) cols2 = list(range(Diamond2_test.shape[1]))
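# reconstructionError is assumed to be a local helper; a minimal sketch that
# matches how it is called above: project the data down with the fitted random
# projection, map it back with the pseudo-inverse of the projection matrix,
# and report the mean squared reconstruction error.
import numpy as np
import scipy.sparse as sps


def reconstructionError(projections, X):
    W = projections.components_              # (n_components, n_features)
    if sps.issparse(W):
        W = np.asarray(W.todense())
    X = np.asarray(X)
    p = np.linalg.pinv(W)                    # (n_features, n_components)
    reconstructed = ((p @ W) @ X.T).T        # down-project, then back up
    return np.nanmean(np.square(X - reconstructed))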
print("ICA") ica = FastICA(n_components=N_COMP, random_state=random_state) ica_results_train = ica.fit_transform(train[flist]) ica_results_test = ica.transform(test[flist]) print("GRP") grp = GaussianRandomProjection(n_components=N_COMP, eps=0.1, random_state=random_state) grp_results_train = grp.fit_transform(train[flist]) grp_results_test = grp.transform(test[flist]) print("SRP") srp = SparseRandomProjection(n_components=N_COMP, dense_output=True, random_state=random_state) srp_results_train = srp.fit_transform(train[flist]) srp_results_test = srp.transform(test[flist]) print("Append decomposition components to datasets...") for i in range(1, N_COMP + 1): train['pca_' + str(i)] = pca_results_train[:, i - 1] test['pca_' + str(i)] = pca_results_test[:, i - 1] train['ica_' + str(i)] = ica_results_train[:, i - 1] test['ica_' + str(i)] = ica_results_test[:, i - 1] train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1] test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]
random_state=random_state)) # Reduce dimension to 2 with LinearDiscriminantAnalysis lda = make_pipeline(StandardScaler(), LinearDiscriminantAnalysis(n_components=2)) # Reduce dimension to 2 with NeighborhoodComponentAnalysis nca = make_pipeline(StandardScaler(), NeighborhoodComponentsAnalysis(n_components=2, random_state=random_state)) # Reduce dimension to 2 with Sparse Random Projection [SRP] SRP = make_pipeline(StandardScaler(), SparseRandomProjection(n_components=2, density = 'auto', eps = 0.5, random_state=random_state, dense_output = False)) # Reduce dimension to 2 with MultiDimensional Scaling [MDS] mds = make_pipeline(StandardScaler(), MDS(n_components=2, n_init=12, max_iter=1200, metric=True, n_jobs=4, random_state=random_state)) # Reduce dimension to 2 with Isomap isomap = make_pipeline(StandardScaler(), Isomap(n_components=2,
def use_decomposed_features_as_new_df(train, test, total, n_components,
                                      use_pca=False, use_tsvd=False,
                                      use_ica=False, use_fa=False,
                                      use_grp=False, use_srp=False):
    N_COMP = n_components
    ntrain = len(train)
    print("\nStart decomposition process...")

    if use_pca:
        print("PCA")
        pca = PCA(n_components=N_COMP, random_state=42)
        pca_results = pca.fit_transform(total)
        pca_results_train = pca_results[:ntrain]
        pca_results_test = pca_results[ntrain:]

    if use_tsvd:
        print("tSVD")
        tsvd = TruncatedSVD(n_components=N_COMP, random_state=42)
        tsvd_results = tsvd.fit_transform(total)
        tsvd_results_train = tsvd_results[:ntrain]
        tsvd_results_test = tsvd_results[ntrain:]

    if use_ica:
        print("ICA")
        ica = FastICA(n_components=N_COMP, random_state=42)
        ica_results = ica.fit_transform(total)
        ica_results_train = ica_results[:ntrain]
        ica_results_test = ica_results[ntrain:]

    if use_fa:
        print("FA")
        fa = FactorAnalysis(n_components=N_COMP, random_state=42)
        fa_results = fa.fit_transform(total)
        fa_results_train = fa_results[:ntrain]
        fa_results_test = fa_results[ntrain:]

    if use_grp:
        print("GRP")
        grp = GaussianRandomProjection(n_components=N_COMP, eps=0.1,
                                       random_state=42)
        grp_results = grp.fit_transform(total)
        grp_results_train = grp_results[:ntrain]
        grp_results_test = grp_results[ntrain:]

    if use_srp:
        print("SRP")
        srp = SparseRandomProjection(n_components=N_COMP, dense_output=True,
                                     random_state=42)
        srp_results = srp.fit_transform(total)
        srp_results_train = srp_results[:ntrain]
        srp_results_test = srp_results[ntrain:]

    # Note: the concatenation below assumes use_pca, use_tsvd, use_ica,
    # use_grp and use_srp are all enabled (FA results are not stacked).
    print("Append decomposition components together...")
    train_decomposed = np.concatenate([
        srp_results_train, grp_results_train, ica_results_train,
        pca_results_train, tsvd_results_train
    ], axis=1)
    test_decomposed = np.concatenate([
        srp_results_test, grp_results_test, ica_results_test,
        pca_results_test, tsvd_results_test
    ], axis=1)

    train_with_only_decomposed_features = pd.DataFrame(train_decomposed)
    test_with_only_decomposed_features = pd.DataFrame(test_decomposed)

    for agg_col in ['sum', 'var', 'mean', 'median', 'std', 'weight_count',
                    'count_non_0', 'num_different', 'max', 'min']:
        # carry over the aggregate columns by name ('col' was undefined here)
        train_with_only_decomposed_features[agg_col] = train[agg_col]
        test_with_only_decomposed_features[agg_col] = test[agg_col]

    # Remove any NA
    train_with_only_decomposed_features = train_with_only_decomposed_features.fillna(0)
    test_with_only_decomposed_features = test_with_only_decomposed_features.fillna(0)
    return train_with_only_decomposed_features, test_with_only_decomposed_features
for the evalutation of LDA as dimensionality reduction and SVM as classifier""" # Author: Ingo Guehring # import numpy as np # from sklearn.decomposition import TruncatedSVD from sklearn.random_projection import SparseRandomProjection from sklearn.svm import SVC from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA from sklearn.multiclass import OneVsRestClassifier import evaluation.shared as shared import model # pre_reduction = TruncatedSVD(n_components=500) PRE_REDUCTION = SparseRandomProjection(n_components=500) CLASSIFIER = OneVsRestClassifier(SVC(probability=True)) # grid N_COMPONENTS_RANGE = [1, 2, 4, 6, 8, 10, 12, 13] # kernels = ['linear', 'rbf'] # old range, that turned out to be too small # GAMMA_RANGE = np.logspace(-3, 3, 7) # C_RANGE = np.logspace(-3, 3, 7) # new wider range C_RANGE = shared.C_RANGE GAMMA_RANGE = shared.GAMMA_RANGE # this could also be used: classifier_kernel=kernels,
# # 4. Decomposition Feature # So far I've only looked at PCA components, but most kernels look at several decomposition methods, so it may be interesting to look at t-SNE of these 10-50 components of each method instead of 1000 PCA components. Furthermore, it's interesting to see how well we can classify test/train based on this reduced feature space. # # # In[ ]: COMPONENTS = 20 # List of decomposition methods to use methods = [ TruncatedSVD(n_components=COMPONENTS), PCA(n_components=COMPONENTS), FastICA(n_components=COMPONENTS), GaussianRandomProjection(n_components=COMPONENTS, eps=0.1), SparseRandomProjection(n_components=COMPONENTS, dense_output=True) ] # Run all the methods embeddings = [] for method in methods: name = method.__class__.__name__ embeddings.append( pd.DataFrame(method.fit_transform(total_df), columns=[f"{name}_{i}" for i in range(COMPONENTS)])) print(f">> Ran {name}") # Put all components into one dataframe components_df = pd.concat(embeddings, axis=1) # Prepare plot
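# A possible next step for the cell above, in the spirit of its opening
# comment: embed the stacked decomposition components with t-SNE so the
# combined 5 x 20 component space can be inspected in two dimensions
# (components_df comes from the cell above; the t-SNE settings are
# illustrative assumptions).
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2, init='pca', random_state=42, perplexity=30)
tsne_df = pd.DataFrame(tsne.fit_transform(components_df),
                       columns=['tsne_1', 'tsne_2'])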
pca2_results_train = pca.fit_transform(X_train) pca2_results_test = pca.transform(X_test) # ICA ica = FastICA(n_components=n_comp, random_state=420) ica2_results_train = ica.fit_transform(X_train) ica2_results_test = ica.transform(X_test) # GRP grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420) grp_results_train = grp.fit_transform(X_train) grp_results_test = grp.transform(X_test) # SRP srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420) srp_results_train = srp.fit_transform(X_train) srp_results_test = srp.transform(X_test) # create empty dataframes to capture extra features extra_features_train = pd.DataFrame() extra_features_test = pd.DataFrame() # Append decomposition components to datasets for i in range(1, n_comp + 1): extra_features_train['pca_' + str(i)] = pca2_results_train[:, i - 1] extra_features_test['pca_' + str(i)] = pca2_results_test[:, i - 1] extra_features_train['ica_' + str(i)] = ica2_results_train[:, i - 1] extra_features_test['ica_' + str(i)] = ica2_results_test[:, i - 1]
data = load_digits().data[:500] n_samples, n_features = data.shape print("Embedding %d samples with dim %d using various random projections" % (n_samples, n_features)) n_components_range = np.array([300, 1000, 10000]) dists = euclidean_distances(data, squared=True).ravel() # select only non-identical samples pairs nonzero = dists != 0 dists = dists[nonzero] for n_components in n_components_range: t0 = time() rp = SparseRandomProjection(n_components=n_components) projected_data = rp.fit_transform(data) print("Projected %d samples from %d to %d in %0.3fs" % (n_samples, n_features, n_components, time() - t0)) if hasattr(rp, 'components_'): n_bytes = rp.components_.data.nbytes n_bytes += rp.components_.indices.nbytes print("Random matrix with size: %0.3fMB" % (n_bytes / 1e6)) projected_dists = euclidean_distances(projected_data, squared=True).ravel()[nonzero] plt.figure() plt.hexbin(dists, projected_dists, gridsize=100, cmap=plt.cm.PuBu) plt.xlabel("Pairwise squared distances in original space") plt.ylabel("Pairwise squared distances in projected space")
def build_impl(self):
    self.model = SparseRandomProjection(**self.config)
def optimize_embedding(data_matrix, known_targets=None, min_feature_ratio=.1, n_iter=30, n_repetitions=1): # case for sparse data matrix: use random projection to transform to dense if sp.issparse(data_matrix): logger.info('Convert sparse to dense') logger.info('Data matrix: %d rows %d cols' % (data_matrix.shape[0], data_matrix.shape[1])) from sklearn.random_projection import SparseRandomProjection data_matrix = SparseRandomProjection().fit_transform( data_matrix).toarray() logger.info('Data matrix: %d rows %d cols' % (data_matrix.shape[0], data_matrix.shape[1])) if known_targets is not None: logger.info('Feature selection') logger.info('Data matrix: %d rows %d cols' % (data_matrix.shape[0], data_matrix.shape[1])) new_data_matrix = iterated_semi_supervised_feature_selection( data_matrix, known_targets, min_feature_ratio=min_feature_ratio) if new_data_matrix.shape[1] > 2: data_matrix = new_data_matrix logger.info('Data matrix: %d rows %d cols' % (data_matrix.shape[0], data_matrix.shape[1])) n_instances = data_matrix.shape[0] opts_list = make_opts_list(n_instances, n_iter) # iterate n_iter times to find best parameter configuration best_score = 0 logger.debug('neqs = neighborhood embedding quality score') for i in range(n_iter): random.seed(i) # sample from the options embed_opts = make_embed_opts(opts_list, n_instances) basis_opts = make_basis_opts(opts_list, n_instances) general_opts = make_general_opts() try: # find options with max quality score score_list = [] for it in range(n_repetitions): data_matrix_lowdim,\ link_ids,\ score,\ scores = embed_(data_matrix, embed_opts=embed_opts, basis_opts=basis_opts, change_of_basis=general_opts['change_of_basis']) score_list.append(score) mean_reduced_score = np.mean(score_list) - np.std(score_list) if best_score == 0 or mean_reduced_score > best_score: # best_embed_opts = embed_opts # best_basis_opts = basis_opts # best_change_of_basis = change_of_basis best_data_matrix_lowdim = data_matrix_lowdim best_link_ids = link_ids best_scores = scores best_score = mean_reduced_score mark = '*' else: mark = '' logger.debug('..%.2d/%d neqs: %.3f (%.3f +- %.3f) %s' % (i + 1, n_iter, mean_reduced_score, np.mean(scores), np.std(scores), mark)) except Exception as e: logger.debug('Failed iteration: %s' % e) return best_data_matrix_lowdim, best_link_ids, best_score, best_scores
from numpy.lib.function_base import _interp_dispatcher # from skmultiflow.trees import HoeffdingTree as HT from skmultiflow.lazy import SAMKNN from sklearn.metrics import accuracy_score import time, copy from sklearn.random_projection import SparseRandomProjection from sklearn.metrics import cohen_kappa_score # from skmultiflow.bayes import NaiveBayes from inc_pca import IncPCA from rff_base import Base as RFF from rrslvq import ReactiveRobustSoftLearningVectorQuantization as RRSLVQ from rslvq import RSLVQ from skmultiflow.meta import AdaptiveRandomForest as ARF transformer = SparseRandomProjection(n_components=1000) classes = np.arange(0, 15, 1) res_file = 'res_pca_skipgram.txt' f = open(res_file, 'a+') f.write('SKIP-GRAM\n') f.close() data = np.load('../dataset/skip-gram-embed-w-label.npy') # f = open('data/nasdaq_stream_wo_sentiment.csv') # labels = [] # while 1: # line = f.readline() # if line == '': break # arr = np.array(line.split(','), dtype='float64') # labels.append(arr[1])
class STPM(pl.LightningModule):
    def __init__(self,
                 model: torchvision.models,
                 embedding_dir_path: str,
                 sample_path: str,
                 input_image_size: int,
                 coreset_sampling_ratio: int,
                 n_neighbors: int,
                 anomal_threshold: float,
                 normalization_mean: list,
                 normalization_std: list):
        super(STPM, self).__init__()

        self.save_hyperparameters()
        self.init_features()

        # MODEL HYPERPARAMETERS
        self.input_image_size = input_image_size
        self.coreset_sampling_ratio = coreset_sampling_ratio
        self.n_neighbors = n_neighbors
        self.anomal_threshold = anomal_threshold
        self.embedding_dir_path = embedding_dir_path
        self.sample_path = sample_path
        #self.source_code_save_path = source_code_save_path

        def hook_t(module, input, output):
            self.features.append(output)

        self.model = model
        #self.model = wide_resnet50_2(pretrained=True, progress=True)

        # the backbone is frozen; only its intermediate features are used
        for param in self.model.parameters():
            param.requires_grad = False

        self.model.layer2[-1].register_forward_hook(hook_t)
        self.model.layer3[-1].register_forward_hook(hook_t)

        #self.data_inv_transform = transforms.Normalize(mean=[-0.485/0.229, -0.456/0.224, -0.406/0.255], std=[1/0.229, 1/0.224, 1/0.255])
        self.data_inv_transform = transforms.Normalize(
            mean=[
                -normalization_mean[0] / normalization_std[0],
                -normalization_mean[1] / normalization_std[1],
                -normalization_mean[2] / normalization_std[2]
            ],
            std=[
                1 / normalization_std[0],
                1 / normalization_std[1],
                1 / normalization_std[2]
            ])

        # dummy loss; no parameter update is performed
        self.criterion = torch.nn.MSELoss(reduction='sum')

        self.init_results_list()

    def init_results_list(self):
        self.img_path_list = []
        self.mean_score_norm = []
        self.all_scores = []
        self.all_scores_mean_norm = []
        self.image_batch_list = []
        self.x_type_list = []
        self.y_true = []

    def init_features(self):
        self.features = []

    def forward(self, x_t):
        self.init_features()
        _ = self.model(x_t)
        return self.features

    def save_anomaly_map(self, anomaly_map, input_img, gt_img, file_name, x_type):
        if anomaly_map.shape != input_img.shape:
            anomaly_map = cv2.resize(anomaly_map,
                                     (input_img.shape[0], input_img.shape[1]))
        anomaly_map_norm = min_max_norm(anomaly_map)
        anomaly_map_norm_hm = cvt2heatmap(anomaly_map_norm * 255)

        # anomaly map on image
        heatmap = cvt2heatmap(anomaly_map_norm * 255)
        hm_on_img = heatmap_on_image(heatmap, input_img)

        # save images
        cv2.imwrite(
            os.path.join(self.sample_path, f'{x_type}_{file_name}.jpg'),
            input_img)
        cv2.imwrite(
            os.path.join(self.sample_path, f'{x_type}_{file_name}_amap.jpg'),
            anomaly_map_norm_hm)
        cv2.imwrite(
            os.path.join(self.sample_path,
                         f'{x_type}_{file_name}_amap_on_img.jpg'), hm_on_img)

    def configure_optimizers(self):
        return None

    def on_train_start(self):
        self.model.eval()  # to stop running_var move (maybe not critical)
        self.embedding_list = []

    def on_test_start(self):
        self.init_results_list()
        self.embedding_coreset = pickle.load(
            open(os.path.join(self.embedding_dir_path, 'embedding.pickle'),
                 'rb'))
        embeded = torch.tensor(self.embedding_coreset)
        train_jit = TrainFeature(embeded)
        traced_model = torch.jit.script(train_jit)
        torch.jit.save(traced_model, "patchcore_features.pt")

    def training_step(self, batch, batch_idx):
        # save locally aware patch features
        x, _, file_name, _ = batch
        features = self(x)
        embeddings = []
        for feature in features:
            m = torch.nn.AvgPool2d(3, 1, 1)
            embeddings.append(m(feature))
        embedding = embedding_concat(embeddings[0], embeddings[1])
        self.embedding_list.extend(reshape_embedding(np.array(embedding)))
        gc.collect()

    def training_epoch_end(self, outputs):
        total_embeddings = np.array(self.embedding_list)

        # Random projection
        self.randomprojector = SparseRandomProjection(
            n_components='auto', eps=0.9)  # 'auto' => Johnson-Lindenstrauss lemma
        self.randomprojector.fit(total_embeddings)

        # Coreset subsampling
        selector = kCenterGreedy(total_embeddings, 0, 0)
        selected_idx = selector.select_batch(
            model=self.randomprojector,
            already_selected=[],
            N=int(total_embeddings.shape[0] *
                  float(self.coreset_sampling_ratio)))
        self.embedding_coreset = total_embeddings[selected_idx]

        print('initial embedding size : ', total_embeddings.shape)
        print('final embedding size : ', self.embedding_coreset.shape)
        with open(os.path.join(self.embedding_dir_path, 'embedding.pickle'),
                  'wb') as f:
            pickle.dump(self.embedding_coreset, f)
        gc.collect()

    def test_step(self, batch, batch_idx):
        # Nearest Neighbour Search against the memorised coreset
        x, label, file_name, x_type = batch
        features = self(x)
        embeddings = []
        for feature in features:
            m = torch.nn.AvgPool2d(3, 1, 1)
            embeddings.append(m(feature))
        embedding_ = embedding_concat(embeddings[0], embeddings[1])
        embedding_test = np.array(reshape_embedding(np.array(embedding_)))

        # NN
        knn = KNN(torch.from_numpy(self.embedding_coreset).cuda(),
                  k=self.n_neighbors)
        score_patches = knn(
            torch.from_numpy(embedding_test).cuda())[0].cpu().detach().numpy()
        self.img_path_list.extend(file_name)

        # support multiple input sizes
        block_size = int(np.sqrt(len(score_patches)))
        anomaly_map = score_patches[:, 0].reshape((block_size, block_size))

        self.all_scores.append(anomaly_map)
        self.image_batch_list.append(x)
        self.x_type_list.append(x_type)
        self.y_true.append(label.cpu().numpy()[0])

    def Find_Optimal_Cutoff(self, target, predicted):
        fpr, tpr, threshold = roc_curve(target, predicted, pos_label=1)
        i = np.arange(len(tpr))
        roc = pd.DataFrame({
            'tf': pd.Series(tpr - (1 - fpr), index=i),
            'threshold': pd.Series(threshold, index=i)
        })
        roc_t = roc.iloc[(roc.tf - 0).abs().argsort()[:1]]
        return list(roc_t['threshold']), threshold
        '''
        plt.plot(fpr, tpr)
        plt.plot([0, 1], [0, 1], '--', color='black')
        plt.title('ROC Curve')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.show()
        '''

    def analyze_data(self):
        score_patches = np.array(self.all_scores)
        for i, val in enumerate(score_patches):
            self.all_scores_mean_norm.append(np.mean(val))

        min_score = np.min(score_patches)
        max_score = np.max(score_patches)
        print("MIN SCORE {}".format(min_score))
        print("MAX SCORE {}".format(max_score))
        scores = (score_patches - min_score) / (max_score - min_score)

        for i, heatmap in enumerate(scores):
            anomaly_map_resized = cv2.resize(
                heatmap, (self.input_image_size, self.input_image_size))
            max_ = np.max(heatmap)
            min_ = np.min(heatmap)
            anomaly_map_resized_blur = gaussian_filter(anomaly_map_resized,
                                                       sigma=4)
            anomaly_map_resized_blur[0][0] = 1.

            # recover the de-normalized input image
            x = self.image_batch_list[i]
            x = self.data_inv_transform(x)
            input_x = cv2.cvtColor(
                x.permute(0, 2, 3, 1).cpu().numpy()[0] * 255,
                cv2.COLOR_BGR2RGB)
            if anomaly_map_resized_blur.shape != input_x.shape:
                anomaly_map_resized_blur = cv2.resize(
                    anomaly_map_resized_blur,
                    (input_x.shape[0], input_x.shape[1]))

            if self.anomal_threshold != 0:
                anomaly_threshold_index = anomaly_map_resized_blur[
                    anomaly_map_resized_blur > self.anomal_threshold]
                anomaly_map_resized_blur[
                    anomaly_map_resized_blur < self.anomal_threshold] = 0
                anomaly_threshold_area = anomaly_threshold_index.size
                anomaly_threshold_area = anomaly_threshold_area / \
                    float(anomaly_map_resized_blur.size) * 100.
                self.all_scores_mean_norm[i] = anomaly_threshold_area

            # anomaly map on image
            heatmap = cvt2heatmap(anomaly_map_resized_blur * 255)
            hm_on_img = heatmap_on_image(heatmap, input_x)

            # save images
            cv2.imwrite(
                os.path.join(
                    self.sample_path,
                    f'{self.x_type_list[i]}_{self.img_path_list[i]}.jpg'),
                input_x)
            cv2.imwrite(
                os.path.join(
                    self.sample_path,
                    f'{self.x_type_list[i]}_{self.img_path_list[i]}_amap.jpg'),
                heatmap)
            cv2.imwrite(
                os.path.join(
                    self.sample_path,
                    f'{self.x_type_list[i]}_{self.img_path_list[i]}_amap_on_img.jpg'
                ), hm_on_img)

    def test_epoch_end(self, outputs):
        self.analyze_data()
        best_th, threshold = self.Find_Optimal_Cutoff(
            self.y_true, self.all_scores_mean_norm)
        print(f'\nbest threshold={best_th}')

        ng_index = np.where(np.array(self.y_true) == 1)
        if len(ng_index[0]) == 0:
            ng_index = len(self.y_true)
        else:
            ng_index = ng_index[0][0]

        fig = plt.figure()
        sns.histplot(self.all_scores_mean_norm[:ng_index],
                     kde=True, color="blue", label="normal")
        sns.histplot(self.all_scores_mean_norm[ng_index:],
                     kde=True, color="red", label="abnormal")
        fig.legend(labels=['normal', 'abnormal'])
        plt.xlabel("Anomaly score")
        plt.ylabel("Count")
        plt.savefig('Anomaly_score_histplot.jpg')
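# A minimal, self-contained sketch of what n_components='auto' means in the
# training_epoch_end step above: scikit-learn resolves it with the
# Johnson-Lindenstrauss bound johnson_lindenstrauss_min_dim(n_samples, eps),
# so a loose eps such as 0.9 keeps the projected dimension small. The sample
# counts and embedding shape below are illustrative, not taken from a real run.
import numpy as np
from sklearn.random_projection import (SparseRandomProjection,
                                       johnson_lindenstrauss_min_dim)

for n_samples in (1000, 10000, 100000):
    for eps in (0.1, 0.5, 0.9):
        k = johnson_lindenstrauss_min_dim(n_samples, eps=eps)
        print("n_samples=%6d  eps=%.1f  min safe dim=%d" % (n_samples, eps, k))

# n_components='auto' applies the same bound to the fitted data:
rng = np.random.RandomState(0)
fake_patch_embeddings = rng.rand(2000, 1536)   # stand-in for pooled features
rp = SparseRandomProjection(n_components='auto', eps=0.9)
rp.fit(fake_patch_embeddings)
print("resolved n_components_:", rp.n_components_)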
processor.latext_start_figure()

X_train, X_test, y_train, y_test, _ = dataset.get_data(model='KMeans')
n_clusters = len(dataset.label_encoder.classes_)

pca = PCA(n_components=0.95)
pca.fit(X_train)
n_components = pca.components_.shape[0]
print(f"n_components: {n_components}")

dr_models = [
    PCA(n_components=n_components, random_state=0),
    FastICA(n_components=n_components, random_state=0),
    MiniBatchDictionaryLearning(n_components=n_components, alpha=1,
                                batch_size=200, n_iter=10, random_state=0),
    SparseRandomProjection(random_state=0, n_components=n_components)
]
clustering_models = [
    KMeans(n_clusters=n_clusters, init='k-means++', n_init=10, max_iter=600,
           random_state=0, tol=0.0001),
    GaussianMixture(n_components=n_clusters, n_init=10, max_iter=600,
                    random_state=0, tol=0.0001)
]

for pca in dr_models:
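# The excerpt above stops at the loop header, so the pairing logic is not
# shown. A minimal, self-contained sketch (synthetic blobs and silhouette
# scoring chosen purely for illustration) of running each dimensionality
# reducer in front of each clustering model:
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA, FastICA
from sklearn.metrics import silhouette_score
from sklearn.mixture import GaussianMixture
from sklearn.random_projection import SparseRandomProjection

X_demo, _ = make_blobs(n_samples=500, n_features=30, centers=4, random_state=0)
reducers = [PCA(n_components=10, random_state=0),
            FastICA(n_components=10, random_state=0),
            SparseRandomProjection(n_components=10, random_state=0)]
clusterers = [KMeans(n_clusters=4, n_init=10, random_state=0),
              GaussianMixture(n_components=4, n_init=10, random_state=0)]

for reducer in reducers:
    X_red = reducer.fit_transform(X_demo)
    for clusterer in clusterers:
        labels = clusterer.fit_predict(X_red)
        print("%s + %s: silhouette=%.3f"
              % (type(reducer).__name__, type(clusterer).__name__,
                 silhouette_score(X_red, labels)))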
    map_to_int = {name: n for n, name in enumerate(targets)}
    df_mod[target_column].replace(map_to_int, inplace=True)
    return (df_mod, map_to_int)


if __name__ == "__main__":
    mushroom_data = pd.read_csv("mushroom_data.csv")
    dft, mapping = encode_target(mushroom_data, "class")
    dft.to_csv('mushroom_datanew.csv')
    X = dft.iloc[:, :-1]
    y = dft.iloc[:, -1]

    # randomized projection
    tmp = defaultdict(dict)
    dims = range(1, 22)
    for i, dim in product(range(20), dims):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(X), X)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv('rp_mushroom_iterations.csv')

    tmp_fit = defaultdict(dict)
    for i, dim in product(range(20), dims):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        rp.fit(X)
        tmp_fit[dim][i] = reconstructionError(rp, X)
    tmp_fit = pd.DataFrame(tmp_fit).T
    tmp_fit.to_csv('rp_mushroom_new_data.csv')

    grid = {'rp__n_components': dims,
            'NN__alpha': nn_reg,
            'NN__hidden_layer_sizes': nn_arch}
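# pairwiseDistCorr and reconstructionError are helpers the excerpt assumes but
# does not define. A plausible, minimal sketch of what they typically compute
# (an assumption for illustration, not the original implementations): the
# correlation between pairwise distances before and after projection, and the
# reconstruction error obtained via the pseudo-inverse of the projection matrix.
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances


def pairwise_dist_corr(X_projected, X_original):
    """Pearson correlation between pairwise distances in the two spaces."""
    d_proj = pairwise_distances(X_projected).ravel()
    d_orig = pairwise_distances(X_original).ravel()
    return np.corrcoef(d_proj, d_orig)[0, 1]


def reconstruction_error(projector, X):
    """Mean squared error after projecting X and mapping it back."""
    W = np.asarray(projector.components_.todense())   # (n_components, n_features)
    X = np.asarray(X, dtype=float)
    X_back = (np.linalg.pinv(W) @ (W @ X.T)).T
    return np.mean((X - X_back) ** 2)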
## NEURAL NETWORK
from sklearn.neural_network import MLPClassifier

# LEARNING CURVE PLOT
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=100)

### PCA
##X_train = PCA(n_components=3).fit_transform(X_train)
##X_test = PCA(n_components=3).fit_transform(X_test)

#### RP
X_train = SparseRandomProjection(n_components=3).fit_transform(X_train)
X_test = SparseRandomProjection(n_components=3).fit_transform(X_test)

mlp = MLPClassifier(activation='logistic', solver='adam', max_iter=260)
mlp.fit(X_train, y_train)
nn_pred = mlp.predict(X_test)


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1,
                        train_sizes=np.linspace(.1, 1.0, 5)):
    """Generate a simple plot of the test and training learning curve."""
    plt.figure()
    plt.title(title)
    if ylim is not None:
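# Note that the excerpt above fits two independent SparseRandomProjection
# instances, one on X_train and one on X_test, so the two splits land in
# unrelated random subspaces. A minimal, self-contained sketch of the usual
# pattern (fit the projector once on the training split and reuse it for the
# test split; the data here is synthetic, not the excerpt's):
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.random_projection import SparseRandomProjection

rng = np.random.RandomState(100)
X_demo = rng.rand(300, 20)
y_demo = rng.randint(0, 2, size=300)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, test_size=0.3,
                                          random_state=100)

rp = SparseRandomProjection(n_components=3, random_state=0)
X_tr_rp = rp.fit_transform(X_tr)   # fit only on the training split
X_te_rp = rp.transform(X_te)       # reuse the same projection matrix

mlp = MLPClassifier(activation='logistic', solver='adam', max_iter=260)
mlp.fit(X_tr_rp, y_tr)
print("test accuracy:", mlp.score(X_te_rp, y_te))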
pca = PCA(n_components=n_comp, random_state=420)
pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
pca2_results_test = pca.transform(test)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
ica2_results_test = ica.transform(test)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(train.drop(["y"], axis=1))
grp_results_test = grp.transform(test)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True,
                             random_state=420)
srp_results_train = srp.fit_transform(train.drop(["y"], axis=1))
srp_results_test = srp.transform(test)

# Append decomposition components to datasets
for i in range(1, n_comp + 1):
    train['pca_' + str(i)] = pca2_results_train[:, i - 1]
    test['pca_' + str(i)] = pca2_results_test[:, i - 1]

    train['ica_' + str(i)] = ica2_results_train[:, i - 1]
    test['ica_' + str(i)] = ica2_results_test[:, i - 1]

    train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
    test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]

    train['grp_' + str(i)] = grp_results_train[:, i - 1]
tsvd = TruncatedSVD(n_components=n_components, random_state=420)
df_tsvd = pd.DataFrame(tsvd.fit_transform(train), columns=columns)
df_test_tsvd = pd.DataFrame(tsvd.transform(test), columns=columns)

# GRP
columns = ['GRP_{}'.format(i) for i in range(n_components)]
grp = GaussianRandomProjection(n_components=n_components, eps=0.1,
                               random_state=420)
df_grp = pd.DataFrame(grp.fit_transform(train), columns=columns)
df_test_grp = pd.DataFrame(grp.transform(test), columns=columns)

# SRP
columns = ['SRP_{}'.format(i) for i in range(n_components)]
srp = SparseRandomProjection(n_components=n_components, dense_output=True,
                             random_state=420)
df_srp = pd.DataFrame(srp.fit_transform(train), columns=columns)
df_test_srp = pd.DataFrame(srp.transform(test), columns=columns)

train = pd.concat([train, df_pca, df_ica, df_tsvd, df_grp, df_srp], axis=1)
test = pd.concat(
    [test, df_test_pca, df_test_ica, df_test_tsvd, df_test_grp, df_test_srp],
    axis=1)

### FEATURE SELECTION ###
# f_regression
#f_sel = SelectKBest(score_func=f_regression, k='all')
#train_red = pd.DataFrame(f_sel.fit_transform(train, y))
#f_scores = pd.Series(f_sel.scores_)
#pvalues = pd.Series(f_sel.pvalues_)
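# The blocks above repeat the same fit-on-train / transform-test / wrap-in-a-
# DataFrame pattern for every decomposition. A small sketch of factoring that
# pattern into one helper (the prefix naming mirrors the excerpt; the helper
# name itself is made up for illustration):
import pandas as pd
from sklearn.random_projection import SparseRandomProjection


def decompose(transformer, prefix, train_df, test_df):
    """Fit on the train split, transform both splits, return prefixed frames."""
    cols = ['{}_{}'.format(prefix, i) for i in range(transformer.n_components)]
    df_train = pd.DataFrame(transformer.fit_transform(train_df), columns=cols)
    df_test = pd.DataFrame(transformer.transform(test_df), columns=cols)
    return df_train, df_test

# e.g.
# df_srp, df_test_srp = decompose(
#     SparseRandomProjection(n_components=n_components, dense_output=True,
#                            random_state=420), 'SRP', train, test)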
data = load_digits().data[:500]
n_samples, n_features = data.shape
print("Embedding %d samples with dim %d using various random projections"
      % (n_samples, n_features))

n_components_range = np.array([300, 1000, 10000])
dists = euclidean_distances(data, squared=True).ravel()

# select only non-identical samples pairs
nonzero = dists != 0
dists = dists[nonzero]

for n_components in n_components_range:
    t0 = time()
    rp = SparseRandomProjection(n_components=n_components)
    projected_data = rp.fit_transform(data)
    print("Projected %d samples from %d to %d in %0.3fs"
          % (n_samples, n_features, n_components, time() - t0))
    if hasattr(rp, 'components_'):
        n_bytes = rp.components_.data.nbytes
        n_bytes += rp.components_.indices.nbytes
        print("Random matrix with size: %0.3fMB" % (n_bytes / 1e6))

    projected_dists = euclidean_distances(
        projected_data, squared=True).ravel()[nonzero]

    plt.figure()
    plt.hexbin(dists, projected_dists, gridsize=100, cmap=plt.cm.PuBu)
    plt.xlabel("Pairwise squared distances in original space")
    plt.ylabel("Pairwise squared distances in projected space")
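# A short sketch of why the printed matrix sizes stay small: with the default
# density='auto', SparseRandomProjection draws roughly a 1/sqrt(n_features)
# fraction of non-zero entries (Ping Li et al.), and the fitted density_
# attribute exposes the value actually used. Digits data as above, component
# count chosen for illustration:
import numpy as np
from sklearn.datasets import load_digits
from sklearn.random_projection import SparseRandomProjection

digits = load_digits().data[:500]
rp_demo = SparseRandomProjection(n_components=300, random_state=0).fit(digits)
print("expected density 1/sqrt(n_features): %.3f" % (1 / np.sqrt(digits.shape[1])))
print("fitted density_: %.3f" % rp_demo.density_)
print("stored non-zeros: %d of %d"
      % (rp_demo.components_.nnz,
         rp_demo.components_.shape[0] * rp_demo.components_.shape[1]))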
kurtosis = collections.defaultdict(list)
for i in range(1, num_components + 1):
    kurtosis['num components'].append(i)
    ica = FastICA(n_components=i)
    ica_transformed_data = ica.fit_transform(X_default_train)
    kurtosis['avg kurtosis'].append(
        pd.DataFrame(data=ica_transformed_data).kurt(axis=0).abs().mean())
kurtosis_df = pd.DataFrame(data=kurtosis)
kurtosis_df.to_csv('default_avg_kurtosis.csv')

num_components = 16
rp_stats = collections.defaultdict(list)
for i in range(1, num_components):
    rp_stats['num components'].append(i)
    rp = SparseRandomProjection(n_components=i)
    nnm = MLPClassifier()
    rp_nnm = Pipeline([('rp', rp), ('nnm', nnm)])
    rp_nnm.fit(X_digits_train, y_digits_train)
    accuracy_score = metrics.accuracy_score(rp_nnm.predict(X_digits_test),
                                            y_digits_test)
    rp_stats['accuracy score'].append(accuracy_score)
rp_df = pd.DataFrame(data=rp_stats)
rp_df.to_csv('digits_rp_data.csv')

num_components = 23
rp_stats = collections.defaultdict(list)
for i in range(1, num_components):
    rp_stats['num components'].append(i)
    rp = SparseRandomProjection(n_components=i)
    nnm = MLPClassifier()
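# The manual loops above refit a fresh Pipeline for every n_components value.
# A minimal sketch of the equivalent grid search (the 'rp__n_components'
# naming mirrors the grid dict defined earlier; the digits data is a stand-in):
from sklearn.datasets import load_digits
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.random_projection import SparseRandomProjection

X_digits, y_digits = load_digits(return_X_y=True)
pipe = Pipeline([('rp', SparseRandomProjection(random_state=0)),
                 ('nnm', MLPClassifier(max_iter=500, random_state=0))])
search = GridSearchCV(pipe, param_grid={'rp__n_components': [4, 8, 16, 32]},
                      cv=3, n_jobs=-1)
search.fit(X_digits, y_digits)
print("best n_components:", search.best_params_['rp__n_components'])
print("best CV accuracy: %.3f" % search.best_score_)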
f = open('../data/article_text.p', 'wb')
cPickle.dump(articles, f, protocol=-1)
print "saving done"
print len(articles)

vec = TfidfVectorizer(max_df=0.8, sublinear_tf=True)
X = vec.fit_transform(articles)
print X.shape

proj = SparseRandomProjection()
X = proj.fit_transform(X)
print X.shape

sparse_save(X, "../data/tfidf.h5")

# f = open('X_data.p', 'wb')
# cPickle.dump(X.data, f, protocol=-1)
# f = open('X_indices.p', 'wb')
# cPickle.dump(X.indices, f, protocol=-1)
# f = open('X_indptr.p', 'wb')
# cPickle.dump(X.indptr, f, protocol=-1)

#X = normalize(X)
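# A small, self-contained sketch (Python 3) of the memory behaviour relied on
# above: with the default dense_output=False, SparseRandomProjection keeps a
# sparse TF-IDF matrix sparse after projection. The toy corpus and component
# count are illustrative only:
import scipy.sparse as sp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.random_projection import SparseRandomProjection

docs = ["sparse random projection keeps sparse input sparse",
        "tfidf vectors are mostly zeros",
        "random projection reduces dimensionality cheaply"] * 100
X_tfidf = TfidfVectorizer(sublinear_tf=True).fit_transform(docs)
print("tfidf:", X_tfidf.shape, type(X_tfidf).__name__)

proj_demo = SparseRandomProjection(n_components=8, random_state=0)
X_proj = proj_demo.fit_transform(X_tfidf)
print("projected:", X_proj.shape, "still sparse:", sp.issparse(X_proj))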
# Part 2: perform sparse random projection of the faces dataset
faces_data = fetch_olivetti_faces().data
n_samples, n_features = faces_data.shape
print "Embedding %d faces with dim %d using various random projections" % (
    n_samples, n_features)

n_components_range = np.array([50, 200, 1000])
dists = euclidean_distances(faces_data, squared=True).ravel()

# select only non-identical samples pairs
nonzero = dists != 0
dists = dists[nonzero]

for n_components in n_components_range:
    rp = SparseRandomProjection(n_components=n_components)
    projected_data = rp.fit_transform(faces_data)

    projected_dists = euclidean_distances(
        projected_data, squared=True).ravel()[nonzero]

    pl.figure()
    pl.hexbin(dists, projected_dists, gridsize=100)
    pl.xlabel("Pairwise squared distances in original space")
    pl.ylabel("Pairwise squared distances in projected space")
    pl.title("Pairwise distances distribution for n_components=%d" %
             n_components)
    cb = pl.colorbar()
    cb.set_label('Sample pairs counts')

    rates = projected_dists / dists
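# One short, self-contained sketch of summarising distance ratios like the
# `rates` computed above: a mean near 1 with a small spread indicates the
# projection preserved pairwise distances well. The component count below is
# illustrative.
import numpy as np
from sklearn.datasets import fetch_olivetti_faces
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.random_projection import SparseRandomProjection

faces = fetch_olivetti_faces().data
orig_d = euclidean_distances(faces, squared=True).ravel()
keep = orig_d != 0
orig_d = orig_d[keep]

proj_d = euclidean_distances(
    SparseRandomProjection(n_components=200, random_state=0)
    .fit_transform(faces), squared=True).ravel()[keep]
rate = proj_d / orig_d
print("rates: mean=%.3f  std=%.3f" % (np.mean(rate), np.std(rate)))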