def randomized_projection(X, y, dataset_name):
    rand = GaussianRandomProjection(n_components=2)
    X_transformed = rand.fit_transform(X)
    plt.figure()
    plt.title(
        '{} data after Gaussian Random Projection into 2 components'.format(
            dataset_name))
    plt.scatter(X_transformed[:, 0], X_transformed[:, 1], c=y)
    plt.show()

    rand = GaussianRandomProjection(n_components=3)
    X_transformed = rand.fit_transform(X)
    # Visualize the transformed data in 3D
    fig = plt.figure(1, figsize=(4, 3))
    plt.clf()
    ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)
    plt.cla()
    plt.title(
        '{} data after Gaussian Random Projection into 3 components'.format(
            dataset_name))
    ax.scatter(X_transformed[:, 0], X_transformed[:, 1], X_transformed[:, 2],
               c=y)
    ax.w_xaxis.set_ticklabels([])
    ax.w_yaxis.set_ticklabels([])
    ax.w_zaxis.set_ticklabels([])
    plt.show()
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
    eps = self.hyperparams['eps']
    n_components = johnson_lindenstrauss_min_dim(n_samples=self._x_dim,
                                                 eps=eps)
    _logger.info("[INFO] n_components is " + str(n_components))
    if n_components > self._y_dim:
        # The default n_components='auto' would fail here, so assign
        # n_components explicitly.
        self._model = GaussianRandomProjection(
            n_components=self._y_dim, random_state=self.random_seed)
        self._model.fit(self._training_data)
    else:
        try:
            self._model = GaussianRandomProjection(
                eps=eps, random_state=self.random_seed)
            self._model.fit(self._training_data)
        except Exception:
            _logger.info(
                "[Warning] Fitting with the given eps value failed; "
                "falling back to the default settings.")
            self._model = GaussianRandomProjection()
            self._model.fit(self._training_data)
    self._fitted = True
    return CallResult(None, has_finished=True)
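# A minimal sketch (not part of the snippet above) of the Johnson-Lindenstrauss
# bound that fit() relies on: given a sample count and a distortion tolerance
# eps, johnson_lindenstrauss_min_dim returns the minimum number of components
# that preserves pairwise distances within a (1 +/- eps) factor. A tighter eps
# requires many more components.
from sklearn.random_projection import johnson_lindenstrauss_min_dim

print(johnson_lindenstrauss_min_dim(n_samples=1000, eps=0.1))  # several thousand
print(johnson_lindenstrauss_min_dim(n_samples=1000, eps=0.5))  # a few hundred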
def test_output_transformer():
    X, y = datasets.make_multilabel_classification(return_indicator=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    # Check that the per-estimator random_state values are all different
    transformer = GaussianRandomProjection(n_components=5, random_state=None)
    for name, ForestEstimator in FOREST_ESTIMATORS.items():
        est = ForestEstimator(random_state=5, output_transformer=transformer)
        est.fit(X_train, y_train)
        y_pred = est.predict(X_test)
        assert_equal(y_pred.shape, y_test.shape)
        random_state = [sub.output_transformer_.random_state
                        for sub in est.estimators_]
        assert_equal(len(set(random_state)), est.n_estimators)

    # Check that the per-estimator random_state values are all equal
    transformer = FixedStateTransformer(
        GaussianRandomProjection(n_components=5), random_seed=0)
    for name, ForestEstimator in FOREST_ESTIMATORS.items():
        est = ForestEstimator(random_state=5, output_transformer=transformer)
        est.fit(X_train, y_train)
        y_pred = est.predict(X_test)
        assert_equal(y_pred.shape, y_test.shape)
        random_state = [sub.output_transformer_.random_state
                        for sub in est.estimators_]
        assert_equal(len(set(random_state)), 1)
        assert_equal(random_state[0], 0)
def test_fixed_state_transformer():
    random_state = check_random_state(0)
    X = random_state.rand(500, 100)

    # Check that setting random_seed is equivalent to setting random_state
    transf = GaussianRandomProjection(n_components=5, random_state=0)
    fixed_transf = FixedStateTransformer(
        GaussianRandomProjection(n_components=5), random_seed=0)
    assert_array_almost_equal(fixed_transf.fit_transform(X),
                              transf.fit_transform(X))

    # Check that set_params doesn't modify the results
    fixed_transf = FixedStateTransformer(
        GaussianRandomProjection(n_components=5, random_state=None))
    fixed_transf2 = FixedStateTransformer(
        GaussianRandomProjection(random_state=1, n_components=5))
    assert_array_almost_equal(fixed_transf.fit_transform(X),
                              fixed_transf2.fit_transform(X))

    # Check that it works when the estimator has no random_state
    fixed_transf = FixedStateTransformer(IdentityProjection())
    assert_array_almost_equal(fixed_transf.fit_transform(X), X)
def rp_dim_red(x_train_scaled, dataset_name, features_num=19):
    print(x_train_scaled.shape[1])
    rp_feature_losses = []
    rp_feature_stds = []
    z = 0
    for k in range(1, x_train_scaled.shape[1] + 1):
        losses = []
        for m in range(5):
            rp = GaussianRandomProjection(k)
            rp_result = rp.fit_transform(x_train_scaled)
            # Reconstruct via the pseudo-inverse of the projection matrix
            inv = np.linalg.pinv(rp.components_.T)
            x_projected_rp = rp_result.dot(inv)
            loss = ((x_train_scaled - x_projected_rp) ** 2).mean()
            losses.append(loss)
        rp_feature_stds.append(np.std(losses))
        rp_feature_losses.append(np.mean(losses))
    np_feature_losses_percent = np.multiply(
        100, np.array(rp_feature_losses) / np.sum(rp_feature_losses))
    print("std")
    print(rp_feature_stds)
    print('loss')
    print(rp_feature_losses)
    print('sum')
    print(np.sum(rp_feature_losses))
    print('%')
    print(np_feature_losses_percent)
    print('num of components covering 90% of total loss')
    for i in range(len(np_feature_losses_percent)):
        z = z + np_feature_losses_percent[i]
        if z > 90:
            print(i + 1)
            break

    plt.bar(list(range(1, len(np_feature_losses_percent) + 1)),
            np_feature_losses_percent)
    plt.title("Random Projection Losses % (" + str(dataset_name) + ")")
    plt.ylabel("Mean Squared Error (% of Total)")
    plt.xlabel("Features")
    plt.savefig(str(dataset_name) + ' rp analysis % loss.png')
    plt.show()

    plt.bar(list(range(1, len(rp_feature_losses) + 1)), rp_feature_losses)
    plt.title("Random Projection Losses (" + str(dataset_name) + ")")
    plt.ylabel("Mean Squared Error")
    plt.xlabel("Features")
    # plt.subplots_adjust(bottom=.15, left=.15)
    plt.savefig(str(dataset_name) + ' rp analysis.png')
    plt.show()

    plt.bar(list(range(1, len(rp_feature_stds) + 1)), rp_feature_stds)
    plt.title("Random Projection STDs (" + str(dataset_name) + ")")
    plt.ylabel("STD")
    plt.xlabel("Features")
    plt.savefig(str(dataset_name) + ' rp std analysis.png')
    plt.show()

    rp = GaussianRandomProjection(features_num, random_state=random_state)
    rp_result = rp.fit_transform(x_train_scaled)
    inv = np.linalg.pinv(rp.components_.T)
    x_projected_rp = rp_result.dot(inv)
    return rp_result, x_projected_rp
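# A self-contained sketch of the reconstruction step used in rp_dim_red above
# (synthetic data, illustrative only): project with a Gaussian random matrix,
# map back through the pseudo-inverse of the projection, and measure the MSE.
import numpy as np
from sklearn.random_projection import GaussianRandomProjection

X = np.random.RandomState(0).rand(200, 30)
rp = GaussianRandomProjection(n_components=10, random_state=0)
X_low = rp.fit_transform(X)                           # shape (200, 10)
X_back = X_low.dot(np.linalg.pinv(rp.components_.T))  # shape (200, 30)
print(((X - X_back) ** 2).mean())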
def run_rp(dataset_name, X, y, verbose=False):
    # attempt RP at each dimensionality level
    n_components_vals = np.arange(1, len(X.columns))
    iterations = np.arange(1, 15)
    recon_losses = []
    for n_components in n_components_vals:
        # see how reconstruction loss changes across iterations
        tmp_recon_losses = []
        for i in iterations:
            rp = GaussianRandomProjection(n_components=n_components,
                                          random_state=i)
            X_rp = rp.fit_transform(X)
            # calculate reconstruction error
            X_comp_pinv = np.linalg.pinv(rp.components_.T)
            X_projection = np.dot(X_rp, X_comp_pinv)
            recon_loss = ((X - X_projection) ** 2).mean()
            tmp_recon_losses.append(np.sum(recon_loss))
        tmp_avg_recon_loss = np.mean(np.array(tmp_recon_losses))
        recon_losses.append(tmp_avg_recon_loss)

    if dataset_name == 'abalone':
        n_components = 3
    else:
        n_components = 25

    # plot reconstruction losses
    recon_losses = np.array(recon_losses)
    plot_title = "RP for " + dataset_name + ": Reconstruction loss\n"
    plotting.plot_recon_loss(recon_losses, n_components_vals,
                             title=plot_title)
    plt.savefig('graphs/rp_' + dataset_name + '_recon_loss.png')
    plt.clf()

    # calculate reconstruction error for the chosen n_components
    grp = GaussianRandomProjection(n_components=n_components,
                                   random_state=RANDOM_SEED)
    X_rp = grp.fit_transform(X)
    X_comp_pinv = np.linalg.pinv(grp.components_.T)
    X_projection = np.dot(X_rp, X_comp_pinv)
    recon_loss = ((X - X_projection) ** 2).mean()
    print(dataset_name, ": RP reconstruction loss for k =", n_components,
          ":", np.sum(recon_loss), '\n')

    X_rp = pd.DataFrame(X_rp)
    # run K-means
    clustering.run_k_means(dataset_name, X_rp, y, dim_reduction='rp',
                           verbose=verbose)
    # run EM
    clustering.run_expect_max(dataset_name, X_rp, y, dim_reduction='rp',
                              verbose=verbose)
    return X_rp
def property_plot(model_name, n_comp, n_cluster, data, label):
    if model_name == 'PCA':
        train_PCA = PCA(n_components=n_comp).fit(data)
        reduced = PCA(n_components=n_comp).fit_transform(data)
        estimator = KMeans(init=train_PCA.components_, n_clusters=n_cluster,
                           max_iter=2000, n_init=1)
    elif model_name == 'ICA':
        train_ICA = FastICA(n_components=n_comp).fit(data)
        reduced = FastICA(n_components=n_comp).fit_transform(data)
        estimator = KMeans(init=train_ICA.components_, n_clusters=n_cluster,
                           max_iter=2000, n_init=1)
    elif model_name == 'RP':
        train_RP = GaussianRandomProjection(n_components=n_comp).fit(data)
        reduced = GaussianRandomProjection(
            n_components=n_comp).fit_transform(data)
        estimator = KMeans(init=train_RP.components_, n_clusters=n_cluster,
                           max_iter=2000, n_init=1)
    elif model_name == 'TSVD':
        train_SVD = TruncatedSVD(n_components=n_comp).fit(data)
        reduced = TruncatedSVD(n_components=n_comp).fit_transform(data)
        estimator = KMeans(init=train_SVD.components_, n_clusters=n_cluster,
                           max_iter=2000, n_init=1)
    elif model_name == 'k-means':
        reduced = data
        estimator = KMeans(init='k-means++', n_clusters=n_cluster,
                           max_iter=2000)
    np.random.seed(99)
    t0 = time()
    estimator.fit(data)
    runtime = time() - t0
    dist = estimator.inertia_
    homo = metrics.homogeneity_score(label, estimator.labels_)
    compl = metrics.completeness_score(label, estimator.labels_)
    est2 = KMeans(init='k-means++', n_clusters=n_cluster,
                  max_iter=2000).fit(reduced)
    newlabels = est2.predict(reduced)
    correct = 1.0 * sum(label == newlabels) / len(label)
    print('% 9s  %3i  %3i  %.3f  %i  %.3f  %.3f  %.3f'
          % (model_name, n_cluster, n_comp, runtime, dist, homo, compl,
             correct))
    return (model_name, n_cluster, n_comp, runtime, dist, homo, compl,
            estimator.labels_)
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
    eps = self.hyperparams['eps']
    n_components = johnson_lindenstrauss_min_dim(n_samples=self._x_dim,
                                                 eps=eps)
    if n_components > self._x_dim:
        self._model = GaussianRandomProjection(n_components=self._x_dim)
    else:
        self._model = GaussianRandomProjection(eps=eps)
    self._model.fit(self._training_data)
    return CallResult(None, has_finished=True)
def reduce_then_cluster(X, y, dataset_name):
    # First, PCA n=2
    pca = PCA(n_components=2)
    X_transformed = pca.fit_transform(X)
    kmeans(X_transformed, y,
           dataset_name + ' - After PCA (n_components=2)')
    expectation_maximization(X_transformed, y,
                             dataset_name + ' - After PCA (n_components=2)')

    # Then, PCA n=3
    pca = PCA(n_components=3)
    X_transformed = pca.fit_transform(X)
    kmeans(X_transformed, y,
           dataset_name + ' - After PCA (n_components=3)')
    expectation_maximization(X_transformed, y,
                             dataset_name + ' - After PCA (n_components=3)')

    # ICA, n=2
    ica = FastICA(n_components=2)
    X_transformed = ica.fit_transform(X)
    kmeans(X_transformed, y,
           dataset_name + ' - After ICA (n_components=2)')
    expectation_maximization(X_transformed, y,
                             dataset_name + ' - After ICA (n_components=2)')

    # ICA, n=3
    ica = FastICA(n_components=3)
    X_transformed = ica.fit_transform(X)
    kmeans(X_transformed, y,
           dataset_name + ' - After ICA (n_components=3)')
    expectation_maximization(X_transformed, y,
                             dataset_name + ' - After ICA (n_components=3)')

    # Random Projections, n=2
    rand = GaussianRandomProjection(n_components=2, random_state=65)
    X_transformed = rand.fit_transform(X)
    kmeans(X_transformed, y,
           dataset_name + ' - After Gaussian Random Projection (n_components=2)')
    expectation_maximization(
        X_transformed, y,
        dataset_name + ' - After Gaussian Random Projection (n_components=2)')

    # Random Projections, n=3
    rand = GaussianRandomProjection(n_components=3, random_state=65)
    X_transformed = rand.fit_transform(X)
    kmeans(X_transformed, y,
           dataset_name + ' - After Gaussian Random Projection (n_components=3)')
    expectation_maximization(
        X_transformed, y,
        dataset_name + ' - After Gaussian Random Projection (n_components=3)')

    # Select K best, k=2
    select = SelectKBest(f_classif, k=2)
    X_transformed = select.fit_transform(X, y)
    kmeans(X_transformed, y,
           dataset_name + ' - After 2 Best Features Selected')
    expectation_maximization(X_transformed, y,
                             dataset_name + ' - After 2 Best Features Selected')

    # Select K best, k=3
    select = SelectKBest(f_classif, k=3)
    X_transformed = select.fit_transform(X, y)
    kmeans(X_transformed, y,
           dataset_name + ' - After 3 Best Features Selected')
    expectation_maximization(X_transformed, y,
                             dataset_name + ' - After 3 Best Features Selected')
def rand_guas(self, n_comp, data=None):
    if data is None:
        data = self.train
    else:
        data = pd.DataFrame(data)
    rand_guas = GaussianRandomProjection(n_components=n_comp)
    rand_guas.fit(data)
    self.rand_guas_train_data = rand_guas.transform(data)
    self.RAND_GUAS = rand_guas
    # Reuse the projection fitted on the training data so that train and
    # test land in the same random subspace; refitting on the test set
    # would draw a different random matrix.
    self.rand_guas_test_data = rand_guas.transform(self.test)
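# A quick sketch of why the reuse above matters: two independently fitted
# projections draw different random matrices, so their transforms of the
# same data almost surely disagree.
import numpy as np
from sklearn.random_projection import GaussianRandomProjection

X = np.random.RandomState(0).rand(10, 8)
a = GaussianRandomProjection(n_components=3).fit(X)
b = GaussianRandomProjection(n_components=3).fit(X)
print(np.allclose(a.transform(X), b.transform(X)))  # almost surely False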
def compute_neural_network(a, b):
    ### PCA ### ICA ### RP ### Feature Importance
    # dataset a
    clf = PCA(n_components=3)
    temp = clf.fit_transform(a[0])
    print(f'Admissions Dataset: PCA {run_nn(temp, a[1])}')
    clf = FastICA(n_components=7, random_state=seed)
    temp = clf.fit_transform(a[0])
    print(f'Admissions Dataset: ICA {run_nn(temp, a[1])}')
    clf = GaussianRandomProjection(n_components=2, random_state=seed)
    temp = clf.fit_transform(a[0])
    print(f'Admissions Dataset: RP {run_nn(temp, a[1])}')
    important_features = ['CGPA', 'GRE Score', 'TOEFL Score']
    temp_data = dict()
    for feature in important_features:
        temp_data[feature] = a[0][feature]
    temp = pd.DataFrame(temp_data)
    print(f'Admissions Dataset: Feature Importance {run_nn(temp, a[1])}')

    # dataset b: 5 PCA components, 7 ICA components
    clf = PCA(n_components=5)
    temp = clf.fit_transform(b[0])
    print(f'Income Dataset: PCA {run_nn(temp, b[1])}')
    clf = FastICA(n_components=7, random_state=seed)
    temp = clf.fit_transform(b[0])
    print(f'Income Dataset: ICA {run_nn(temp, b[1])}')
    clf = GaussianRandomProjection(n_components=2, random_state=seed)
    temp = clf.fit_transform(b[0])
    print(f'Income Dataset: RP {run_nn(temp, b[1])}')
    important_features = ['fnlwgt', 'age', 'education-num']
    temp_data = dict()
    for feature in important_features:
        temp_data[feature] = b[0][feature]
    temp = pd.DataFrame(temp_data)
    print(f'Income Dataset: Feature Importance {run_nn(temp, b[1])}')
def myRCA(data, act_labels, output_folder, experiment_name):
    # Explore the random projections we get for different numbers of
    # components, averaged over several random seeds.
    num_features = data.shape[1]
    values = list(range(1, num_features))
    rn = np.random.RandomState(13)
    random_seeds = list(rn.randint(1, 1000000, 20))
    errors = []
    for r in random_seeds:
        mses = []
        for k in values:
            rca = GaussianRandomProjection(n_components=k,
                                           random_state=r).fit(data)
            trans_data = rca.transform(data)
            inv_data = np.linalg.pinv(rca.components_.T)
            rec_data = trans_data.dot(inv_data)
            mse = MSE(rec_data, data.values)
            mses.append(mse)
        errors.append(mses)
    avg_errors = np.mean(np.array(errors), axis=0)
    std_errors = np.std(np.array(errors), axis=0)

    # Graph the reconstruction error per component count
    plt.errorbar(values, avg_errors, std_errors)
    plt.xticks(ticks=values)
    plt.xlabel('# Components')
    plt.ylabel('Reconstruction Error')
    plt.title(
        'Average Reconstruction Error for K Components Over 20 Random Seeds')
    plt.savefig(output_folder + '/' + experiment_name +
                '_rca_component_reconstruction_error.png')
    plt.close()
    plt.figure()

    # Create the final rca to return: the smallest k whose average error
    # falls below the threshold, else the k with the minimum average error.
    k = np.argmin(avg_errors) + 1  # add 1 to account for 0-indexing
    thresh = 0.2
    for i in range(len(avg_errors)):
        if avg_errors[i] <= thresh:
            k = i + 1  # add 1 to account for 0-indexing
            break
    start_time = time.time()
    rca = GaussianRandomProjection(n_components=k, random_state=13).fit(data)
    end_time = time.time()
    final_time = end_time - start_time
    return rca, final_time
def comp1(K):
    k = []
    accuracy_train = []
    accuracy_test = []
    for i in range(1, K):
        print(i)
        rp = GaussianRandomProjection(n_components=10, eps=0.6)
        rp.fit(X)
        X_reduced = rp.transform(X)
        X_train, X_test, y_train, y_test = train_test_split(
            X_reduced, y, test_size=0.20)
        km = MLPClassifier(solver='lbfgs', alpha=1e-5,
                           hidden_layer_sizes=[8, 8, 8, 8, 8],
                           random_state=1)
        # fit on the training split only
        km.fit(X_train, y_train)
        accu_train = km.score(X_train, y_train)
        accu_test = km.score(X_test, y_test)
        k.append(i)
        accuracy_train.append(accu_train)
        accuracy_test.append(accu_test)
    k = np.array(k)
    accuracy_train = np.array(accuracy_train)
    accuracy_test = np.asarray(accuracy_test)
    line3, = plt.plot(k, accuracy_train, color='r', marker='o',
                      label='train_accuracy')
    line4, = plt.plot(k, accuracy_test, color='g', marker='o',
                      label='test_accuracy')
    plt.xlabel('k')
    plt.legend()
    plt.ylabel('accuracy')
    plt.show()
    return None
def __init__(self, nComp):
    self._N_COMP = nComp
    self._pca = PCA(n_components=self._N_COMP, random_state=17)
    self._tsvd = TruncatedSVD(n_components=self._N_COMP, random_state=17)
    self._ica = FastICA(n_components=self._N_COMP, random_state=17)
    self._grp = GaussianRandomProjection(n_components=self._N_COMP, eps=0.1,
                                         random_state=17)
    self._srp = SparseRandomProjection(n_components=self._N_COMP,
                                       dense_output=True, random_state=17)
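# A hedged sketch exercising the same five reducers configured in __init__
# above, standalone on synthetic data (the data shape and component count
# here are illustrative assumptions, not from the original class).
import numpy as np
from sklearn.decomposition import PCA, TruncatedSVD, FastICA
from sklearn.random_projection import (GaussianRandomProjection,
                                       SparseRandomProjection)

X = np.random.RandomState(17).rand(200, 64)
for reducer in (PCA(n_components=12, random_state=17),
                TruncatedSVD(n_components=12, random_state=17),
                FastICA(n_components=12, random_state=17),
                GaussianRandomProjection(n_components=12, eps=0.1,
                                         random_state=17),
                SparseRandomProjection(n_components=12, dense_output=True,
                                       random_state=17)):
    print(type(reducer).__name__, reducer.fit_transform(X).shape)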
def eps():
    Sum_of_squared_distances = []
    k = []
    eps_values = [0.8, 0.6, 0.4, 0.2, 0.05, 0.01]
    for i in eps_values:
        # NOTE: eps only affects n_components='auto'; with n_components=4
        # set explicitly, scikit-learn ignores eps here.
        transformer = GaussianRandomProjection(n_components=4, eps=i)
        X_new = transformer.fit_transform(X)
        km = KMeans(n_clusters=2, random_state=0, max_iter=10000,
                    tol=1e-9).fit(X_new)
        Sum_of_squared_distances.append(km.inertia_)
        k.append(i)
    print(Sum_of_squared_distances)
    k = np.array(k)
    Sum_of_squared_distances = np.array(Sum_of_squared_distances)
    line1, = plt.plot(k, Sum_of_squared_distances, 'bx-', marker='o')
    plt.xlabel('eps')
    plt.ylabel('Sum_of_squared_distances')
    plt.title('Elbow curve over eps')
    plt.show()
    return None
def otherScikitImpl(data, orig_dimension, new_dimension):
    rp = GaussianRandomProjection(n_components=new_dimension)
    # NOTE: _make_random_matrix is a private scikit-learn method; it returns
    # the (new_dimension, orig_dimension) Gaussian projection matrix directly.
    m = rp._make_random_matrix(new_dimension, orig_dimension)
    reduced = np.asarray(data) @ np.asarray(m).T
    return reduced
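# For comparison, a sketch using only the public API: after fit(),
# transform(X) is just X @ components_.T, so the private _make_random_matrix
# call above can usually be avoided.
import numpy as np
from sklearn.random_projection import GaussianRandomProjection

X = np.random.RandomState(0).rand(50, 20)
rp = GaussianRandomProjection(n_components=5, random_state=0).fit(X)
assert np.allclose(rp.transform(X), X @ rp.components_.T)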
def RP_exp(X, y, title):
    ncomp = [i + 1 for i in range(X.shape[1] - 1)]
    stdev = []
    mean = []
    for n in ncomp:
        repeats = []
        for i in range(5):
            rp = GaussianRandomProjection(n_components=n)
            repeats.append(rp.fit_transform(X))
        # pairwise differences between the 5 independent projections
        diffs = []
        for (i, j) in [(0, 1), (0, 2), (0, 3), (0, 4), (1, 2), (1, 3),
                       (1, 4), (2, 3), (2, 4), (3, 4)]:
            diffs.append(repeats[i] - repeats[j])
        stdev.append(np.std(diffs))
        mean.append(np.mean(diffs))
    comp_arr = np.array(ncomp)
    mean_arr = np.array(mean)
    stdev_arr = np.array(stdev)
    plt.fill_between(comp_arr, mean_arr - stdev_arr, mean_arr + stdev_arr,
                     alpha=0.1, color="b", label="Stdev")
    plt.plot(ncomp, mean, 'o-', color="b", label="Mean")
    plt.title("Mean pairwise difference of RP: " + title)
    plt.legend(loc='best')
    plt.xlabel("n_components")
    plt.ylabel("Pairwise difference")
    plt.savefig("RP " + title)
    plt.show()
def rand_proj_reconstruction_error(train_x, n):
    '''Average RP reconstruction error over 10 trials for each
    n_components value in range(1, n, 10).'''
    results = []
    for i in range(1, n, 10):
        error = 0
        for j in range(1, 11):
            rand_proj = GaussianRandomProjection(n_components=i)
            reduced_df = rand_proj.fit_transform(train_x)
            pseudo_inverse = np.linalg.pinv(rand_proj.components_.T)
            reconstructed = reduced_df.dot(pseudo_inverse)
            error += metrics.mean_squared_error(train_x, reconstructed)
        results.append({"n_components": i,
                        "reconstruction_error": error / 10})
    return results
def rand_proj(train_x, n):
    '''Project train_x onto n Gaussian random components.'''
    rp = GaussianRandomProjection(n_components=n)
    reduced_df = rp.fit_transform(train_x)
    return reduced_df
def plot_data(method, X, y, title, filename):
    fig, (ax1) = plt.subplots(1, 1)
    n_labels = len(y)
    if method == 'pca':
        t = decomposition.PCA(n_components=2)
        X = t.fit_transform(X)
    elif method == 'ica':
        t = decomposition.FastICA(n_components=2, whiten=True)
        X = t.fit_transform(X)
    elif method == 'rp':
        t = GaussianRandomProjection(n_components=2)
        X = t.fit_transform(X)
    np.random.seed(20)
    for label in np.unique(y):
        ax1.scatter(X[y == label, 0], X[y == label, 1],
                    color=np.random.rand(3), linewidths=1)
    ax1.set_title(title)
    ax1.grid()
    plt.tight_layout()
    plt.savefig('/'.join(['output', filename]))
    plt.close("all")
def rp(name, x, y):
    plot.style.use('seaborn-darkgrid')
    for i in range(6):
        rp = GaussianRandomProjection(eps=0.95, random_state=i)
        transformed = rp.fit_transform(x)
        # find the two highest-variance projected dimensions to plot
        axes = [0, 0]
        axes_std = [0, 0]
        for axis in range(np.shape(transformed)[1]):
            std = np.std(transformed[:, axis])
            if std > axes_std[0]:
                axes[0] = axis
                axes_std[0] = std
            elif std > axes_std[1]:
                axes[1] = axis
                axes_std[1] = std
        plot.subplot(2, 3, i + 1)
        plot.title(f'Random seed = {i}')
        plot.xlabel(f'Dimension {axes[0]}')
        plot.ylabel(f'Dimension {axes[1]}')
        plot.scatter(transformed[:, axes[0]], transformed[:, axes[1]],
                     c=y, cmap='viridis')
    plot.show()
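# A sketch of how eps drives the automatic choice of n_components (the
# default n_components='auto', as in the snippet above): a looser eps yields
# fewer components. The data shape is an illustrative assumption; fit()
# raises if the JL target dimension exceeds the original feature count.
import numpy as np
from sklearn.random_projection import GaussianRandomProjection

X = np.random.RandomState(0).rand(1000, 2000)
for eps in (0.95, 0.5, 0.2):
    rp = GaussianRandomProjection(eps=eps, random_state=0).fit(X)
    print(eps, rp.n_components_)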
def components(K):
    Sum_of_squared_distances = []
    k = []
    accuracy = []
    for i in range(1, K):
        transformer = GaussianRandomProjection(n_components=i, eps=0.1)
        X_new = transformer.fit_transform(X)
        km = KMeans(n_clusters=2, random_state=0, max_iter=10000,
                    tol=1e-9).fit(X_new)
        label = km.predict(X_new)
        accu = matchfn(y, label)
        Sum_of_squared_distances.append(km.inertia_)
        k.append(i)
        accuracy.append(accu)
    k = np.array(k)
    Sum_of_squared_distances = np.array(Sum_of_squared_distances)
    accuracy = np.array(accuracy)
    line3, = plt.plot(k, accuracy, color='r', marker='o')
    plt.xlabel('k')
    plt.ylabel('accuracy')
    plt.show()
    return None
def dimensionality_reduction():
    ica_best_components = 5
    pca_best_components = 6
    rp_chosen_components = 3
    variance_threshold = 0.02

    pca = PCA(n_components=pca_best_components)
    pca_x_train = pca.fit_transform(x_train)
    base_experiment.plot_eigen_values("{}-{}".format(plot_name, "PCA"),
                                      pca.explained_variance_)
    base_experiment.plot_points_3d("{}-{}".format(plot_name, "PCA"),
                                   pca_x_train)

    ica = FastICA(n_components=ica_best_components)
    ica_x_train = ica.fit_transform(x_train)
    base_experiment.plot_points_3d("{}-{}".format(plot_name, "ICA"),
                                   ica_x_train)

    rp = GaussianRandomProjection(n_components=rp_chosen_components)
    rp_x_train = rp.fit_transform(x_train)
    base_experiment.plot_points_3d(
        "{}-{}".format(plot_name, "Random Projection"), rp_x_train)

    variance_x_train = VarianceThreshold(
        threshold=variance_threshold).fit_transform(
            min_max_scaler.transform(features_data))
    variance_x_train = preprocessing.scale(variance_x_train)

    find_best_k_for_reduced_features(ica_x_train, pca_x_train, rp_x_train,
                                     variance_x_train)
    clustering_after_reduction(pca_x_train, ica_x_train, rp_x_train,
                               variance_x_train)
    run_ann_with_only_dimensionality_reduction(pca_x_train, ica_x_train,
                                               rp_x_train, variance_x_train)
def run_k_means_on_random_projections_cardiovascular_data(path):
    data_set = 'cardio'
    x_train, y_train = load_data(path + 'data/' + data_set + '/train/')
    rp = GaussianRandomProjection(n_components=5)
    rp_x_train = rp.fit_transform(x_train)
    f = open("cardiovascular_random_projections_stats.txt", "w+")
    for n_clusters in range(1, 16):
        bench_k_means(str(n_clusters), rp_x_train, y_train, n_clusters, f, 1)
    f.close()
def rp(X, y, n_components='auto', eps=0.1, random_state=None, plot=1,
       dataset='german'):
    rp_model = GaussianRandomProjection(n_components=n_components, eps=eps,
                                        random_state=random_state)
    rp_model.fit(X)
    X_new = rp_model.transform(X)
    if plot and dataset in ('german', 'australian'):
        plt.scatter(X_new[y == 1, 0], X_new[y == 1, 1], c='red',
                    label='Samples with label 1')
        plt.scatter(X_new[y == 0, 0], X_new[y == 0, 1], c='green',
                    label='Samples with label 0')
        plt.title("{} dataset after Randomized Projection".format(
            dataset.capitalize()))
        plt.legend()
        plt.xlabel("Component 1")
        plt.ylabel("Component 2")
        plt.savefig("{}-after-Random-Projection.png".format(dataset))
        plt.close()
    return X_new
def save_new_data(dataset, n_components, iteration):
    X, y = load_dataset(dataset)
    data = X
    rp = GaussianRandomProjection(n_components=n_components)
    rp.fit(data)
    matrix = rp.components_
    new_data = rp.transform(data)
    plot_data('rp', new_data, y, dataset.title() + ': RP',
              filename='-'.join(['rp', dataset, str(iteration), 'data',
                                 'trans']))
    results = np.array(new_data)
    np.savetxt('data/' + ('-'.join([dataset, str(n_components),
                                    str(iteration) + 'rp.csv'])),
               results, delimiter=",")
    # Approximate inverse: map back through the projection matrix itself
    new_data_inv = np.dot(new_data, matrix)
    loss = metrics.mean_squared_error(data, new_data_inv)
    print(loss)
def train_reduc(data, reduc_type='pca', kernel='rbf', n_c=8, eps=0.01,
                random_state=2020):
    if reduc_type == 'pca':
        reduc = PCA(n_components=n_c)
    elif reduc_type == 'spca':
        reduc = SparsePCA(n_components=n_c)
    elif reduc_type == 'kpca':
        reduc = KernelPCA(n_components=n_c, kernel=kernel)
    elif reduc_type == 'ica':
        reduc = FastICA(n_components=n_c)
    elif reduc_type == 'grp':
        reduc = GaussianRandomProjection(n_components=n_c, eps=eps,
                                         random_state=random_state)
    elif reduc_type == 'srp':
        reduc = SparseRandomProjection(n_components=n_c, density='auto',
                                       eps=eps, dense_output=True,
                                       random_state=random_state)
    reduced = reduc.fit_transform(data)
    print('Reduc Complete')
    return reduced, reduc
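# A hedged usage sketch for train_reduc above, on synthetic data (assumes the
# decomposition/random_projection imports used by train_reduc are in scope);
# 'grp' selects the GaussianRandomProjection branch.
import numpy as np

X = np.random.RandomState(0).rand(100, 32)
X_grp, grp_model = train_reduc(X, reduc_type='grp', n_c=8)
print(X_grp.shape)  # (100, 8)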
def get_transform(algorithm):
    """
    Defines and returns a feature selection transform object of the
    designated type.

    Parameters
    ----------
    algorithm : {'pca', 'kpca', 'grp', 'fa', 'k_best'}
        Transform algorithm for which to return an object.

    Returns
    -------
    transform : object
        Instantiated transform object.
    """
    if algorithm == 'pca':
        transform = PCA()
    elif algorithm == 'kpca':
        transform = KernelPCA()
    elif algorithm == 'grp':
        transform = GaussianRandomProjection()
    elif algorithm == 'fa':
        transform = FeatureAgglomeration()
    elif algorithm == 'k_best':
        transform = SelectKBest(mutual_info_regression)
    else:
        raise Exception(
            'No selection algorithm defined for {0}'.format(algorithm))
    return transform
def fit(self, X, y=None):
    r"""
    Fit to the singleview data.

    Parameters
    ----------
    X : array of shape (n_samples, n_total_features)
        Input dataset

    y : Ignored

    Returns
    -------
    self : object
        The Transformer instance
    """
    # set function-level random state
    np.random.seed(self.random_state)
    self.GaussianRandomProjections_ = [
        GaussianRandomProjection(n_components=self.n_components,
                                 eps=self.eps).fit(X)
        for _ in range(self.n_views)
    ]
    return self
def create_random_guassian_projections(params, x_data):
    components = params['components']
    grps = [GaussianRandomProjection(n_components=components)
            for _ in range(params['num_retry'])]
    x_data_news = []
    x_data_recons = []
    x_data_projection_losses = []
    for i in range(0, params['num_retry']):
        print(str(i))
        # project data from high dim to low dim
        x_data_news.append(grps[i].fit_transform(x_data))
        # reconstruct the data by mapping it back through the projection
        # matrix (used here as an approximate inverse)
        x_data_recons.append(np.dot(x_data_news[i], grps[i].components_))
        # calculate the projection error for this trial
        x_projection_loss = ((x_data - x_data_recons[i]) ** 2).mean()
        x_data_projection_losses.append(x_projection_loss)
    if params['projection_loss_graph'] is not None:
        plt.figure()
        plt.plot(x_data_projection_losses)
        plt.ylabel("Mean Squared Error")
        plt.xlabel("Random Model")
        plt.title(params['projection_loss_graph'])
        plt.savefig(params['projection_loss_graph'] + '.png')
    i = np.argmin(x_data_projection_losses)
    return x_data_news[i]
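# A sketch contrasting the two back-projections used across these snippets:
# the pseudo-inverse of components_.T (as in rp_dim_red) versus components_
# itself (as above). The latter is only a rough inverse, since the rows of a
# Gaussian projection matrix are nearly orthogonal but not orthonormal.
import numpy as np
from sklearn.random_projection import GaussianRandomProjection

X = np.random.RandomState(1).rand(500, 100)
rp = GaussianRandomProjection(n_components=50, random_state=1)
X_low = rp.fit_transform(X)
err_pinv = ((X - X_low @ np.linalg.pinv(rp.components_.T)) ** 2).mean()
err_transpose = ((X - X_low @ rp.components_) ** 2).mean()
print(err_pinv, err_transpose)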