def part2():
    """Sweep SparseRandomProjection sizes on the cancer and housing data.

    For each dataset, seeds 0-9 (``random_state``) are crossed with the
    candidate dimensions (``range(1, 31)`` for cancer, ``dims_big`` for
    housing) and two metrics are recorded per (dimension, seed) pair:
    ``pairwiseDistCorr`` of the projected data against the original data,
    and ``reconstructionError`` of the fitted projection. Every table has
    one row per dimension and one column per seed.

    Output files (all prefixed with the module-level ``out``):
        * ``'<dataset> part2.csv'`` - reconstruction error. This is the
          content these files already ended up with.
        * ``'<dataset> part2 pairwise.csv'`` - pairwise distance
          correlation. Previously this table was written to
          ``'<dataset> part2.csv'`` and then overwritten by the
          reconstruction-error table, so it was lost.
    """
    datasets = (
        ('cancer', cancer_x, range(1, 31)),
        ('housing', housing_x, dims_big),
    )

    # Pass 1: pairwise distance correlation of the projected data.
    for name, data, dims in datasets:
        scores = defaultdict(dict)
        for seed, dim in product(range(10), dims):
            projector = SparseRandomProjection(random_state=seed, n_components=dim)
            scores[dim][seed] = pairwiseDistCorr(projector.fit_transform(data), data)
        # Written to its own file so pass 2 no longer clobbers it.
        pd.DataFrame(scores).T.to_csv(out + name + ' part2 pairwise.csv')

    # Pass 2: reconstruction error of the fitted projection.
    for name, data, dims in datasets:
        scores = defaultdict(dict)
        for seed, dim in product(range(10), dims):
            projector = SparseRandomProjection(random_state=seed, n_components=dim)
            projector.fit(data)
            scores[dim][seed] = reconstructionError(projector, data)
        pd.DataFrame(scores).T.to_csv(out + name + ' part2.csv')
def rp(X_train, X_test):
    """Plot LinearSVC accuracy over sparse random projection sizes, then cluster.

    Steps:
        1. Print the Johnson-Lindenstrauss minimum dimension for
           ``X_train.shape[0]`` samples at ``eps=0.1``.
        2. Fit a baseline ``LinearSVC`` on the raw features.
        3. For 1..19 components, project with ``SparseRandomProjection``,
           fit a ``LinearSVC`` on the projection and record test accuracy.
        4. Plot the accuracies against the baseline and show the figure.
        5. Project to 5 components and draw silhouette plots for KMeans
           with 2 and 3 clusters.

    Args:
        X_train: training feature matrix (must expose ``.shape``).
        X_test: test feature matrix.

    Note:
        Labels are read from the names ``y_train`` / ``y_test`` in the
        enclosing scope; they are not parameters of this function.
    """
    num_components = johnson_lindenstrauss_min_dim(n_samples=X_train.shape[0], eps=0.1)
    print(num_components)
    print("# features: ", X_train.shape[1], " JL min dim:", num_components)
    # Only emit the warning when it is actually true; it used to be printed
    # unconditionally, even when the JL bound fit inside the feature count.
    if num_components > X_train.shape[1]:
        # It simply means no assumption can be made about the preservation
        # of pairwise distances between data points.
        print("JL number > #features so cant make any JL guarentees")

    accuracies = []
    components = np.int32(np.linspace(1, 19, 19))

    # Baseline: classifier trained on the un-projected features.
    model = LinearSVC()
    model.fit(X_train, y_train)
    baseline = metrics.accuracy_score(model.predict(X_test), y_test)

    # loop over the projection sizes
    for comp in components:
        # create the random projection
        sp = SparseRandomProjection(n_components=comp)
        X = sp.fit_transform(X_train)

        # train a classifier on the sparse random projection
        # TODO this is wrong.. needs to be KMeans
        model = LinearSVC(max_iter=1000)
        model.fit(X, y_train)

        # evaluate the model and update the list of accuracies
        test = sp.transform(X_test)
        accuracies.append(metrics.accuracy_score(model.predict(test), y_test))

    # create the figure
    plt.figure()
    plt.title("Accuracy of Sparse Random Projection on Churn")
    plt.xlabel("# of Components")
    plt.ylabel("Accuracy")
    plt.xlim([1, 20])
    plt.ylim([0, 1.0])

    # plot the baseline and random projection accuracies
    plt.plot(components, [baseline] * len(accuracies), color="r")
    plt.plot(components, accuracies)
    plt.show()

    # average looks to be around 5 components in RP to best the baseline
    sp = SparseRandomProjection(n_components=5)
    X_transformed = sp.fit_transform(X_train)

    km = KMeans(n_clusters=2, init='k-means++', n_init=10, max_iter=300, random_state=RAND)
    plot_silhouette(km, X_transformed, title="SRP(5) KM(2)")

    km = KMeans(n_clusters=3, init='k-means++', n_init=10, max_iter=300, random_state=RAND)
    plot_silhouette(km, X_transformed, title="SRP(5) KM(3)")
def part3():
    """Project both datasets with fixed-size sparse random projections and cluster.

    ``cancer_x`` is reduced to 5 components and ``housing_x`` to 9, both with
    ``random_state=5``; the projected matrices are handed to
    ``run_clustering`` together with the original labels and ``out``.
    """
    cancer_projector = SparseRandomProjection(n_components=5, random_state=5)
    cancer_x2 = cancer_projector.fit_transform(cancer_x)

    housing_projector = SparseRandomProjection(n_components=9, random_state=5)
    housing_x2 = housing_projector.fit_transform(housing_x)

    run_clustering(out, cancer_x2, cancer_y, housing_x2, housing_y)
def generate(self, train, val, test, n_comps):
    """Append sparse-random-projection components as new columns.

    The projection is fitted once on ``train`` and then applied unchanged
    to ``val`` and ``test``. (The validation split used to be passed through
    ``fit_transform``, which re-fitted the transformer on validation data.)

    Args:
        train: training table; fitted on and extended in place.
        val: validation table; transformed and extended in place.
        test: test table; transformed and extended in place.
        n_comps: number of projection components / new columns.

    Returns:
        The tuple ``(train, val, test)`` with ``n_comps`` extra columns each,
        named by ``self.featurename(i)`` for ``i`` in ``1..n_comps``.
    """
    decomposer = SparseRandomProjection(n_components=n_comps, random_state=1234)

    results_train = decomposer.fit_transform(train)
    # Reuse the projection learned on train; never re-fit on held-out data.
    results_val = decomposer.transform(val)
    results_test = decomposer.transform(test)

    for col in range(n_comps):
        name = self.featurename(col + 1)  # feature names are 1-based
        train[name] = results_train[:, col]
        val[name] = results_val[:, col]
        test[name] = results_test[:, col]

    return (train, val, test)
def rp(train, test, y_train, y_test):
    """Plot LinearSVC accuracy over sparse random projection sizes, then fit GMMs.

    A baseline ``LinearSVC`` is trained on the raw features. For 20
    projection sizes between 2 and 60 the data is projected with
    ``SparseRandomProjection``, a fresh ``LinearSVC`` is trained, and its test
    accuracy recorded and plotted against the baseline. Finally the training
    data is projected to 30 components and Gaussian mixtures with 2, 3 and 4
    components are fitted and passed to ``plot_silhouette``.

    Args:
        train: training feature matrix.
        test: test feature matrix.
        y_train: training labels.
        y_test: test labels.

    The body used to read an undefined name ``X_test`` instead of the ``test``
    parameter, and overwrote ``test`` with its projection inside the loop.
    """
    model = LinearSVC()
    model.fit(train, y_train)
    baseline = metrics.accuracy_score(model.predict(test), y_test)

    accuracies = []
    components = np.int32(np.linspace(2, 60, 20))

    # loop over the projection sizes
    for comp in components:
        # create the random projection
        sp = SparseRandomProjection(n_components=comp)
        X = sp.fit_transform(train)

        # train a classifier on the sparse random projection
        model = LinearSVC()
        model.fit(X, y_train)

        # evaluate on the projected test set; keep `test` itself untouched
        projected_test = sp.transform(test)
        accuracies.append(
            metrics.accuracy_score(model.predict(projected_test), y_test))

    # create the figure
    plt.figure()
    plt.title("Accuracy of Sparse Rand Projection on Sonar (EM, GMM)")
    plt.xlabel("# of Components")
    plt.ylabel("Accuracy")
    plt.xlim([2, 64])
    plt.ylim([0, 1.0])

    # plot the baseline and random projection accuracies
    plt.plot(components, [baseline] * len(accuracies), color="r")
    plt.plot(components, accuracies)
    plt.show()

    # random pick 30 as the best number of Random components
    sp = SparseRandomProjection(n_components=30)
    X_train = sp.fit_transform(train)

    for n_mix in (2, 3, 4):
        gmm = mixture.GaussianMixture(n_mix, covariance_type='full', random_state=RAND)
        gmm.fit(X_train)
        plot_silhouette(gmm, X_train, title="RP(30), GMM(%d)" % n_mix)
def run_RP(X, y, title):
    """Plot mean and std of ``pairwiseDistCorr`` over random-projection sizes.

    Candidate sizes are 2, 5, 8, ... below ``X.shape[1] - 1`` plus the full
    feature count. Each size is evaluated with seeds 0-4 using ``RP``; the
    per-size mean and standard deviation across seeds are drawn on twin
    y-axes and the figure is saved under ``plotsdir/<title>/``.

    Args:
        X: feature matrix (must expose ``.shape``).
        y: unused by this function.
        title: dataset label used in the plot title and output directory.
    """
    from itertools import product

    n_features = X.shape[1]
    dims = list(np.arange(2, (n_features - 1), 3)) + [n_features]

    scores = defaultdict(dict)
    for seed, n_dims in product(range(5), dims):
        projector = RP(random_state=seed, n_components=n_dims)
        projected = projector.fit_transform(X)
        scores[n_dims][seed] = pairwiseDistCorr(projected, X)

    # Rows = dimensions, columns = seeds.
    table = pd.DataFrame(scores).T
    mean_recon = table.mean(axis=1).tolist()
    std_recon = table.std(axis=1).tolist()

    fig, mean_axis = plt.subplots()
    mean_axis.plot(dims, mean_recon, 'b-')
    mean_axis.set_xlabel('Random Components')
    # Make the y-axis label, ticks and tick labels match the line color.
    mean_axis.set_ylabel('Mean Reconstruction Correlation', color='b')
    mean_axis.tick_params('y', colors='b')
    plt.grid(False)

    std_axis = mean_axis.twinx()
    std_axis.plot(dims, std_recon, 'm-')
    std_axis.set_ylabel('STD Reconstruction Correlation', color='m')
    std_axis.tick_params('y', colors='m')
    plt.grid(False)

    plt.title("Random Components for 5 Restarts: " + title)
    fig.tight_layout()

    out_dir = plotsdir + "/" + title
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    plt.savefig(out_dir + "/Random Components for 5 Restarts.png")
def run_rp(n_projections, dataset, data):
    """Record sparse-random-projection reconstruction error per component count.

    For every ``n`` in ``n_projections`` the projection is fitted four times
    (no fixed seed); the mean and standard deviation of
    ``reconstruction_error`` are printed and appended to
    ``./results/srp_<dataset>.csv``. The ``runtime`` column holds the
    fit/transform time of the last of the four repetitions only.

    Args:
        n_projections: iterable of component counts to evaluate.
        dataset: dataset label used in the file name and log line.
        data: feature matrix to project.
    """
    csv_path = './results/srp_' + dataset + '.csv'
    row_format = '{},{},{},{}\n'

    # Start a fresh file containing only the header.
    with open(csv_path, 'w') as handle:
        handle.write(row_format.format("n_components", "reconstruction_error_mean", "reconstruction_error_sigma", "runtime"))

    for n in n_projections:
        errors = []
        for _ in range(4):
            started = time.time()
            srp = SparseRandomProjection(n_components=n)
            # srp = GaussianRandomProjection(n_components=n)
            srp.fit_transform(data)
            elapsed = time.time() - started
            errors.append(reconstruction_error(srp, data))

        error_mean = np.mean(errors)
        error_sigma = np.std(errors)
        print("For SRP n_components of: ", n, " on ", dataset, " data set got mean reconstruction error of: ", error_mean, " with sigma of: ", error_sigma)

        with open(csv_path, 'a') as handle:
            handle.write(row_format.format(n, error_mean, error_sigma, elapsed))
    return
def rp(train, test, y_train, y_test):
    """Evaluate an MLP on a 12-component sparse random projection.

    The projection is fitted on ``train`` and applied to ``test``. One
    classifier is handed to ``run_analysis`` with the projected training data;
    a second, identically configured classifier is fitted and its test
    predictions are reported as a printed and a plotted confusion matrix.

    Args:
        train: training feature matrix.
        test: test feature matrix.
        y_train: training labels.
        y_test: test labels.
    """
    def build_classifier():
        # Both stages use exactly the same hyper-parameters.
        return MLPClassifier(solver='sgd',
                             hidden_layer_sizes=(70, ),
                             random_state=23,
                             shuffle=True,
                             activation='relu',
                             learning_rate_init=0.15,
                             alpha=0.45)

    projector = SparseRandomProjection(n_components=12)
    X_train = projector.fit_transform(train)
    X_test = projector.transform(test)

    run_analysis(
        X_train, y_train, build_classifier(),
        "NN with lrate=0.15, 70 units in hidden layer, alpha 0.45, RP(12)")

    final_clf = build_classifier()
    final_clf.fit(X_train, y_train)
    y_pred = final_clf.predict(X_test)

    print(confusion_matrix(y_test, y_pred))
    plot_confusion_matrix(y_test, y_pred, title="NN (70,) lrate=0.15, RP(12)")
def randomProjections(data, n_components):
    """Plot pairwise-distance correlation and reconstruction error vs. size.

    Evaluates ``SparseRandomProjection`` for every component count in
    ``1 .. n_components - 1`` using the module-level ``seed`` and draws the
    two metrics in a two-row figure.

    Args:
        data: feature matrix to project.
        n_components: exclusive upper bound of the component counts tried.

    Returns:
        ``(plt, pairwiseDistCorr, reconstructionError, projector)`` - the
        pyplot module, the two metric *functions*, and the estimator from
        the last iteration.
    """
    sizes = []
    dist_corr = []
    recon_err = []

    for size in range(1, n_components):
        projector = SparseRandomProjection(n_components=size, random_state=seed)
        sizes.append(size)
        dist_corr.append(pairwiseDistCorr(projector.fit_transform(data), data))
        # The same estimator object is fitted a second time before the
        # reconstruction error is measured.
        projector.fit(data)
        recon_err.append(reconstructionError(projector, data))

    plt.subplot(2, 1, 1)
    plt.plot(sizes, dist_corr)
    plt.xlabel("Number of Components")
    plt.ylabel("Pair-wise Distance Correlation")

    plt.subplot(2, 1, 2)
    plt.plot(sizes, recon_err)
    plt.xlabel("Number of Components")
    plt.ylabel("Reconstruction Error")

    return plt, pairwiseDistCorr, reconstructionError, projector
def perform(self):
    """Run the random-projection experiment and write its CSV outputs.

    Writes three files through ``self._out``:
        * ``<ds_name>_scree1.csv`` - ``pairwise_dist_corr`` per dimension
          (rows) and seed 0-9 (columns).
        * ``<ds_name>_scree2.csv`` - ``reconstruction_error`` in the same
          layout.
        * ``<ds_name>_dim_red.csv`` - ``cv_results_`` of a grid search over
          a projection + MLP pipeline.
    """
    # Adapted from https://github.com/JonathanTay/CS-7641-assignment-3/blob/master/RP.py
    self.log("Performing {}".format(self.experiment_name()))

    # TODO: Use a diff random state? Might be ok as-is
    # %% Data for 1
    dist_corr = defaultdict(dict)
    for seed, n_dims in product(range(10), self._dims):
        projector = SparseRandomProjection(random_state=seed, n_components=n_dims)
        projected = projector.fit_transform(self._details.ds.training_x)
        dist_corr[n_dims][seed] = pairwise_dist_corr(projected, self._details.ds.training_x)
    scree1_path = self._out.format('{}_scree1.csv'.format(self._details.ds_name))
    pd.DataFrame(dist_corr).T.to_csv(scree1_path)

    recon = defaultdict(dict)
    for seed, n_dims in product(range(10), self._dims):
        projector = SparseRandomProjection(random_state=seed, n_components=n_dims)
        projector.fit(self._details.ds.training_x)
        recon[n_dims][seed] = reconstruction_error(projector, self._details.ds.training_x)
    scree2_path = self._out.format('{}_scree2.csv'.format(self._details.ds_name))
    pd.DataFrame(recon).T.to_csv(scree2_path)

    # %% Data for 2
    grid = {
        'rp__n_components': self._dims,
        'NN__alpha': self._nn_reg,
        'NN__hidden_layer_sizes': self._nn_arch,
    }
    pipe = Pipeline(
        [
            ('rp', SparseRandomProjection(random_state=self._details.seed)),
            ('NN', MLPClassifier(activation='relu', max_iter=2000,
                                 early_stopping=True,
                                 random_state=self._details.seed)),
        ],
        memory=experiments.pipeline_memory)
    gs, final_estimator = self.gs_with_best_estimator(pipe, grid)
    self.log("Grid search complete")

    results_path = self._out.format('{}_dim_red.csv'.format(self._details.ds_name))
    pd.DataFrame(gs.cv_results_).to_csv(results_path)
    self.log("Done")
def train(X_train, y_train, project=False, rnd=42, **kwargs):
    """Fit a RandomForestClassifier, optionally after projecting the features.

    Args:
        X_train: training feature matrix.
        y_train: training labels.
        project: ``False`` for no projection, otherwise one of ``'rproj'``,
            ``'std'``, ``'pca'``, ``'rpca'`` or ``'rbf'``. Any other value
            (anything that is not the ``False`` object) raises ``Error``.
        rnd: seed for the projection and, unless ``random_state`` is given in
            ``kwargs``, for the forest.
        **kwargs: forwarded to ``RandomForestClassifier``.

    Returns:
        ``clf`` when no projection is used, otherwise ``(clf, proj)`` with
        the fitted projection.
    """
    projecting = project is not False
    proj = None

    if projecting:
        if project == 'rproj':
            proj = SparseRandomProjection(n_components=X_train.shape[1], random_state=rnd)
        elif project == 'std':
            proj = StandardScaler()
        elif project == 'pca':
            proj = PCA(n_components='mle', whiten=True, random_state=rnd)
        elif project == 'rpca':
            proj = RandomizedPCA(whiten=True, random_state=rnd)
        elif project == 'rbf':
            proj = RBFSampler(n_components=max(X_train.shape[1], 50), random_state=rnd)
        else:
            raise Error('Projection {} not available'.format(project))
        X_train = proj.fit_transform(X_train)

    # An explicit random_state from the caller wins over `rnd`.
    kwargs.setdefault('random_state', rnd)
    clf = RandomForestClassifier(**kwargs)
    clf.fit(X_train, y_train)

    return (clf, proj) if projecting else clf
def _train_classifier(clf, X_train, y_train, rnd=42, project=None):
    """Fit ``clf`` in place, optionally after projecting the features.

    Args:
        clf: estimator exposing ``fit``; trained in place.
        X_train: training feature matrix.
        y_train: training labels.
        rnd: seed passed to the seeded projections.
        project: ``None`` or the string ``'None'`` for no projection,
            otherwise ``'random_projection'``, ``'standard'``, ``'pca'`` or
            ``'random_pca'``.

    Returns:
        Always ``None``. An unknown ``project`` value is logged as an error
        and the function returns without training.
    """
    wants_projection = not (project is None or project == 'None')

    if wants_projection:
        log.info('+ Projecting features')
        if project == 'random_projection':
            log.info(' * Sparse Random Projection')
            projector = SparseRandomProjection(n_components=X_train.shape[1], random_state=rnd)
        elif project == 'standard':
            log.info(' * Standard Projection')
            projector = StandardScaler()
        elif project == 'pca':
            log.info(' * Principle Component Analysis')
            projector = IncrementalPCA(batch_size=100)
        elif project == 'random_pca':
            log.info(' * Randomized Principle Component Analysis')
            projector = PCA(n_components=X_train.shape[1],
                            svd_solver='randomized',
                            whiten=True,
                            random_state=rnd)
        else:
            log.error('Projection {} not available'.format(project))
            return
        X_train = projector.fit_transform(X_train)

    log.info('+ Training classifier')
    clf.fit(X_train, y_train)
def rp_dim_reduction(self, x, y, title, dim_max):
    """Find the projection size with the lowest reconstruction error.

    Fits ``SparseRandomProjection`` (``random_state=123``) for every
    ``n_components`` in ``2 .. dim_max``, scores it with
    ``self.reconstructionError`` and saves an error-vs-size plot to
    ``image/<title>/rp_train.png``.

    Args:
        x: feature matrix to project.
        y: unused by this method.
        title: sub-directory name under ``image/`` for the plot.
        dim_max: largest number of components tried (inclusive).

    Returns:
        The ``n_components`` with the smallest error, or ``-1`` when no size
        was evaluated (``dim_max < 2``).
    """
    print('Random Projections')

    candidate_dims = list(range(2, dim_max + 1))
    rp_error = []
    n_min_reconstruction_error = -1
    min_error = float("inf")

    for n in candidate_dims:
        rp = SparseRandomProjection(n_components=n, random_state=123)
        rp.fit(x)
        reconstruction_error = self.reconstructionError(rp, x)
        rp_error.append(reconstruction_error)
        if reconstruction_error < min_error:
            n_min_reconstruction_error = n
            min_error = reconstruction_error

    plt.figure()
    # Plot against the sizes actually evaluated; the axis used to be
    # range(1, dim_max), which shifted every point one component to the left.
    plt.plot(candidate_dims, rp_error, 'bx-')
    plt.xlabel('n_components')
    plt.ylabel('Reconstructed Error')
    plt.savefig('image/' + title + '/rp_train.png')
    plt.close()

    return n_min_reconstruction_error
def run_RCA(X, title):
    """Plot mean and std of ``pairwiseDistCorr`` over random-projection sizes.

    Candidate sizes are 2, 5, 8, ... below ``X.shape[1] - 1`` plus the full
    feature count. Each size is evaluated with ``n_restarts`` seeds using
    ``RCA``; the per-size mean and standard deviation across seeds are
    drawn on twin y-axes and the figure is shown.

    Args:
        X: feature matrix (must expose ``.shape``).
        title: dataset label appended to the plot title.
    """
    # The loop has always used 10 seeds; the title used to claim 5.
    n_restarts = 10

    dims = list(np.arange(2, (X.shape[1] - 1), 3))
    dims.append(X.shape[1])

    tmp = defaultdict(dict)
    for i, dim in product(range(n_restarts), dims):
        rp = RCA(random_state=i, n_components=dim)
        tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(X), X)

    # Rows = dimensions, columns = seeds.
    tmp = pd.DataFrame(tmp).T
    mean_recon = tmp.mean(axis=1).tolist()
    std_recon = tmp.std(axis=1).tolist()

    fig, ax1 = plt.subplots()
    ax1.plot(dims, mean_recon, 'b-')
    ax1.set_xlabel('Random Components')
    ax1.set_ylabel('Mean Reconstruction Correlation', color='b')
    ax1.tick_params('y', colors='b')
    plt.grid(False)

    ax2 = ax1.twinx()
    ax2.plot(dims, std_recon, 'm-')
    ax2.set_ylabel('STD Reconstruction Correlation', color='m')
    ax2.tick_params('y', colors='m')
    plt.grid(False)

    plt.title("Random Components for {} Restarts: ".format(n_restarts) + title)
    fig.tight_layout()
    plt.show()
def sample_proj_mat(self, sample_inds):
    """
    Fit a sparse random projection on a subset of rows and project them.

    The transformer is configured from ``self.density``, ``self.proj_dims``
    and ``self.random_state`` and fitted on ``self.X[sample_inds, :]``.

    Parameters
    ----------
    sample_inds : array of shape [n_samples]
        Row indices into ``self.X`` selecting the samples to project.

    Returns
    -------
    proj_X : {ndarray, sparse matrix}
        The selected rows after projection.

    projector : SparseRandomProjection
        The fitted transformer that produced ``proj_X``.
    """
    projector = SparseRandomProjection(
        density=self.density,
        n_components=self.proj_dims,
        random_state=self.random_state,
    )
    selected_rows = self.X[sample_inds, :]
    proj_X = projector.fit_transform(selected_rows)
    return proj_X, projector
def PairwiseDistribution(self, data):
    """Save a hexbin plot of pairwise squared distances before/after projection.

    ``data`` is projected to 4 components with ``SparseRandomProjection``
    (no fixed seed). Pairs whose original squared distance is zero are
    excluded from both axes. The figure is written to
    ``Plots/RandomProjection/pairwisedist2.png``.

    Args:
        data: 2-D feature matrix (``data.shape`` is unpacked into two values).
    """
    n_samples, n_features = data.shape
    print(
        "Embedding %d samples with dim %d using various random projections"
        % (n_samples, n_features))

    n_components = 4

    original_dists = euclidean_distances(data, squared=True).ravel()
    # select only non-identical samples pairs
    keep = original_dists != 0
    original_dists = original_dists[keep]

    projector = SparseRandomProjection(n_components=n_components)
    projected = projector.fit_transform(data)
    projected_dists = euclidean_distances(projected, squared=True).ravel()
    projected_dists = projected_dists[keep]

    plt.figure()
    plt.hexbin(original_dists, projected_dists, gridsize=100, cmap=plt.cm.PuBu)
    plt.xlim([0, 150])
    plt.ylim([0, 150])
    plt.xlabel("Pairwise squared distances in original space")
    plt.ylabel("Pairwise squared distances in projected space")
    plt.title("Pairwise distances distribution for n_components= 4")

    colorbar = plt.colorbar()
    colorbar.set_label('Sample pairs counts')
    #colorbar.ax.set_yticklabels(['0','250', '500', '750', '1000', '1250'])
    plt.savefig("Plots/RandomProjection/pairwisedist2.png")
class SRP:
    """Adapter exposing ``SparseRandomProjection`` through ``fit``/``predict``.

    ``fit`` is a no-op; the projection is fitted and applied inside
    ``predict`` on the rows that contain no NaN or infinite values.
    """

    def __init__(self, rfe_cv, *args, **kwargs):
        """Store the RFE flag and build the underlying projection.

        Args:
            rfe_cv: when truthy, ``predict`` raises because RFE-CV is not
                supported by this adapter.
            *args, **kwargs: forwarded to ``SparseRandomProjection``.
        """
        self.rfe = None
        self.rfe_cv = rfe_cv
        self.model = SparseRandomProjection(*args, **kwargs)

    def fit(self, X, y):
        """Do nothing; fitting happens inside ``predict``."""
        pass

    def predict(self, X):
        """Project the finite rows of ``X``; other rows come back as NaN.

        Args:
            X: 2-D array of samples.

        Returns:
            float64 array with one row per input row and one column per
            projection component. Rows of ``X`` that contained NaN or
            +/-inf are filled with NaN.

        Raises:
            Exception: if ``self.rfe_cv`` is truthy.
        """
        # Work on a float32 copy so +/-inf can be folded into NaN.
        Z = numpy.concatenate([X], axis=1)
        Z = numpy.array(Z, dtype=numpy.float32)
        Z[Z == numpy.inf] = numpy.nan
        Z[Z == -numpy.inf] = numpy.nan

        nan_mask = ~pandas.isna(Z).any(axis=1)
        X_ = X[nan_mask, :]

        # Compare the filtered row count with the input row count. The check
        # used to compare Z with X, which always match, so it never fired.
        if X_.shape[0] != X.shape[0]:
            print(
                'PREDICT: the sample contains NaNs, they were dropped\tN of dropped NaNs: {0}'
                .format(X.shape[0] - X_.shape[0]))

        if self.rfe_cv:
            # NOTE(review): the message says "PCA" although this class wraps
            # SparseRandomProjection; left as-is in case callers match on it.
            raise Exception("PCA could not be processed with RFE_CV")

        predicted = self.model.fit_transform(X_)
        Z = numpy.full(shape=(X.shape[0], predicted.shape[1]),
                       fill_value=numpy.nan,
                       dtype=numpy.float64)
        Z[nan_mask, :] = predicted
        return Z
def rpFluctuation(dims, ds, X):
    """Write ``pairwiseDistCorr`` for every (dimension, seed) pair to a CSV.

    Seeds 0-9 are crossed with ``dims``; each pair is printed as it is
    processed. The table (rows = dimensions, columns = seeds) is saved to
    ``<OUT>/<ds>_comparison.csv``.

    Args:
        dims: iterable of component counts.
        ds: dataset label used in the file name.
        X: feature matrix to project.
    """
    scores = defaultdict(dict)
    for seed, n_dims in product(range(10), dims):
        print(seed, n_dims)
        projector = SparseRandomProjection(random_state=seed, n_components=n_dims)
        projected = projector.fit_transform(X)
        scores[n_dims][seed] = pairwiseDistCorr(projected, X)

    table = pd.DataFrame(scores).T
    table.to_csv('{}/{}_comparison.csv'.format(OUT, ds))
def project(file_name, dimensions):
    """Project an svmlight file to ``dimensions`` features and save it.

    The input is loaded with ``load_svmlight_file``, projected with a
    ``SparseRandomProjection`` of density 1/3 (dense output), and written in
    svmlight format next to the input as ``<stem>-<dimensions>.mat``, where
    ``<stem>`` is ``file_name`` without its last four characters.

    Args:
        file_name: path of the svmlight file to read.
        dimensions: number of components to project to.
    """
    data = load_svmlight_file(file_name)

    # `density` is passed by keyword: everything after n_components is
    # keyword-only in current scikit-learn releases.
    projector = SparseRandomProjection(
        n_components=dimensions, density=1 / 3.0, dense_output=True)
    projected = projector.fit_transform(data[0])

    new_file_name = file_name[:-4] + '-' + str(dimensions) + '.mat'
    # The context manager flushes and closes the handle even if dumping fails.
    with open(new_file_name, 'wb') as new_file:
        dump_svmlight_file(projected, data[1], new_file)
def Random(X, labels, imgs, **kwargs):
    """Embed ``X`` in 2-D with a sparse random projection and plot it.

    Args:
        X: feature matrix to project.
        labels: passed through to ``plot_embedding``.
        imgs: passed through to ``plot_embedding``.
        **kwargs: extra keyword arguments for ``plot_embedding``.
    """
    # Random 2D projection (SparseRandomProjection, random_state=0)
    print("Computing random projection")
    started = time()
    projector = SparseRandomProjection(n_components=2, random_state=0)
    X_projected = projector.fit_transform(X)

    # The elapsed time in the title covers the fit/transform above.
    title = "Random Projection of the dataset (time %.2fs)" % (time() - started)
    plot_embedding(X_projected, labels, imgs, title, **kwargs)
def reduce_data(self, data):
    """Normalize ``data`` and reduce it to 3 random-projection components.

    ``preprocessing.Normalizer`` is applied first, then a
    ``SparseRandomProjection`` with ``n_components=3`` (no fixed seed).

    Args:
        data: feature matrix.

    Returns:
        The projected matrix produced by ``fit_transform``.
    """
    normalized = preprocessing.Normalizer().fit_transform(data)
    projector = SparseRandomProjection(n_components=3)
    return projector.fit_transform(normalized)
def engineer2(train, test):
    """Append 10 components from five decompositions to ``train`` and ``test``.

    All columns except ``UCIC_ID`` and ``Responders`` are standardized with a
    scaler fitted on ``train``. TruncatedSVD, PCA, FastICA,
    GaussianRandomProjection and SparseRandomProjection are each fitted on
    the scaled training data and applied to the scaled test data. The
    results are added in place as ``<prefix>_<i>`` columns, ``i`` in 1..10.

    Args:
        train: training DataFrame (modified in place).
        test: test DataFrame (modified in place).

    Returns:
        ``(train, test)`` with the extra columns.
    """
    feature_cols = [c for c in train.columns if c not in ['UCIC_ID', 'Responders']]

    scaler = StandardScaler().fit(train[feature_cols])
    scaled_train = scaler.transform(train[feature_cols])
    scaled_test = scaler.transform(test[feature_cols])

    n_comp = 10
    print('Starting decomposition.........\n')

    # Listed in the order in which the models are fitted.
    models = [
        ('tsvd', TruncatedSVD(n_components=n_comp, random_state=42)),
        ('pca', PCA(n_components=n_comp, random_state=420)),
        ('ica', FastICA(n_components=n_comp, random_state=2030)),
        ('grp', GaussianRandomProjection(n_components=n_comp, random_state=42)),
        ('srp', SparseRandomProjection(n_components=n_comp, random_state=42)),
    ]
    components = {}
    for prefix, model in models:
        fitted_train = model.fit_transform(scaled_train)
        components[prefix] = (fitted_train, model.transform(scaled_test))

    # Column insertion order per component index: pca, tsvd, ica, grp, srp.
    column_order = ('pca', 'tsvd', 'ica', 'grp', 'srp')
    for i in range(1, n_comp + 1):
        for prefix in column_order:
            train_part, test_part = components[prefix]
            train[prefix + '_' + str(i)] = train_part[:, i - 1]
            test[prefix + '_' + str(i)] = test_part[:, i - 1]

    del scaled_train, scaled_test
    return train, test
def getRCAData(X, dataType):
    """Project ``X`` with a sparse random projection sized by dataset type.

    Args:
        X: feature matrix.
        dataType: ``'Adult'`` selects 40 components; any other value 25.

    Returns:
        The projected matrix (no fixed seed is used).
    """
    n_components = 40 if dataType == 'Adult' else 25
    return SparseRandomProjection(n_components=n_components).fit_transform(X)
def flastVectorization(dataPoints, reduceDim=True, dim=0, eps=0.33):
    """Count-vectorize documents and optionally reduce the dimensionality.

    Args:
        dataPoints: iterable of documents for ``CountVectorizer``.
        reduceDim: when false, the full count matrix is returned.
        dim: target dimension; values ``<= 0`` select the
            Johnson-Lindenstrauss minimum for the number of documents.
        eps: distortion passed to ``johnson_lindenstrauss_min_dim`` when
            ``dim`` is not given.

    Returns:
        The count matrix, or its sparse random projection.
    """
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(dataPoints)

    if not reduceDim:
        return counts

    if dim <= 0:
        dim = johnson_lindenstrauss_min_dim(counts.shape[0], eps=eps)

    projector = SparseRandomProjection(n_components=dim)
    return projector.fit_transform(counts)
def apply_SRP(table, features, label, n_components):
    """Replace the feature columns of ``table`` with a sparse random projection.

    Features and target are separated with
    ``feature_file_processor.split_features_target``; the features are
    projected (dense output, ``random_state=420``) and recombined with the
    label column via ``feature_file_processor.generate_data_frame``.

    Args:
        table: input table; indexed with ``[[label]]``.
        features: feature specification passed to ``split_features_target``.
        label: name of the label column.
        n_components: number of projection components.

    Returns:
        Whatever ``generate_data_frame`` returns for the projected features
        and ``table[[label]]``.
    """
    from sklearn.random_projection import SparseRandomProjection
    from paje import feature_file_processor

    x, y = feature_file_processor.split_features_target(table, features, label)

    projector = SparseRandomProjection(
        n_components=n_components,
        dense_output=True,
        random_state=420,
    )
    projected = projector.fit_transform(x)

    label_frame = table[[label]]
    return feature_file_processor.generate_data_frame(projected, label_frame)
def getDR(dt_all, n_comp=12):
    """Append dimensionality-reduction components to ``dt_all``.

    The frame is min-max scaled after dropping ``y``, ``Fold``, the object
    (categorical) columns other than ``ID`` and the ``Encode_Label`` columns.
    Seven reducers are then fitted on the scaled matrix and their outputs are
    added in place as ``DR_<NAME>_<i>`` columns, ``i`` in ``1..n_comp``.

    Args:
        dt_all: combined DataFrame (modified in place).
        n_comp: number of components / clusters per reducer.

    Returns:
        ``dt_all`` with the additional columns.

    The feature-agglomeration variable had been mangled into the invalid
    target ``f*g``, which made the whole function a syntax error.
    """
    # cols
    cols_encode_label = dt_all.filter(
        regex="Encode_Label").columns.values.tolist()
    cols_cat = dt_all.drop(
        "ID", axis=1).select_dtypes(include=["object"]).columns.tolist()

    # standardize
    dt_all_norm = MinMaxScaler().fit_transform(
        dt_all.drop(["y", "Fold"] + cols_cat + cols_encode_label, axis=1))

    # tSVD
    tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
    tsvd_results = tsvd.fit_transform(dt_all_norm)

    # PCA
    pca = PCA(n_components=n_comp, random_state=420)
    pca_results = pca.fit_transform(dt_all_norm)

    # ICA
    ica = FastICA(n_components=n_comp, max_iter=5000, random_state=420)
    ica_results = ica.fit_transform(dt_all_norm)

    # GRP
    grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
    grp_results = grp.fit_transform(dt_all_norm)

    # SRP
    srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
    srp_results = srp.fit_transform(dt_all_norm)

    # NMF
    nmf = NMF(n_components=n_comp, init='nndsvdar', random_state=420)
    nmf_results = nmf.fit_transform(dt_all_norm)

    # Feature agglomeration
    feat_agglo = FeatureAgglomeration(n_clusters=n_comp, linkage='ward')
    fag_results = feat_agglo.fit_transform(dt_all_norm)

    # Append decomposition components to datasets
    for i in range(1, n_comp + 1):
        dt_all['DR_TSVD_' + str(i)] = tsvd_results[:, i - 1]
        dt_all['DR_PCA_' + str(i)] = pca_results[:, i - 1]
        dt_all['DR_ICA_' + str(i)] = ica_results[:, i - 1]
        dt_all['DR_GRP_' + str(i)] = grp_results[:, i - 1]
        dt_all['DR_SRP_' + str(i)] = srp_results[:, i - 1]
        dt_all['DR_NMF_' + str(i)] = nmf_results[:, i - 1]
        dt_all['DR_FAG_' + str(i)] = fag_results[:, i - 1]

    return (dt_all)
def SSPCA_V(X, k):
    """Return the first ``k`` right singular vectors of a sketched ``X``.

    ``X`` is sketched with a sparse random projection (``random_state=0``),
    ``B = Y.T @ X`` is formed with ``safe_sparse_dot`` and densified via
    ``.toarray()``, and the SVD of ``B`` is taken.

    The original author's cost annotations are O(M * k) for the projection
    and sketch, O(p * k^2) for the SVD, O(M * k + c_3 * p * k^2) overall.

    Args:
        X: input matrix; ``Y.T @ X`` must expose ``.toarray()``.
        k: number of projection components and of returned vectors.

    Returns:
        Array whose columns are the first ``k`` rows of ``V`` from
        ``np.linalg.svd(B)``.
    """
    projector = SparseRandomProjection(n_components=k, random_state=0)
    sketch = projector.fit_transform(X)
    small = safe_sparse_dot(sketch.T, X).toarray()
    _, _, right_vectors = np.linalg.svd(small)
    return right_vectors[:k].T
def preprocess(X, y):
    """Flatten, scale and randomly project the samples; one-hot the labels.

    Each sample is flattened, features are min-max scaled to ``[-1, 1]``, and
    the result is reduced to 5792 components with a sparse random projection
    (no fixed seed). Labels are encoded with ``one_hot``.

    Args:
        X: iterable of arrays exposing ``.flatten()``.
        y: iterable of labels accepted by ``one_hot``.

    Returns:
        ``(projected, labels)`` as NumPy arrays.
    """
    flat = np.array([sample.flatten() for sample in X])
    labels = np.array([one_hot(label) for label in y])

    scaler = MinMaxScaler(feature_range=(-1, 1)).fit(flat)
    scaled = scaler.transform(flat)
    # print(scaled.shape)  # original author's note: (173, 3840000)

    # reduce the number of features to improve performance
    projector = SparseRandomProjection(n_components=5792)
    projected = projector.fit_transform(scaled)
    return np.array(projected), labels
def rp(X_train, y_train, X_test, y_test):
    """Draw silhouette plots for GMMs on a 12-component random projection.

    A baseline ``LinearSVC`` is still fitted and scored, but the accuracy
    sweep over projection sizes that consumed ``components``, ``accuracies``
    and ``baseline`` is disabled. The original author's note on that sweep:
    "Average of 4 runs, first better than baseline ave 12 components".

    Args:
        X_train: training feature matrix.
        y_train: training labels.
        X_test: test feature matrix.
        y_test: test labels.
    """
    # Kept for the disabled accuracy sweep.
    accuracies = []
    components = np.int32(np.linspace(2, 64, 20))

    svc = LinearSVC()
    svc.fit(X_train, y_train)
    baseline = metrics.accuracy_score(svc.predict(X_test), y_test)

    projector = SparseRandomProjection(n_components=12, random_state=RAND)
    X = projector.fit_transform(X_train)

    for n_mix in (2, 3, 4):
        em = mixture.GaussianMixture(n_mix, covariance_type='full', random_state=RAND)
        plot_silhouette(em, X, title="RP, K=%d, 12 RC" % n_mix)
def get_additional_features(train, test, magic=False, ID=False):
    """Append 12 components from five decompositions, plus an optional mean feature.

    TruncatedSVD, PCA, FastICA, GaussianRandomProjection and
    SparseRandomProjection are fitted on the selected ``train`` columns and
    applied to ``test``; results are added in place as ``<prefix>_<i>``.

    Args:
        train: training DataFrame (modified in place).
        test: test DataFrame (modified in place); its columns define the
            feature set.
        magic: when equal to ``True``, add a ``magic`` column holding the
            mean of ``y`` per ``X0`` value computed on ``train``; test rows
            with an unseen ``X0`` get the mean of those per-group means.
        ID: unless equal to ``True``, the ``ID`` column is excluded from the
            features.

    Returns:
        ``(train, test)``; new merged frames when ``magic`` is enabled.
    """
    feature_cols = list(test.columns)
    # Equality with True (not truthiness) is the original contract.
    if ID != True:
        feature_cols.remove('ID')

    n_comp = 12

    # Listed in fit order, which is also the column insertion order.
    models = [
        ('tsvd', TruncatedSVD(n_components=n_comp, random_state=420)),
        ('pca', PCA(n_components=n_comp, random_state=420)),
        ('ica', FastICA(n_components=n_comp, random_state=420)),
        ('grp', GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)),
        ('srp', SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)),
    ]
    results = []
    for prefix, model in models:
        train_part = model.fit_transform(train[feature_cols])
        test_part = model.transform(test[feature_cols])
        results.append((prefix, train_part, test_part))

    for i in range(1, n_comp + 1):
        for prefix, train_part, test_part in results:
            train[prefix + '_' + str(i)] = train_part[:, i - 1]
            test[prefix + '_' + str(i)] = test_part[:, i - 1]

    if magic == True:
        group_means = train[['ID', 'X0', 'y']].groupby(['X0'])['y'].mean()
        magic_mat = pd.DataFrame({
            'X0': group_means.index,
            'magic': list(group_means)
        })
        mean_magic = magic_mat['magic'].mean()
        train = train.merge(magic_mat, on='X0', how='left')
        test = test.merge(magic_mat, on='X0', how='left')
        test['magic'] = test['magic'].fillna(mean_magic)

    return train, test
def ML_SRP(X_train, n_components, density, eps, dense_output, random_state):
    """Project ``X_train`` with a sparse random projection into a DataFrame.

    Args:
        X_train: feature matrix to fit on and transform.
        n_components, density, eps, dense_output, random_state: forwarded
            unchanged to ``SparseRandomProjection``.

    Returns:
        ``pandas.DataFrame`` built from the projected data.
    """
    from sklearn.random_projection import SparseRandomProjection
    import pandas as pd

    settings = dict(
        n_components=n_components,
        density=density,
        eps=eps,
        dense_output=dense_output,
        random_state=random_state,
    )
    projected = SparseRandomProjection(**settings).fit_transform(X_train)
    return pd.DataFrame(data=projected)
def plotProjection(data, n_samples, n_features):
    """Report and plot distance distortion for three projection sizes.

    For 300, 1000 and 10000 components the data is projected with
    ``SparseRandomProjection`` (no fixed seed). The timing, the size of the
    random matrix and the mean/std ratio of projected to original squared
    distances are printed; ``plotHexbin`` and ``plotHist`` draw the results.
    Pairs with zero original distance are excluded.

    Args:
        data: feature matrix.
        n_samples: sample count, used only in the log message.
        n_features: feature count, used only in the log message.
    """
    original_dists = euclidean_distances(data, squared=True).ravel()
    # select only non-identical samples pairs
    keep = original_dists != 0
    original_dists = original_dists[keep]

    for n_components in np.array([300, 1000, 10000]):
        started = time()
        projector = SparseRandomProjection(n_components=n_components)
        projected = projector.fit_transform(data)
        print("Projected %d samples from %d to %d in %.3fs"
              % (n_samples, n_features, n_components, time() - started))

        if hasattr(projector, 'components_'):
            matrix = projector.components_
            n_bytes = matrix.data.nbytes + matrix.indices.nbytes
            print("Random matrix with size: %.3fMB" % (n_bytes / 1e6))

        projected_dists = euclidean_distances(projected, squared=True).ravel()[keep]

        rates = projected_dists / original_dists
        print("Mean distances rate: %.2f (%.2f)"
              % (np.mean(rates), np.std(rates)))

        plotHexbin(original_dists, projected_dists, n_components)
        plotHist(rates, n_components)
def create_sector_subset(sample_n, X_output_path, Y_output_path):
    """Randomly project the sector data, sample rows and dump both matrices.

    Features and labels are loaded from fixed ``.mat`` paths. The features
    are reduced with ``SparseRandomProjection(random_state=0)``, then
    ``sample_n`` rows are chosen from a seeded permutation
    (``RandomState(0)``) and the sampled ``X`` and ``Y`` are written with
    ``dump``.

    Args:
        sample_n: number of rows to keep.
        X_output_path: destination file for the sampled features.
        Y_output_path: destination file for the sampled labels.

    The output files are opened in context managers so they are flushed and
    closed; the handles were previously never closed. ``print`` uses the
    single-argument parenthesised form, which produces the same output on
    Python 2 and Python 3.
    """
    X_path = "/cs/puls/Experiments/hxiao-test/feature-data.mat"
    Y_path = "/cs/puls/Experiments/hxiao-test/label-data.mat"
    X = loadmat(X_path)["featureData"]
    Y = loadmat(Y_path)["labelData"]

    print("Applying random projection to reduce dimension")
    print("Shape before: %r" % (X.shape,))
    transformer = SparseRandomProjection(random_state=0)
    X = transformer.fit_transform(X)
    print("Shape after: %r" % (X.shape,))

    # NOTE(review): this message is emitted although the projection above has
    # just been applied - confirm which of the two is intended.
    print("Random projection: OFF")

    rng = np.random.RandomState(0)
    print("Sample size: %d" % sample_n)
    rows = rng.permutation(X.shape[0])[:sample_n]
    X = X[rows, :]
    Y = Y[rows, :]

    with open(X_output_path, "w") as x_file:
        dump(X, x_file)
    with open(Y_output_path, "w") as y_file:
        dump(Y, y_file)
# Script excerpt (Python 2): persist the article list, build a TF-IDF matrix,
# reduce it with a sparse random projection and save the result.
# NOTE(review): `articles` and the open file handle `f` are defined before
# this excerpt.

# protocol=-1 selects the highest pickle protocol available.
cPickle.dump(articles, f, protocol=-1)
print "saving done"
print len(articles)
# TF-IDF with sub-linear term frequency; terms in more than 80% of the
# documents are dropped (max_df=0.8).
vec = TfidfVectorizer(max_df=0.8, sublinear_tf=True)
X = vec.fit_transform(articles)
print X.shape
# Library-default sparse random projection (no explicit n_components/seed).
proj = SparseRandomProjection()
X = proj.fit_transform(X)
print X.shape
sparse_save(X,"../data/tfidf.h5")
# Disabled alternative: pickle the sparse-matrix buffers one by one.
# f = open('X_data.p', 'wb')
# cPickle.dump(X.data, f, protocol=-1)
# f = open('X_indices.p', 'wb')
# cPickle.dump(X.indices, f, protocol=-1)
# f = open('X_indptr.p', 'wb')
# cPickle.dump(X.indptr, f, protocol=-1)
#X = normalize(X)
# compute the inverse of l2 norm of non-zero elements
# Script excerpt (Python 2): compare pairwise squared distances of the
# Olivetti faces before and after sparse random projection.
faces_data = fetch_olivetti_faces().data
n_samples, n_features = faces_data.shape
print "Embedding %d faces with dim %d using various random projections" % (
    n_samples, n_features)
# Projection sizes to compare.
n_components_range = np.array([50, 200, 1000])
dists = euclidean_distances(faces_data, squared=True).ravel()
# select only non-identical samples pairs
nonzero = dists != 0
dists = dists[nonzero]
for n_components in n_components_range:
    # No random_state is set, so each run draws a different projection.
    rp = SparseRandomProjection(n_components=n_components)
    projected_data = rp.fit_transform(faces_data)
    # Apply the same mask so both distance vectors stay aligned pair by pair.
    projected_dists = euclidean_distances(
        projected_data, squared=True).ravel()[nonzero]
    # 2-D histogram: original vs. projected squared distances.
    pl.figure()
    pl.hexbin(dists, projected_dists, gridsize=100)
    pl.xlabel("Pairwise squared distances in original space")
    pl.ylabel("Pairwise squared distances in projected space")
    pl.title("Pairwise distances distribution for n_components=%d" % n_components)
    cb = pl.colorbar()
    cb.set_label('Sample pairs counts')
    # Per-pair distortion ratio (projected / original).
    rates = projected_dists / dists
    # New figure; what is drawn on it lies beyond this excerpt.
    pl.figure()
def sparseRP(data):
    """Project ``data`` to ``new_dimension`` components.

    ``new_dimension`` is read from the enclosing scope; no random seed is
    fixed.

    Args:
        data: feature matrix.

    Returns:
        The projected matrix from ``SparseRandomProjection.fit_transform``.
    """
    projector = SparseRandomProjection(n_components=new_dimension)
    projected = projector.fit_transform(data)
    return projected
all_data_proc = pd.concat((df_obj_feats_freq, df_non_obj_feats), axis=1) #%% from sklearn.decomposition import PCA, FastICA from sklearn.random_projection import GaussianRandomProjection from sklearn.random_projection import SparseRandomProjection n_comp = 12 # GRP grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420) grp_results = grp.fit_transform(all_data_proc) # SRP srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420) srp_results = srp.fit_transform(all_data_proc) # PCA pca = PCA(n_components=n_comp, random_state=420) pca_results = pca.fit_transform(all_data_proc) # ICA ica = FastICA(n_components=n_comp, random_state=420) ica_results = ica.fit_transform(all_data_proc) for i in range(1, n_comp+1): all_data_proc['pca_' + str(i)] = pca_results[:,i-1] all_data_proc['ica_' + str(i)] = ica_results[:, i-1] all_data_proc['grp_' + str(i)] = grp_results[:,i-1] all_data_proc['srp_' + str(i)] = srp_results[:, i-1] df_X_train = all_data_proc[:train_len]
n_samples, n_features = data.shape print("Embedding %d samples with dim %d using various random projections" % (n_samples, n_features)) n_components_range = np.array([300, 1000, 10000]) dists = euclidean_distances(data, squared=True).ravel() # select only non-identical samples pairs nonzero = dists != 0 dists = dists[nonzero] for n_components in n_components_range: t0 = time() rp = SparseRandomProjection(n_components=n_components) projected_data = rp.fit_transform(data) print("Projected %d samples from %d to %d in %0.3fs" % (n_samples, n_features, n_components, time() - t0)) if hasattr(rp, 'components_'): n_bytes = rp.components_.data.nbytes n_bytes += rp.components_.indices.nbytes print("Random matrix with size: %0.3fMB" % (n_bytes / 1e6)) projected_dists = euclidean_distances( projected_data, squared=True).ravel()[nonzero] plt.figure() plt.hexbin(dists, projected_dists, gridsize=100, cmap=plt.cm.PuBu) plt.xlabel("Pairwise squared distances in original space") plt.ylabel("Pairwise squared distances in projected space") plt.title("Pairwise distances distribution for n_components=%d" %
# Perform Randomized Principal Components Analysis (PCA) from sklearn.decomposition import RandomizedPCA as RPCA rpca = RPCA(n_components=num_components) rpca_transformed_data_train = rpca.fit_transform(dense_trainData) rpca_transformed_data_valid = rpca.transform(dense_validData) # Perform Gaussian Random Projection from sklearn.random_projection import GaussianRandomProjection as GaussRan grp = GaussRan(n_components=num_components) grp_transformed_data_train = grp.fit_transform(dense_trainData) grp_transformed_data_valid = grp.transform(dense_validData) # Perform Sparse Random Projection from sklearn.random_projection import SparseRandomProjection as SparseRan srp = SparseRan(n_components=num_components, random_state=0) srp_transformed_data_train = srp.fit_transform(dense_trainData) srp_transformed_data_valid = srp.transform(dense_validData) # Perform classification using 1-Nearest Neighbor Classifier from sklearn.neighbors import KNeighborsClassifier # Create a subset grid to plot performance against numbers of components tsvd_max = tsvd_transformed_data_train.shape[1] plot_subset = [] length_of_plot_subset = len(plot_subset) if tsvd_max < 101: spacing = super_fine_spacing plot_subset = [] for j in arange(1, spacing - 1): plot_subset.append(j) quotient = tsvd_max / spacing
def gen_feature(train, test, n_comp=15):
    """Append decomposition features to the train and test sets.

    Five reducers (TruncatedSVD, PCA, FastICA, GaussianRandomProjection and
    SparseRandomProjection) are fitted on the training features and applied
    to the test features. Component ``i`` (1-based) of each reducer is stored
    in a new column named ``<prefix>_<i>``, where the prefix is one of
    ``pca``, ``ica``, ``tsvd``, ``grp`` or ``srp``.

    Progress lines are printed using the module-level ``start`` timestamp.

    Args:
        train: Training data; wrapped with ``pd.DataFrame``.
        test: Test data; wrapped with ``pd.DataFrame``.
        n_comp: Number of components kept per reducer. Defaults to 15, the
            value that used to be hard-coded.

    Returns:
        Tuple ``(train, test)`` of DataFrames carrying the extra columns.
    """
    train = pd.DataFrame(train)
    test = pd.DataFrame(test)

    drop_list = []
    test_drop_list = []

    # The inputs to the reducers are loop-invariant: train/test are not
    # touched again until the append loop at the end, so build them once.
    train_features = train.drop(drop_list, axis=1)
    test_features = test.drop(test_drop_list, axis=1)
    print(train_features.shape, test_features.shape)

    # (progress label, column prefix, reducer), listed in the order the
    # reducers are fitted so the sequence of fits and printed lines is kept.
    reducers = [
        ('tSVD', 'tsvd', TruncatedSVD(n_components=n_comp)),
        ('PCA', 'pca', PCA(n_components=n_comp)),
        ('ICA', 'ica', FastICA(n_components=n_comp, max_iter=10000)),
        ('GRP', 'grp', GaussianRandomProjection(n_components=n_comp, eps=0.1)),
        ('SRP', 'srp', SparseRandomProjection(n_components=n_comp,
                                              dense_output=True)),
    ]

    projected = {}
    for label, prefix, reducer in reducers:
        print(label, datetime.now() - start)
        projected[prefix] = (
            reducer.fit_transform(train_features),
            reducer.transform(test_features),
        )

    # Append decomposition components to datasets. Columns are interleaved
    # per component index in this prefix order, which fixes the resulting
    # column layout.
    column_order = ('pca', 'ica', 'tsvd', 'grp', 'srp')
    for i in range(1, n_comp + 1):
        for prefix in column_order:
            train_part, test_part = projected[prefix]
            column = prefix + '_' + str(i)
            train[column] = train_part[:, i - 1]
            test[column] = test_part[:, i - 1]

    return train, test
def gen_features(train, val, test, n_comp=15):
    """Append decomposition features to the train, validation and test sets.

    Five reducers (TruncatedSVD, PCA, FastICA, GaussianRandomProjection and
    SparseRandomProjection) are fitted on the training features only and then
    applied to the validation and test features. Component ``i`` (1-based) of
    each reducer is stored in a new column named ``<prefix>_<i>``, where the
    prefix is one of ``pca``, ``ica``, ``tsvd``, ``grp`` or ``srp``.

    Progress lines are printed using the module-level ``start`` timestamp.

    Args:
        train: Training data; wrapped with ``pd.DataFrame``.
        val: Validation data; wrapped with ``pd.DataFrame``.
        test: Test data; wrapped with ``pd.DataFrame``.
        n_comp: Number of components kept per reducer. Defaults to 15, the
            value that used to be hard-coded.

    Returns:
        Tuple ``(train, val, test)`` of DataFrames carrying the extra columns.
    """
    train = pd.DataFrame(train)
    val = pd.DataFrame(val)
    test = pd.DataFrame(test)

    # NOTE(review): a commented-out block that imputed categorical columns
    # with the mode, continuous columns with the mean, and label-encoded
    # object columns used to live here. It was dead code and has been
    # dropped; presumably callers now pass already-preprocessed numeric
    # data -- confirm against the call sites.

    drop_list = []
    test_drop_list = []

    # The inputs to the reducers are loop-invariant: the frames are not
    # touched again until the append loop at the end, so build them once.
    # Validation data uses the same drop list as the test data.
    train_features = train.drop(drop_list, axis=1)
    val_features = val.drop(test_drop_list, axis=1)
    test_features = test.drop(test_drop_list, axis=1)
    print(train_features.shape, test_features.shape)

    # (progress label, column prefix, reducer), listed in the order the
    # reducers are fitted so the sequence of fits and printed lines is kept.
    reducers = [
        ('tSVD', 'tsvd', TruncatedSVD(n_components=n_comp)),
        ('PCA', 'pca', PCA(n_components=n_comp)),
        ('ICA', 'ica', FastICA(n_components=n_comp, max_iter=10000)),
        ('GRP', 'grp', GaussianRandomProjection(n_components=n_comp, eps=0.1)),
        ('SRP', 'srp', SparseRandomProjection(n_components=n_comp,
                                              dense_output=True)),
    ]

    projected = {}
    for label, prefix, reducer in reducers:
        print(label, datetime.now() - start)
        projected[prefix] = (
            reducer.fit_transform(train_features),
            reducer.transform(val_features),
            reducer.transform(test_features),
        )

    # Append decomposition components to datasets. Columns are interleaved
    # per component index in this prefix order, which fixes the resulting
    # column layout.
    column_order = ('pca', 'ica', 'tsvd', 'grp', 'srp')
    for i in range(1, n_comp + 1):
        for prefix in column_order:
            train_part, val_part, test_part = projected[prefix]
            column = prefix + '_' + str(i)
            train[column] = train_part[:, i - 1]
            val[column] = val_part[:, i - 1]
            test[column] = test_part[:, i - 1]

    return train, val, test
# Training features with the target column "y" removed; `train` itself is not
# modified until the append loop below, so one view serves every reducer.
train_features = train.drop(["y"], axis=1)

# PCA (the `pca` estimator is created before this excerpt)
pca2_results_train = pca.fit_transform(train_features)
pca2_results_test = pca.transform(test)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_train = ica.fit_transform(train_features)
ica2_results_test = ica.transform(test)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(train_features)
grp_results_test = grp.transform(test)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True,
                             random_state=420)
srp_results_train = srp.fit_transform(train_features)
srp_results_test = srp.transform(test)

# Append decomposition components to datasets, one column per component and
# reducer, interleaved per component index in the order listed here.
# NOTE(review): the SRP results are not appended in the visible code --
# confirm whether that happens further down.
component_sources = (
    ('pca_', pca2_results_train, pca2_results_test),
    ('ica_', ica2_results_train, ica2_results_test),
    ('tsvd_', tsvd_results_train, tsvd_results_test),
    ('grp_', grp_results_train, grp_results_test),
)
for i in range(1, n_comp+1):
    suffix = str(i)
    for prefix, train_part, test_part in component_sources:
        train[prefix + suffix] = train_part[:, i-1]
        test[prefix + suffix] = test_part[:, i-1]
# Locations of the MATLAB files holding the feature matrix and the labels.
X_path = '/cs/puls/Experiments/hxiao-test/feature-data.mat'
Y_path = '/cs/puls/Experiments/hxiao-test/label-data.mat'

X = loadmat(X_path)['featureData']
y = loadmat(Y_path)['labelData']

RANDOM_PROJECTION_FLAG = True

if RANDOM_PROJECTION_FLAG:
    from sklearn.random_projection import SparseRandomProjection

    # Single-argument print(...) calls behave the same on Python 2 and 3;
    # the former print statements were a syntax error on Python 3.
    print("Applying random projection to reduce dimension")
    print("Shape before: %r" % (X.shape, ))
    # NOTE(review): no random_state is passed, so the projection differs
    # between runs even though the sampling below is seeded -- confirm intent.
    transformer = SparseRandomProjection()
    X = transformer.fit_transform(X)
    print("Shape after: %r" % (X.shape, ))

# sample subset of all the data
rng = np.random.RandomState(0)
# Clamp to the available rows so sample_n always equals the number of rows
# actually kept below, including when the data set has fewer than 10000 rows.
sample_n = min(10000, X.shape[0])
rows = rng.permutation(X.shape[0])[:sample_n]
X = X[rows, :]
y = y[rows, :]

# sample train and test
train_ratio = 0.8
train_n = int(sample_n*train_ratio)
rows = rng.permutation(sample_n)