class VCoder(object):
    """Encodes dense input vectors into integer sketch bucket indices.

    Pipeline: standardize -> random-project to 16*n_sketches dimensions ->
    binarize -> pack bits into bytes -> reinterpret byte pairs as uint16 ->
    reduce modulo sketch_dim to get bucket ids.

    NOTE(review): relies on `self.init_biases` and `self.discretize`, which
    are not defined in this excerpt — presumably provided elsewhere in the
    class; confirm before reuse.
    """

    def __init__(self, n_sketches, sketch_dim, input_dim):
        # n_sketches: number of sketches per vector; sketch_dim: bucket count
        # per sketch; input_dim: dimensionality of the raw input vectors.
        self.n_sketches = n_sketches
        self.sketch_dim = sketch_dim
        self.input_dim = input_dim
        self.standard_scaler = StandardScaler()
        # A dense Gaussian projection is affordable for smaller inputs; fall
        # back to a sparse projection for high-dimensional data.
        if self.input_dim < 10000:
            self.random_projection = GaussianRandomProjection(n_components = 16*n_sketches)
        else:
            self.random_projection = SparseRandomProjection(n_components = 16*n_sketches, density = 1/3.0)

    def fit(self, v):
        # Fit scaler and projection on the training vectors, then initialise
        # the discretization biases from the projected data.
        self.standard_scaler = self.standard_scaler.fit(v)
        v = self.standard_scaler.transform(v)
        self.random_projection = self.random_projection.fit(v)
        v = self.random_projection.transform(v)
        self.init_biases(v)

    def transform(self, v):
        v = self.standard_scaler.transform(v)
        v = self.random_projection.transform(v)
        v = self.discretize(v)
        # Pack the binarized rows into bytes, then reinterpret each pair of
        # bytes as a uint16 (native byte order — TODO confirm portability)
        # and reduce modulo sketch_dim to obtain valid bucket indices.
        v = np.packbits(v, axis=-1)
        v = np.frombuffer(np.ascontiguousarray(v), dtype=np.uint16).reshape(v.shape[0], -1) % self.sketch_dim
        return v
def rp_experiment(X, y, name, dims):
    """Sweep SparseRandomProjection sizes over 10 random seeds.

    For every (seed, dimension) pair a projection is fitted on X, and the
    reconstruction error plus the pairwise-distance correlation is recorded.
    Per-dimension averages are written to results/RP/<name>_metrics.csv.

    Args:
        X (Numpy.Array): Attributes.
        y: unused (kept for call-site compatibility).
        name (str): Dataset name.
        dims (list(int)): List of component number values.
    """
    recon = defaultdict(dict)
    dist_corr = defaultdict(dict)
    for seed in range(10):
        for dim in dims:
            proj = SparseRandomProjection(random_state=seed, n_components=dim)
            proj.fit(X)
            recon[dim][seed] = reconstruction_error(proj, X)
            dist_corr[dim][seed] = pairwise_dist_corr(proj.transform(X), X)

    # Average across the 10 seeds for every dimension and label the columns.
    recon_df = pd.DataFrame(pd.DataFrame(recon).T.mean(axis=1)).rename(columns={0: 'recon_error'})
    corr_df = pd.DataFrame(pd.DataFrame(dist_corr).T.mean(axis=1)).rename(columns={0: 'pairwise_dc'})
    metrics = pd.concat((recon_df, corr_df), axis=1)

    # save results as CSV
    resfile = get_abspath('{}_metrics.csv'.format(name), 'results/RP')
    metrics.to_csv(resfile, index_label='n')
class SparseRandomProjectionImpl():
    """Thin fit/transform facade over the wrapped scikit-learn model."""

    def __init__(self, n_components='auto', density='auto', eps=0.1,
                 dense_output=False, random_state=None):
        # Record the hyperparameters and build the underlying model once.
        self._hyperparams = dict(
            n_components=n_components,
            density=density,
            eps=eps,
            dense_output=dense_output,
            random_state=random_state,
        )
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model; forwards y only when one is supplied."""
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def transform(self, X):
        """Project X with the fitted wrapped model."""
        return self._wrapped_model.transform(X)
def rp(train, test, y_train, y_test):
    """Project to 12 sparse random components and train/evaluate an MLP."""
    projector = SparseRandomProjection(n_components=12)
    X_train = projector.fit_transform(train)
    X_test = projector.transform(test)

    # One shared network configuration for both runs below.
    mlp_kwargs = dict(solver='sgd', hidden_layer_sizes=(70, ), random_state=23,
                      shuffle=True, activation='relu', learning_rate_init=0.15,
                      alpha=0.45)

    run_analysis(
        X_train, y_train, MLPClassifier(**mlp_kwargs),
        "NN with lrate=0.15, 70 units in hidden layer, alpha 0.45, RP(12)")

    # Fit a fresh classifier and report confusion matrices on the test set.
    classifier = MLPClassifier(**mlp_kwargs)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    print(confusion_matrix(y_test, y_pred))
    plot_confusion_matrix(y_test, y_pred, title="NN (70,) lrate=0.15, RP(12)")
def rp(X_train, X_test):
    """Random-projection study on the Churn data: JL bound report, accuracy
    sweep over 1..19 components vs a LinearSVC baseline, then silhouette
    diagnostics of KMeans on a 5-component projection.

    NOTE(review): reads y_train, y_test, RAND and plot_silhouette from the
    enclosing module scope — they are not parameters; confirm they are
    defined wherever this is called.
    """
    num_components = johnson_lindenstrauss_min_dim(n_samples=X_train.shape[0], eps=0.1)
    print(num_components)
    print("# features: ", X_train.shape[1], " JL min dim:", num_components)
    print("JL number > #features so cant make any JL guarentees")
    # Of course not! It simply means that we can’t make any assumptions regarding the preservation of pairwise distances between data points.
    accuracies = []
    # Candidate projection sizes 1..19.
    components = np.int32(np.linspace(1, 19, 19))
    # Baseline accuracy: linear SVM on the raw (un-projected) features.
    model = LinearSVC()
    model.fit(X_train, y_train)
    baseline = metrics.accuracy_score(model.predict(X_test), y_test)
    # loop over the projection sizes
    for comp in components:
        # create the random projection
        sp = SparseRandomProjection(n_components=comp)
        X = sp.fit_transform(X_train)
        # train a classifier on the sparse random projection
        # TODO this is wrong.. needs to be KMeans
        model = LinearSVC(max_iter=1000)
        model.fit(X, y_train)
        # evaluate the model and update the list of accuracies
        test = sp.transform(X_test)
        accuracies.append(metrics.accuracy_score(model.predict(test), y_test))
    # create the figure
    plt.figure()
    plt.title("Accuracy of Sparse Random Projection on Churn")
    plt.xlabel("# of Components")
    plt.ylabel("Accuracy")
    plt.xlim([1, 20])
    plt.ylim([0, 1.0])
    # plot the baseline and random projection accuracies
    plt.plot(components, [baseline] * len(accuracies), color="r")
    plt.plot(components, accuracies)
    plt.show()
    # average looks to be around 5 components in RP to best the baseline
    sp = SparseRandomProjection(n_components = 5)
    X_transformed = sp.fit_transform(X_train)
    # Silhouette diagnostics for k=2 and k=3 clusters on the projected data.
    km = KMeans(n_clusters=2, init='k-means++', n_init=10, max_iter=300, random_state=RAND)
    plot_silhouette(km, X_transformed, title="SRP(5) KM(2)")
    km = KMeans(n_clusters=3, init='k-means++', n_init=10, max_iter=300, random_state=RAND)
    plot_silhouette(km, X_transformed, title="SRP(5) KM(3)")
def random_project(weight, channel_num):
    """Sparse-random-project a 4-D weight tensor flattened to 2-D.

    The target dimensionality is channel_num times the two trailing sizes of
    `weight`; the tensor is moved to CPU and flattened per output filter.
    """
    flat = weight.cpu().clone().view(weight.size(0), -1)
    target_dim = channel_num * weight.size(2) * weight.size(3)
    projector = SparseRandomProjection(n_components=target_dim)
    return projector.fit_transform(flat)
def engineer2(train, test):
    """Append 10-component tSVD/PCA/ICA/GRP/SRP features to train and test.

    Features are standardized first; ID/target columns are excluded from the
    decomposition inputs. New columns are added to both frames in place.
    """
    myfeats = [f for f in train.columns if f not in ['UCIC_ID','Responders']]

    scaler = StandardScaler().fit(train[myfeats])
    dim_train = scaler.transform(train[myfeats])
    dim_test = scaler.transform(test[myfeats])

    n_comp = 10
    print('Starting decomposition.........\n')

    # Fit order: tSVD, PCA, ICA, GRP, SRP (seeds as configured per method).
    reducers = [
        ('tsvd', TruncatedSVD(n_components=n_comp, random_state=42)),
        ('pca', PCA(n_components=n_comp, random_state=420)),
        ('ica', FastICA(n_components=n_comp, random_state=2030)),
        ('grp', GaussianRandomProjection(n_components=n_comp, random_state=42)),
        ('srp', SparseRandomProjection(n_components=n_comp, random_state=42)),
    ]
    projected = {}
    for tag, reducer in reducers:
        projected[tag] = (reducer.fit_transform(dim_train), reducer.transform(dim_test))

    # Interleave columns: pca_i, tsvd_i, ica_i, grp_i, srp_i for each index i.
    for i in range(1, n_comp + 1):
        for tag in ('pca', 'tsvd', 'ica', 'grp', 'srp'):
            tr_mat, te_mat = projected[tag]
            train[tag + '_' + str(i)] = tr_mat[:, i-1]
            test[tag + '_' + str(i)] = te_mat[:, i-1]

    del dim_train, dim_test
    return train, test
def generate(self, train, val, test, n_comps):
    """Fit a SparseRandomProjection on `train` and append its components as
    named feature columns to train, val and test.

    Args:
        train, val, test: DataFrames receiving the new columns in place.
        n_comps: number of projected components to generate.

    Returns:
        The (train, val, test) tuple with n_comps new columns each.
    """
    decomposer = SparseRandomProjection(n_components=n_comps, random_state=1234)
    results_train = decomposer.fit_transform(train)
    # Bug fix: previously the projection was *refit* on the validation set
    # (fit_transform), which only coincidentally matched the train-fitted
    # matrix when shapes agreed. Reuse the projection fitted on train.
    results_val = decomposer.transform(val)
    results_test = decomposer.transform(test)
    for i in range(1, n_comps + 1):
        train[self.featurename(i)] = results_train[:, i - 1]
        val[self.featurename(i)] = results_val[:, i - 1]
        test[self.featurename(i)] = results_test[:, i - 1]
    return (train, val, test)
def test_SparseRandomProjection_output_representation():
    for srp_cls in all_SparseRandomProjection:
        # dense_output=True forces a dense ndarray for dense AND sparse input
        proj = srp_cls(n_components=10, dense_output=True, random_state=0)
        proj.fit(data)
        assert isinstance(proj.transform(data), np.ndarray)
        sparse_data = sp.csr_matrix(data)
        assert isinstance(proj.transform(sparse_data), np.ndarray)

        # dense_output=False: output representation follows the input type
        proj = srp_cls(n_components=10, dense_output=False, random_state=0).fit(data)
        assert isinstance(proj.transform(data), np.ndarray)
        assert sp.issparse(proj.transform(sparse_data))
def test_SparseRandomProjection_output_representation():
    for projection_cls in all_SparseRandomProjection:
        sparse_data = sp.csr_matrix(data)

        # With dense_output=True the projection always yields dense arrays,
        # regardless of whether the input is dense or sparse.
        dense_rp = projection_cls(n_components=10, dense_output=True, random_state=0)
        dense_rp.fit(data)
        assert isinstance(dense_rp.transform(data), np.ndarray)
        assert isinstance(dense_rp.transform(sparse_data), np.ndarray)

        # With dense_output=False the output representation mirrors the input:
        # dense stays dense, sparse stays sparse.
        lazy_rp = projection_cls(n_components=10, dense_output=False, random_state=0).fit(data)
        assert isinstance(lazy_rp.transform(data), np.ndarray)
        assert sp.issparse(lazy_rp.transform(sparse_data))
def get_additional_features(train, test, magic=False, ID=False):
    """Append 12-component tSVD/PCA/ICA/GRP/SRP features (and optionally a
    target-mean 'magic' feature keyed on X0) to train and test.

    Args:
        train, test: DataFrames; features are taken from test's columns.
        magic: if True, merge the per-X0 mean of train['y'] into both frames
            as a 'magic' column; unseen X0 levels in test get the global mean.
        ID: if True, keep the 'ID' column among the decomposition inputs.

    Returns:
        The (train, test) pair with the new columns added.
    """
    col = list(test.columns)
    if ID != True:
        col.remove('ID')
    n_comp = 12
    # tSVD
    tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
    tsvd_results_train = tsvd.fit_transform(train[col])
    tsvd_results_test = tsvd.transform(test[col])
    # PCA
    pca = PCA(n_components=n_comp, random_state=420)
    pca2_results_train = pca.fit_transform(train[col])
    pca2_results_test = pca.transform(test[col])
    # ICA
    ica = FastICA(n_components=n_comp, random_state=420)
    ica2_results_train = ica.fit_transform(train[col])
    ica2_results_test = ica.transform(test[col])
    # GRP
    grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
    grp_results_train = grp.fit_transform(train[col])
    grp_results_test = grp.transform(test[col])
    # SRP
    srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
    srp_results_train = srp.fit_transform(train[col])
    srp_results_test = srp.transform(test[col])
    # Append one column per component and per method, interleaved by index.
    for i in range(1, n_comp + 1):
        train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
        test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]
        train['pca_' + str(i)] = pca2_results_train[:, i - 1]
        test['pca_' + str(i)] = pca2_results_test[:, i - 1]
        train['ica_' + str(i)] = ica2_results_train[:, i - 1]
        test['ica_' + str(i)] = ica2_results_test[:, i - 1]
        train['grp_' + str(i)] = grp_results_train[:, i - 1]
        test['grp_' + str(i)] = grp_results_test[:, i - 1]
        train['srp_' + str(i)] = srp_results_train[:, i - 1]
        test['srp_' + str(i)] = srp_results_test[:, i - 1]
    if magic == True:
        # Per-X0 mean of the target ("magic" feature; leaks y into features).
        magic_mat = train[['ID', 'X0', 'y']]
        magic_mat = magic_mat.groupby(['X0'])['y'].mean()
        magic_mat = pd.DataFrame({
            'X0': magic_mat.index,
            'magic': list(magic_mat)
        })
        mean_magic = magic_mat['magic'].mean()
        train = train.merge(magic_mat, on='X0', how='left')
        test = test.merge(magic_mat, on='X0', how='left')
        # Unseen X0 levels in test have no group mean; fill with the global.
        test['magic'] = test['magic'].fillna(mean_magic)
    return train, test
def rp(train, test, y_train, y_test):
    """Sweep SparseRandomProjection sizes on the Sonar data, plot LinearSVC
    accuracy against a raw-feature baseline, then run Gaussian-mixture
    silhouette diagnostics at 30 components.
    """
    # Baseline: linear SVM on the raw (un-projected) features.
    model = LinearSVC()
    model.fit(train, y_train)
    # Bug fix: score the `test` parameter — the old code referenced an
    # undefined global X_test here and below.
    baseline = metrics.accuracy_score(model.predict(test), y_test)
    accuracies = []
    components = np.int32(np.linspace(2, 60, 20))
    # loop over the projection sizes
    for comp in components:
        # create the random projection
        sp = SparseRandomProjection(n_components=comp)
        X_proj = sp.fit_transform(train)
        # train a classifier on the sparse random projection
        model = LinearSVC()
        model.fit(X_proj, y_train)
        # Bug fix: project the held-out `test` argument without shadowing the
        # parameter (the old code overwrote `test` and read undefined X_test).
        test_proj = sp.transform(test)
        accuracies.append(metrics.accuracy_score(model.predict(test_proj), y_test))
    # create the figure
    plt.figure()
    plt.title("Accuracy of Sparse Rand Projection on Sonar (EM, GMM)")
    plt.xlabel("# of Components")
    plt.ylabel("Accuracy")
    plt.xlim([2, 64])
    plt.ylim([0, 1.0])
    # plot the baseline and random projection accuracies
    plt.plot(components, [baseline] * len(accuracies), color="r")
    plt.plot(components, accuracies)
    plt.show()
    # random pick 30 as the best number of Random components
    sp = SparseRandomProjection(n_components=30)
    X_train = sp.fit_transform(train)
    # Silhouette plots for 2-, 3- and 4-component Gaussian mixtures.
    for k in (2, 3, 4):
        gmm = mixture.GaussianMixture(k, covariance_type='full', random_state=RAND)
        gmm.fit(X_train)
        plot_silhouette(gmm, X_train, title="RP(30), GMM({})".format(k))
class SparseRandomProjectionImpl:
    """fit/transform facade delegating every call to an underlying Op model."""

    def __init__(self, **hyperparams):
        # Keep the raw hyperparameters around for introspection.
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model; y is forwarded only when provided."""
        if y is None:
            self._wrapped_model.fit(X)
        else:
            self._wrapped_model.fit(X, y)
        return self

    def transform(self, X):
        """Project X using the fitted wrapped model."""
        return self._wrapped_model.transform(X)
def reducer_rand_proj_sparse(data, params):
    """Fit a SparseRandomProjection on data['X_train'] and return a deep copy
    of `data` with X_train and X_valid projected.

    Args:
        data: dict with at least 'X_train' and 'X_valid' arrays.
        params: dict with 'n_components'; defaults to {'n_components': 5}
            when None.

    Returns:
        A deep copy of `data` with projected feature matrices.
    """
    if params is None:
        params = {'n_components': 5}
    X = data['X_train']
    # Fix: dropped the unused `y = data['y_train']` lookup — fitting an
    # unsupervised projection needs only the features.
    reducer = SparseRandomProjection(n_components=params['n_components'])
    reducer.fit(X)
    do = deepcopy(data)
    do['X_train'] = reducer.transform(data['X_train'])
    do['X_valid'] = reducer.transform(data['X_valid'])
    return do
class SparseRandomProjectionSLFN(SLFN):
    """Single hidden layer whose weights come from a fixed sparse random
    projection fitted on the training data, followed by an activation."""

    def __init__(self, X, n_neurons, density=0.1, ufunc=np.tanh, random_state=None):
        self.n_neurons = n_neurons
        self.ufunc = ufunc
        projection = SparseRandomProjection(
            n_components=n_neurons,
            density=density,
            dense_output=True,
            random_state=random_state,
        )
        # fit() returns the estimator itself, so this stores the fitted one.
        self.projection = projection.fit(X)

    def transform(self, X):
        # Random projection first, then the elementwise activation function.
        return self.ufunc(self.projection.transform(X))
def process_file(file, model='distilbert-base-uncased', dim_reduction='auto', output_path=None):
    """Embed every JSON-lines entry of `file` with a transformers
    feature-extraction pipeline, optionally fit and apply a
    SparseRandomProjection, and save vectors, reducer and metadata.

    NOTE(review): dim_reduction defaults to the string 'auto', which is
    forwarded verbatim to SparseRandomProjection(n_components=...); only
    None disables reduction entirely — confirm 'auto' is intended.
    """
    # establish conventional file names for output
    save_dir = pathlib.Path(output_path) if output_path else _default_output_dir
    vec_outpath = save_dir / f'{pathlib.Path(file).stem}_{model}_{dim_reduction}.npy'
    dim_reducer_outpath = save_dir / f'{pathlib.Path(file).stem}_{model}_{dim_reduction}.reducer.pkl'
    metadata_outpath = save_dir / f'{pathlib.Path(file).stem}_{model}_{dim_reduction}.metadata.json'
    # keep track of config
    metadata = {
        'model': model,
        'source_file': file,
        'embeddings_file': str(vec_outpath),  # filled in later
        'dim_reduction': dim_reduction,
        'dim_reduction_transformer_file': str(dim_reducer_outpath) if dim_reduction else None
    }
    language_model = pipeline(task='feature-extraction', model=model)
    embedded_entries = []
    # Stream the file one line at a time; each line holds one JSON entry.
    with open(file, 'r') as f:
        current_line = f.readline()
        while len(current_line):
            entry = process_entry(json.loads(current_line), language_model)
            embedded_entries.append(entry)
            current_line = f.readline()
    entries_vec = np.stack(embedded_entries, axis=0)
    print(f'Processed {len(embedded_entries)} from file {file}')
    dim_reducer = None
    if dim_reduction is not None:
        # Fit the reducer on this file's embeddings and project them.
        dim_reducer = SparseRandomProjection(n_components=dim_reduction)
        dim_reducer.fit(entries_vec)
        entries_vec = dim_reducer.transform(entries_vec)
        # save trained dim reducer
        with open(str(dim_reducer_outpath), 'wb') as f_out:
            pickle.dump(dim_reducer, f_out)
    # save embeddings
    np.save(vec_outpath, entries_vec)
    # save metadata
    with open(str(metadata_outpath), 'w') as f_out:
        json.dump(metadata, f_out)
class DReduction:
    """Bundle of five dimensionality reducers (PCA, tSVD, ICA, GRP, SRP)
    fitted side by side and emitted as one wide DataFrame of components."""

    # Class-level defaults; each is replaced per instance in __init__.
    _N_COMP = 0  ### Number of decomposition components ###
    _pca = 0
    _tsvd = 0
    _ica = 0
    _grp = 0
    _srp = 0

    def __init__(self, nComp):
        self._N_COMP = nComp
        self._pca = PCA(n_components=self._N_COMP, random_state=17)
        self._tsvd = TruncatedSVD(n_components=self._N_COMP, random_state=17)
        self._ica = FastICA(n_components=self._N_COMP, random_state=17)
        self._grp = GaussianRandomProjection(n_components=self._N_COMP, eps=0.1, random_state=17)
        self._srp = SparseRandomProjection(n_components=self._N_COMP, dense_output=True, random_state=17)

    def fit(self, X):
        """Fit every reducer on the same matrix X."""
        for reducer in (self._pca, self._tsvd, self._ica, self._grp, self._srp):
            reducer.fit(X)

    def transform(self, X):
        """Project X with every reducer and interleave the component columns
        (pca_i, tsvd_i, ica_i, grp_i, srp_i for each index i)."""
        projections = [
            ('pca', self._pca.transform(X)),
            ('tsvd', self._tsvd.transform(X)),
            ('ica', self._ica.transform(X)),
            ('grp', self._grp.transform(X)),
            ('srp', self._srp.transform(X)),
        ]
        out = pd.DataFrame()
        for comp in range(1, self._N_COMP + 1):
            for tag, mat in projections:
                out[tag + '_' + str(comp)] = mat[:, comp - 1]
        return out
def rp(X_train, X_test):
    """Sweep sparse-random-projection sizes (2..64) on the Sonar data and
    plot LinearSVC accuracy against a raw-feature baseline.

    NOTE(review): y_train and y_test are read from the enclosing module
    scope, not passed as parameters — confirm they exist at the call site.
    """
    num_components = johnson_lindenstrauss_min_dim(n_samples=X_train.shape[0], eps=0.1)
    print(num_components)
    print("# features: ", X_train.shape[1], " JL min dim:", num_components)
    print("JL number > #features so cant make any JL guarentees")
    # Of course not! It simply means that we can’t make any assumptions regarding the preservation of pairwise distances between data points.
    accuracies = []
    components = np.int32(np.linspace(2, 64, 20))
    # Baseline accuracy: linear SVM on the raw (un-projected) features.
    model = LinearSVC()
    model.fit(X_train, y_train)
    baseline = metrics.accuracy_score(model.predict(X_test), y_test)
    # loop over the projection sizes
    for comp in components:
        # create the random projection
        sp = SparseRandomProjection(n_components=comp)
        X = sp.fit_transform(X_train)
        # train a classifier on the sparse random projection
        model = LinearSVC()
        model.fit(X, y_train)
        # evaluate the model and update the list of accuracies
        test = sp.transform(X_test)
        accuracies.append(metrics.accuracy_score(model.predict(test), y_test))
    # create the figure
    plt.figure()
    plt.title("Accuracy of Sparse Projection on Sonar")
    plt.xlabel("# of Components")
    plt.ylabel("Accuracy")
    plt.xlim([2, 64])
    plt.ylim([0, 1.0])
    # plot the baseline and random projection accuracies
    plt.plot(components, [baseline] * len(accuracies), color="r")
    plt.plot(components, accuracies)
    plt.show()
def test_random_sparse_encoder_load():
    """Round-trip a pickled SparseRandomProjection through TransformEncoder
    and check the encoder reproduces the model's transform exactly."""
    train_data = np.random.rand(2000, input_dim)
    from sklearn.random_projection import SparseRandomProjection
    model = SparseRandomProjection(n_components=target_output_dim)
    filename = 'random_sparse_model.model'
    # Fix: close the pickle file deterministically — the file handle was
    # previously leaked, which can leave an unflushed file on some platforms.
    with open(filename, 'wb') as f:
        pickle.dump(model.fit(train_data), f)
    encoder = TransformEncoder(model_path=filename)
    test_data = np.random.rand(10, input_dim)
    encoded_data = encoder.encode(test_data)
    transformed_data = model.transform(test_data)
    assert encoded_data.shape == (test_data.shape[0], target_output_dim)
    assert type(encoded_data) == np.ndarray
    np.testing.assert_almost_equal(transformed_data, encoded_data)
    save_and_load(encoder, False)
    save_and_load_config(encoder, False, train_data)
    # Clean up everything the test wrote to disk.
    rm_files([encoder.save_abspath, encoder.config_abspath, filename])
def run_rp(X, y, n_components):
    """Random-projection experiment: split X/y, project the features with a
    SparseRandomProjection, then run the k-means and Gaussian-mixture
    evaluations on the projected data.

    Args:
        X, y: full feature matrix and labels.
        n_components: target dimensionality of the projection.

    Returns:
        (kmeans_df, gm_df, model, km_model, gm_model)
    """
    LOGGER.info('rp...')
    split_ratio = 0.33
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=split_ratio, random_state=0)
    LOGGER.debug('train test split: {}'.format(split_ratio))
    # Bug fix: honor the n_components argument — it was previously hard-coded
    # to X_train.shape[1], i.e. no dimensionality reduction happened at all.
    model = SparseRandomProjection(n_components=n_components, random_state=0)
    X_train_rp = model.fit_transform(X_train)
    X_test_rp = model.transform(X_test)
    kmeans_df, choose_df, km_model = run_kmeans(X_train_rp, X_test_rp, y_train, y_test)
    gm_df, gm_model = run_gm(X_train_rp, X_test_rp, y_train, y_test)
    return kmeans_df, gm_df, model, km_model, gm_model
# Baseline: linear SVM accuracy on the raw (un-projected) Digits features.
# NOTE(review): trainData/trainTarget/testData/testTarget, `components` and
# `accuracies` come from earlier in this script — confirm they are defined.
model = LinearSVC()
model.fit(trainData, trainTarget)
baseline = metrics.accuracy_score(model.predict(testData), testTarget)
# loop over the projection sizes
for comp in components:
    # create the random projection
    sp = SparseRandomProjection(n_components = comp)
    X_new = sp.fit_transform(trainData)
    # train a classifier on the sparse random projection
    model = LinearSVC()
    model.fit(X_new, trainTarget)
    # evaluate the model and update the list of accuracies
    test = sp.transform(testData)
    accuracies.append(metrics.accuracy_score(model.predict(test), testTarget))
# create the figure
plt.figure()
plt.suptitle("Accuracy of Sparse Projection on Digits")
plt.xlabel("# of Components")
plt.ylabel("Accuracy")
plt.ylim([0, 1.0])
# plot the baseline and random projection accuracies
plt.plot(components, [baseline] * len(accuracies), color = "r")
plt.plot(components, accuracies)
plt.show()
from sklearn.decomposition import RandomizedPCA as RPCA rpca = RPCA(n_components=num_components) rpca_transformed_data_train = rpca.fit_transform(dense_trainData) rpca_transformed_data_valid = rpca.transform(dense_validData) # Perform Gaussian Random Projection from sklearn.random_projection import GaussianRandomProjection as GaussRan grp = GaussRan(n_components=num_components) grp_transformed_data_train = grp.fit_transform(dense_trainData) grp_transformed_data_valid = grp.transform(dense_validData) # Perform Sparse Random Projection from sklearn.random_projection import SparseRandomProjection as SparseRan srp = SparseRan(n_components=num_components, random_state=0) srp_transformed_data_train = srp.fit_transform(dense_trainData) srp_transformed_data_valid = srp.transform(dense_validData) # Perform classification using 1-Nearest Neighbor Classifier from sklearn.neighbors import KNeighborsClassifier # Create a subset grid to plot performance against numbers of components tsvd_max = tsvd_transformed_data_train.shape[1] plot_subset = [] length_of_plot_subset = len(plot_subset) if tsvd_max < 101: spacing = super_fine_spacing plot_subset = [] for j in arange(1, spacing - 1): plot_subset.append(j) quotient = tsvd_max / spacing for j in arange(1, quotient + 1):
def DecomposedFeatures(train, test, val, total, addtrain, addtest,
                       use_pca = 0.0, use_tsvd = 0.0, use_ica = 0.0,
                       use_fa = 0.0, use_grp = 0.0, use_srp = 0.0,
                       use_KPCA = 0.0, kernal="rbf"):
    """Fit the selected decompositions on `total` and project train/test/val.

    Each use_* argument, when > 0, enables that decomposition with
    int(use_* * train.shape[1]) + 1 components. use_grp < 0 instead selects
    n_components='auto' with eps=abs(use_grp). Results (plus the optional
    addtrain/addtest passthrough blocks) are concatenated column-wise and
    returned as three NaN-free DataFrames: (train, test, val).
    """
    print("\nStart decomposition process...")
    train_decomposed = []
    test_decomposed = []
    val_decomposed = []
    if addtrain is not None:
        train_decomposed = [addtrain]
        # NOTE(review): the raw `val` block is appended as the counterpart of
        # addtrain — presumably an `addval` argument was intended; confirm.
        val_decomposed = [val]
    if addtest is not None:
        test_decomposed = [addtest]

    if use_pca > 0.0:
        print("PCA")
        N_COMP = int(use_pca * train.shape[1]) + 1
        pca = PCA(n_components=N_COMP, whiten=True, svd_solver="full", random_state=42)
        pca.fit(total)
        train_decomposed.append(pca.transform(train))
        test_decomposed.append(pca.transform(test))
        val_decomposed.append(pca.transform(val))

    if use_tsvd > 0.0:
        print("tSVD")
        N_COMP = int(use_tsvd * train.shape[1]) + 1
        tsvd = TruncatedSVD(n_components=N_COMP, random_state=42)
        tsvd.fit(total)
        train_decomposed.append(tsvd.transform(train))
        test_decomposed.append(tsvd.transform(test))
        val_decomposed.append(tsvd.transform(val))

    if use_ica > 0.0:
        print("ICA")
        N_COMP = int(use_ica * train.shape[1]) + 1
        ica = FastICA(n_components=N_COMP, random_state=42)
        ica.fit(total)
        train_decomposed.append(ica.transform(train))
        test_decomposed.append(ica.transform(test))
        val_decomposed.append(ica.transform(val))

    if use_fa > 0.0:
        print("FA")
        N_COMP = int(use_fa * train.shape[1]) + 1
        fa = FactorAnalysis(n_components=N_COMP, random_state=42)
        fa.fit(total)
        train_decomposed.append(fa.transform(train))
        test_decomposed.append(fa.transform(test))
        val_decomposed.append(fa.transform(val))

    if use_grp != 0.0:
        print("GRP")
        if use_grp > 0.0:
            N_COMP = int(use_grp * train.shape[1]) + 1
            eps = 10
        else:
            # Negative use_grp: let sklearn pick the size from the JL bound.
            N_COMP = "auto"
            eps = abs(use_grp)
        grp = GaussianRandomProjection(n_components=N_COMP, eps=eps, random_state=42)
        grp.fit(total)
        train_decomposed.append(grp.transform(train))
        test_decomposed.append(grp.transform(test))
        val_decomposed.append(grp.transform(val))

    if use_srp > 0.0:
        print("SRP")
        N_COMP = int(use_srp * train.shape[1]) + 1
        srp = SparseRandomProjection(n_components=N_COMP, dense_output=True, random_state=42)
        srp.fit(total)
        train_decomposed.append(srp.transform(train))
        test_decomposed.append(srp.transform(test))
        # Bug fix: the validation block was projected with `pca` (a
        # copy-paste slip that also crashed whenever use_pca was 0.0);
        # project it with the fitted SRP instead.
        val_decomposed.append(srp.transform(val))

    if use_KPCA > 0.0:
        print("KPCA")
        N_COMP = int(use_KPCA * train.shape[1]) + 1
        pls = KernelPCA(n_components=N_COMP, kernel=kernal)
        pls.fit(total)
        train_decomposed.append(pls.transform(train))
        test_decomposed.append(pls.transform(test))
        val_decomposed.append(pls.transform(val))

    gc.collect()
    print("Append decomposition components together...")
    train_decomposed = np.concatenate(train_decomposed, axis=1)
    test_decomposed = np.concatenate(test_decomposed, axis=1)
    val_decomposed = np.concatenate(val_decomposed, axis=1)
    train_with_only_decomposed_features = pd.DataFrame(train_decomposed)
    test_with_only_decomposed_features = pd.DataFrame(test_decomposed)
    val_with_only_decomposed_features = pd.DataFrame(val_decomposed)

    # Remove any NA
    train_with_only_decomposed_features = train_with_only_decomposed_features.fillna(0)
    test_with_only_decomposed_features = test_with_only_decomposed_features.fillna(0)
    val_with_only_decomposed_features = val_with_only_decomposed_features.fillna(0)
    return train_with_only_decomposed_features, test_with_only_decomposed_features, val_with_only_decomposed_features
def gen_feature(train, test):
    """Append 15-component tSVD/PCA/ICA/GRP/SRP features to train and test.

    Fits each reducer on train, projects test with the fitted reducer, and
    adds the components as interleaved columns (pca_i, ica_i, tsvd_i, grp_i,
    srp_i per index i) to both frames.
    """
    train = pd.DataFrame(train)
    test = pd.DataFrame(test)
    n_comp = 15
    drop_list = []
    test_drop_list = []

    train_base = train.drop(drop_list, axis=1)
    test_base = test.drop(test_drop_list, axis=1)
    print(train_base.shape, test_base.shape)

    print('tSVD', datetime.now() - start)
    tsvd = TruncatedSVD(n_components=n_comp)
    tsvd_train = tsvd.fit_transform(train_base)
    tsvd_test = tsvd.transform(test_base)

    print('PCA', datetime.now() - start)
    pca = PCA(n_components=n_comp)
    pca_train = pca.fit_transform(train_base)
    pca_test = pca.transform(test_base)

    print('ICA', datetime.now() - start)
    ica = FastICA(n_components=n_comp, max_iter=10000)
    ica_train = ica.fit_transform(train_base)
    ica_test = ica.transform(test_base)

    print('GRP', datetime.now() - start)
    grp = GaussianRandomProjection(n_components=n_comp, eps=0.1)
    grp_train = grp.fit_transform(train_base)
    grp_test = grp.transform(test_base)

    print('SRP', datetime.now() - start)
    srp = SparseRandomProjection(n_components=n_comp, dense_output=True)
    srp_train = srp.fit_transform(train_base)
    srp_test = srp.transform(test_base)

    # save columns list before adding the decomposition components
    usable_columns = list(set(train.columns) - set(drop_list))

    blocks = [
        ('pca', pca_train, pca_test),
        ('ica', ica_train, ica_test),
        ('tsvd', tsvd_train, tsvd_test),
        ('grp', grp_train, grp_test),
        ('srp', srp_train, srp_test),
    ]
    for i in range(1, n_comp + 1):
        for tag, tr_mat, te_mat in blocks:
            train[tag + '_' + str(i)] = tr_mat[:, i - 1]
            test[tag + '_' + str(i)] = te_mat[:, i - 1]

    return train, test
def gen_features(train, val, test):
    """Append 15-component tSVD/PCA/ICA/GRP/SRP features to train/val/test.

    Each reducer is fitted on train; val and test are projected with the
    fitted reducer. Components are added as interleaved columns
    (pca_i, ica_i, tsvd_i, grp_i, srp_i per index i) to all three frames.
    """
    train = pd.DataFrame(train)
    val = pd.DataFrame(val)
    test = pd.DataFrame(test)

    n_comp = 15
    drop_list = []
    test_drop_list = []

    train_base = train.drop(drop_list, axis=1)
    val_base = val.drop(test_drop_list, axis=1)
    test_base = test.drop(test_drop_list, axis=1)
    print(train_base.shape, test_base.shape)

    print('tSVD', datetime.now() - start)
    tsvd = TruncatedSVD(n_components=n_comp)
    tsvd_train = tsvd.fit_transform(train_base)
    tsvd_val = tsvd.transform(val_base)
    tsvd_test = tsvd.transform(test_base)

    print('PCA', datetime.now() - start)
    pca = PCA(n_components=n_comp)
    pca_train = pca.fit_transform(train_base)
    pca_val = pca.transform(val_base)
    pca_test = pca.transform(test_base)

    print('ICA', datetime.now() - start)
    ica = FastICA(n_components=n_comp, max_iter=10000)
    ica_train = ica.fit_transform(train_base)
    ica_val = ica.transform(val_base)
    ica_test = ica.transform(test_base)

    print('GRP', datetime.now() - start)
    grp = GaussianRandomProjection(n_components=n_comp, eps=0.1)
    grp_train = grp.fit_transform(train_base)
    grp_val = grp.transform(val_base)
    grp_test = grp.transform(test_base)

    print('SRP', datetime.now() - start)
    srp = SparseRandomProjection(n_components=n_comp, dense_output=True)
    srp_train = srp.fit_transform(train_base)
    srp_val = srp.transform(val_base)
    srp_test = srp.transform(test_base)

    # save columns list before adding the decomposition components
    usable_columns = list(set(train.columns) - set(drop_list))

    blocks = [
        ('pca', pca_train, pca_val, pca_test),
        ('ica', ica_train, ica_val, ica_test),
        ('tsvd', tsvd_train, tsvd_val, tsvd_test),
        ('grp', grp_train, grp_val, grp_test),
        ('srp', srp_train, srp_val, srp_test),
    ]
    for i in range(1, n_comp + 1):
        for tag, tr_mat, va_mat, te_mat in blocks:
            train[tag + '_' + str(i)] = tr_mat[:, i - 1]
            val[tag + '_' + str(i)] = va_mat[:, i - 1]
            test[tag + '_' + str(i)] = te_mat[:, i - 1]

    return train, val, test
def transform(self, train, test):
    """Label-encode categorical columns, drop outliers, and append decomposition
    components (tSVD/PCA/ICA/GRP/SRP) plus an optional 'magic' target-mean feature.

    Args:
        train: pandas DataFrame containing at least a 'y' target column
            (and 'ID'/'X0' when the magic feature is enabled).
        test: pandas DataFrame with the same feature columns as train.

    Returns:
        (train, test) with the generated component columns appended; train is
        shuffled row-wise before being returned.

    NOTE(review): encoders are fitted on train+test jointly, which leaks test
    label values into the encoding — acceptable for competitions, not for
    production pipelines.
    """
    print('Converting categorical data')
    # Convert categorical data: fit one LabelEncoder per object column on the
    # union of train and test values so both frames share the same mapping.
    for c in train.columns:
        if train[c].dtype == 'object':
            lbl = LabelEncoder()
            lbl.fit(list(train[c].values) + list(test[c].values))
            train[c] = lbl.transform(list(train[c].values))
            test[c] = lbl.transform(list(test[c].values))
    # Remove the outlier — rows with extreme target values are dropped from
    # train only (250 is a dataset-specific threshold; presumably Mercedes
    # Kaggle data — TODO confirm).
    print('Removing outlier')
    train = train[train['y'] < 250]
    # Decompositions are fitted on test's column set so the target 'y' (absent
    # from test) is automatically excluded.
    col = list(test.columns)
    if not self.keepID:
        col.remove('ID')
    # tSVD
    print('Generating tSVD components')
    tsvd = TruncatedSVD(n_components=self.N_COMP)
    tsvd_results_train = tsvd.fit_transform(train[col])
    tsvd_results_test = tsvd.transform(test[col])
    # PCA
    print('Generating PCA components')
    pca = PCA(n_components=self.N_COMP)
    pca_results_train = pca.fit_transform(train[col])
    pca_results_test = pca.transform(test[col])
    # ICA
    print('Generating ICA components')
    ica = FastICA(n_components=self.N_COMP)
    ica_results_train = ica.fit_transform(train[col])
    ica_results_test = ica.transform(test[col])
    # GRP
    print('Generating GRP components')
    grp = GaussianRandomProjection(n_components=self.N_COMP, eps=0.1)
    grp_results_train = grp.fit_transform(train[col])
    grp_results_test = grp.transform(test[col])
    # SRP
    print('Generating SRP components')
    srp = SparseRandomProjection(n_components=self.N_COMP, dense_output=True)
    srp_results_train = srp.fit_transform(train[col])
    srp_results_test = srp.transform(test[col])
    # Append each method's i-th component as a new named column (1-based names,
    # 0-based array columns).
    print('Appending generated components')
    for i in range(1, self.N_COMP + 1):
        train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
        test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]
        train['pca_' + str(i)] = pca_results_train[:, i - 1]
        test['pca_' + str(i)] = pca_results_test[:, i - 1]
        train['ica_' + str(i)] = ica_results_train[:, i - 1]
        test['ica_' + str(i)] = ica_results_test[:, i - 1]
        train['grp_' + str(i)] = grp_results_train[:, i - 1]
        test['grp_' + str(i)] = grp_results_test[:, i - 1]
        train['srp_' + str(i)] = srp_results_train[:, i - 1]
        test['srp_' + str(i)] = srp_results_test[:, i - 1]
    # NOTE(review): this message prints even when magicFeature is disabled.
    print('Appending magic features')
    if self.magicFeature:
        # 'magic' = mean of target y per X0 category, merged onto both frames;
        # unseen X0 values in test fall back to the global mean of the means.
        magic_mat = train[['ID', 'X0', 'y']]
        magic_mat = magic_mat.groupby(['X0'])['y'].mean()
        magic_mat = pd.DataFrame({
            'X0': magic_mat.index,
            'magic': list(magic_mat)
        })
        mean_magic = magic_mat['magic'].mean()
        train = train.merge(magic_mat, on='X0', how='left')
        test = test.merge(magic_mat, on='X0', how='left')
        test['magic'] = test['magic'].fillna(mean_magic)
    # Shuffle the data (frac=1 returns all rows in random order)
    print('Shuffling data')
    train = train.sample(frac=1)
    return train, test
def perform_feature_engineering(train, test, config):
    """Prune near-constant binary columns, then append decomposition components
    and an optional 'magic' target-mean feature, all driven by `config` flags.

    Args:
        train: pandas DataFrame including the target column 'y' (and 'ID'/'X0'
            when config['magic'] is set).
        test: pandas DataFrame with matching feature columns.
        config: dict with keys 'SparseThreshold', 'ID', 'n_comp', and boolean
            switches 'tSVD', 'PCA', 'ICA', 'GRP', 'SRP', 'magic'.

    Returns:
        (train, test) with the selected component columns appended.
    """
    # Drop binary columns whose positive rate is below the sparsity threshold.
    # Iterating train.columns is safe here: the Index object captured by the
    # loop is immutable, so `del train[c]` does not disturb the iteration.
    for c in train.columns:
        if len(train[c].value_counts()) == 2:
            if train[c].mean() < config['SparseThreshold']:
                del train[c]
                del test[c]
    # Fit decompositions on test's columns so 'y' (train-only) is excluded.
    col = list(test.columns)
    if config['ID'] != True:
        col.remove('ID')
    # tSVD
    if config['tSVD'] == True:
        tsvd = TruncatedSVD(n_components=config['n_comp'])
        tsvd_results_train = tsvd.fit_transform(train[col])
        tsvd_results_test = tsvd.transform(test[col])
        for i in range(1, config['n_comp'] + 1):
            train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
            test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]
    # PCA
    if config['PCA'] == True:
        pca = PCA(n_components=config['n_comp'])
        pca2_results_train = pca.fit_transform(train[col])
        pca2_results_test = pca.transform(test[col])
        for i in range(1, config['n_comp'] + 1):
            train['pca_' + str(i)] = pca2_results_train[:, i - 1]
            test['pca_' + str(i)] = pca2_results_test[:, i - 1]
    # ICA
    if config['ICA'] == True:
        ica = FastICA(n_components=config['n_comp'])
        ica2_results_train = ica.fit_transform(train[col])
        ica2_results_test = ica.transform(test[col])
        for i in range(1, config['n_comp'] + 1):
            train['ica_' + str(i)] = ica2_results_train[:, i - 1]
            test['ica_' + str(i)] = ica2_results_test[:, i - 1]
    # GRP
    if config['GRP'] == True:
        grp = GaussianRandomProjection(n_components=config['n_comp'], eps=0.1)
        grp_results_train = grp.fit_transform(train[col])
        grp_results_test = grp.transform(test[col])
        for i in range(1, config['n_comp'] + 1):
            train['grp_' + str(i)] = grp_results_train[:, i - 1]
            test['grp_' + str(i)] = grp_results_test[:, i - 1]
    # SRP — the only method here with a fixed random_state; NOTE(review): the
    # other decompositions are unseeded and therefore not reproducible run-to-run.
    if config['SRP'] == True:
        srp = SparseRandomProjection(n_components=config['n_comp'],
                                     dense_output=True,
                                     random_state=420)
        srp_results_train = srp.fit_transform(train[col])
        srp_results_test = srp.transform(test[col])
        for i in range(1, config['n_comp'] + 1):
            train['srp_' + str(i)] = srp_results_train[:, i - 1]
            test['srp_' + str(i)] = srp_results_test[:, i - 1]
    if config['magic'] == True:
        # 'magic' = mean target per X0 category; test rows with unseen X0 get
        # the global mean of the per-category means.
        magic_mat = train[['ID', 'X0', 'y']]
        magic_mat = magic_mat.groupby(['X0'])['y'].mean()
        magic_mat = pd.DataFrame({
            'X0': magic_mat.index,
            'magic': list(magic_mat)
        })
        mean_magic = magic_mat['magic'].mean()
        train = train.merge(magic_mat, on='X0', how='left')
        test = test.merge(magic_mat, on='X0', how='left')
        test['magic'] = test['magic'].fillna(mean_magic)
    return train, test
print(target_names) print(dataset.images.shape) print(dataset.data.shape) print(dataset.target.shape) print(H * W) from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1) from sklearn.random_projection import SparseRandomProjection n_components = 80 decomposer = SparseRandomProjection(n_components=n_components).fit(X_train) X_train_d = decomposer.transform(X_train) X_test_d = decomposer.transform(X_test) from sklearn.neural_network import MLPClassifier model = MLPClassifier(hidden_layer_sizes=(1024, ), batch_size=256, verbose=True, early_stopping=True) model.fit(X_train_d, y_train) y_pred = model.predict(X_test_d) from sklearn.metrics import classification_report print(classification_report(y_test, y_pred, target_names=target_names)) idx = np.random.randint(0, len(y_pred))
# In[44]: ids = test.reset_index()['ID'] # In[45]: from sklearn.decomposition import FactorAnalysis from sklearn.random_projection import GaussianRandomProjection from sklearn.random_projection import SparseRandomProjection X_fa = fa.transform(test) X_srp = srp.transform(test) X_grp = grp.transform(test) X_added = pd.concat([ pd.DataFrame(X_fa), pd.DataFrame(X_srp), pd.DataFrame(X_grp), ], axis=1) y_pred = gbm.predict(X_added) y_pred # In[46]:
def select_features_SparseRandomProjections(train_X, train_y, test_X, k): selector = SparseRandomProjection(n_components=k, random_state=42) selector.fit(train_X) train_X = selector.transform(train_X) test_X = selector.transform(test_X) return train_X, test_X
pca2_results_test = pca.transform(test) # ICA ica = FastICA(n_components=n_comp, random_state=420) ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1)) ica2_results_test = ica.transform(test) # GRP grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420) grp_results_train = grp.fit_transform(train.drop(["y"], axis=1)) grp_results_test = grp.transform(test) # SRP srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420) srp_results_train = srp.fit_transform(train.drop(["y"], axis=1)) srp_results_test = srp.transform(test) # Append decomposition components to datasets for i in range(1, n_comp+1): train['pca_' + str(i)] = pca2_results_train[:,i-1] test['pca_' + str(i)] = pca2_results_test[:, i-1] train['ica_' + str(i)] = ica2_results_train[:,i-1] test['ica_' + str(i)] = ica2_results_test[:, i-1] train['tsvd_' + str(i)] = tsvd_results_train[:,i-1] test['tsvd_' + str(i)] = tsvd_results_test[:, i-1] train['grp_' + str(i)] = grp_results_train[:,i-1] test['grp_' + str(i)] = grp_results_test[:, i-1]
def projection(self): """Performs sparse random projection to reduce dimensionality of data""" transformer = SparseRandomProjection() train_new = transformer.fit_transform(self.train) test_new = transformer.transform(self.test) return train_new, test_new
def demo(): colors = [ 'r', 'g', 'b', 'o', 'y', 'lightgreen', 'cyan', 'pink', 'violet', 'brown' ] digits = datasets.load_digits() n, original_dimension = digits.data.shape accuracies = [] components = np.int32(np.linspace(2, 64, 20)) print() print("=" * 40) print("The number of observation:", n) print("Dimensional of original data:", original_dimension) print("Dimensional of new data:", components) print("=" * 40) # SVM split = train_test_split(digits.data, digits.target, test_size=0.3, random_state=42) (trainData, testData, trainTarget, testTarget) = split model = LinearSVC() model.fit(trainData, trainTarget) baseline = metrics.accuracy_score(model.predict(testData), testTarget) print("Baseline accuracy:", baseline) # johnson_lindenstrauss_min_dim(N,eps=0.1) print("Random projection accuracies") ct = 0 ct_color = 0 # loop over the projection sizes for comp in components: # create the random projection sp = SparseRandomProjection(n_components=comp) X = sp.fit_transform(trainData) # train a classifier on the sparse random projection model = LinearSVC() model.fit(X, trainTarget) # evaluate the model and update the list of accuracies test = sp.transform(testData) if ct % 4 == 0: c = colors[ct_color] plt.scatter(range(1, comp + 1), test[0], marker='o', cmap=c) # plt.scatter(range(1,comp+1),testTarget[:comp],marker='1',cmap=c) ct_color += 1 ct += 1 acc = metrics.accuracy_score(model.predict(test), testTarget) accuracies.append(acc) print(comp, ":", acc) # create the figure plt.figure() plt.suptitle("Accuracy of Sparse Projection on Digits") plt.xlabel("# of Components") plt.ylabel("Accuracy") plt.xlim([2, 64]) plt.ylim([0, 1.0]) # plot the baseline and random projection accuracies plt.plot(components, [baseline] * len(accuracies), color="r") plt.plot(components, accuracies) plt.show()
n_components = 'auto' density = 'auto' eps = 0.5 dense_output = False random_state = 2018 SRP = SparseRandomProjection(n_components=n_components, density=density, eps=eps, dense_output=dense_output, random_state=random_state) X_train_SRP = SRP.fit_transform(X_train) X_train_SRP = pd.DataFrame(data=X_train_SRP, index=train_index) X_validation_SRP = SRP.transform(X_validation) X_validation_SRP = pd.DataFrame(data=X_validation_SRP, index=validation_index) scatterPlot(X_train_SRP, y_train, "Sparse Random Projection") # In[ ]: # Isomap from sklearn.manifold import Isomap n_neighbors = 5 n_components = 10 n_jobs = 4 isomap = Isomap(n_neighbors=n_neighbors,
pca2_results_test = pca.transform(test) # ICA ica = FastICA(n_components=n_ica, random_state=42,max_iter=1000, tol=.008) ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1)) ica2_results_test = ica.transform(test) # GRP grp = GaussianRandomProjection(n_components=n_grp, eps=0.1, random_state=42) grp_results_train = grp.fit_transform(train.drop(["y"], axis=1)) grp_results_test = grp.transform(test) # SRP srp = SparseRandomProjection(n_components=n_srp, dense_output=True, random_state=42) srp_results_train = srp.fit_transform(train.drop(["y"], axis=1)) srp_results_test = srp.transform(test) # Append decomposition components to datasets print("Append PCA components to datasets...") for i in range(1, n_pca + 1): train['pca_' + str(i)] = pca2_results_train[:, i - 1] test['pca_' + str(i)] = pca2_results_test[:, i - 1] print("Append ICA components to datasets...") for i in range(1, n_ica + 1): train['ica_' + str(i)] = ica2_results_train[:, i - 1] test['ica_' + str(i)] = ica2_results_test[:, i - 1] #print("Append NMF components to datasets...") #for i in range(1, n_nmf + 1): # train['nmf_' + str(i)] = nmf2_results_train[:, i - 1]
def DecomposedFeatures(train,
                       test,
                       total,
                       addtrain,
                       addtest,
                       use_pca=0.0,
                       use_tsvd=0.0,
                       use_ica=0.0,
                       use_fa=0.0,
                       use_grp=0.0,
                       use_srp=0.0,
                       use_pls=0.0):
    """Build feature matrices from selected decompositions of the input data.

    Each enabled method is fitted on `total` (typically train+test stacked),
    then applied to `train` and `test`; its components are concatenated onto
    the always-included base features `addtrain`/`addtest`.

    Args:
        train, test: feature matrices to transform with each fitted decomposer.
        total: data the decomposers are fitted on.
        addtrain, addtest: base feature arrays always included in the output.
        use_pca..use_srp: fraction of train's column count to keep for that
            method (the component count is int(frac * n_cols) + 1); 0.0
            disables the method.
        use_pls: accepted for interface compatibility; PLS is not implemented.

    Returns:
        (train_df, test_df): pandas DataFrames of the concatenated features
        with NaNs replaced by 0.
    """
    print("\nStart decomposition process...")
    train_decomposed = [addtrain]
    test_decomposed = [addtest]
    if use_pca > 0.0:
        print("PCA")
        N_COMP = int(use_pca * train.shape[1]) + 1
        pca = PCA(n_components=N_COMP,
                  whiten=True,
                  svd_solver="full",
                  random_state=42)
        pca.fit(total)
        pca_results_train = pca.transform(train)
        pca_results_test = pca.transform(test)
        # BUG FIX: list.append returns None; the original assigned that None
        # back to train_decomposed/test_decomposed (here and in every branch
        # below), which crashed on the next append.
        train_decomposed.append(pca_results_train)
        test_decomposed.append(pca_results_test)
    if use_tsvd > 0.0:
        print("tSVD")
        N_COMP = int(use_tsvd * train.shape[1]) + 1
        tsvd = TruncatedSVD(n_components=N_COMP, random_state=42)
        tsvd.fit(total)
        tsvd_results_train = tsvd.transform(train)
        tsvd_results_test = tsvd.transform(test)
        train_decomposed.append(tsvd_results_train)
        test_decomposed.append(tsvd_results_test)
    if use_ica > 0.0:
        print("ICA")
        N_COMP = int(use_ica * train.shape[1]) + 1
        ica = FastICA(n_components=N_COMP, random_state=42)
        ica.fit(total)
        ica_results_train = ica.transform(train)
        ica_results_test = ica.transform(test)
        # BUG FIX: the original appended train_decomposed to itself instead of
        # appending the ICA train components.
        train_decomposed.append(ica_results_train)
        test_decomposed.append(ica_results_test)
    if use_fa > 0.0:
        print("FA")
        N_COMP = int(use_fa * train.shape[1]) + 1
        fa = FactorAnalysis(n_components=N_COMP, random_state=42)
        fa.fit(total)
        fa_results_train = fa.transform(train)
        fa_results_test = fa.transform(test)
        train_decomposed.append(fa_results_train)
        test_decomposed.append(fa_results_test)
    if use_grp > 0.0:
        print("GRP")
        N_COMP = int(use_grp * train.shape[1]) + 1
        grp = GaussianRandomProjection(n_components=N_COMP,
                                       eps=0.1,
                                       random_state=42)
        grp.fit(total)
        grp_results_train = grp.transform(train)
        grp_results_test = grp.transform(test)
        train_decomposed.append(grp_results_train)
        test_decomposed.append(grp_results_test)
    if use_srp > 0.0:
        print("SRP")
        N_COMP = int(use_srp * train.shape[1]) + 1
        srp = SparseRandomProjection(n_components=N_COMP,
                                     dense_output=True,
                                     random_state=42)
        srp.fit(total)
        srp_results_train = srp.transform(train)
        srp_results_test = srp.transform(test)
        train_decomposed.append(srp_results_train)
        test_decomposed.append(srp_results_test)
    if use_pls > 0.0:
        # PLS is intentionally not implemented; the flag is kept so callers
        # passing use_pls keep working.
        print("PLS")
    print("Append decomposition components together...")
    # BUG FIX: a stray, discarded np.concatenate over srp/grp/ica/pca/tsvd
    # result arrays was removed — it raised NameError whenever any of those
    # methods was disabled, and its value was never used.
    train_decomposed = np.concatenate(train_decomposed, axis=1)
    test_decomposed = np.concatenate(test_decomposed, axis=1)
    train_with_only_decomposed_features = pd.DataFrame(train_decomposed)
    test_with_only_decomposed_features = pd.DataFrame(test_decomposed)
    # Remove any NA
    train_with_only_decomposed_features = train_with_only_decomposed_features.fillna(
        0)
    test_with_only_decomposed_features = test_with_only_decomposed_features.fillna(
        0)
    return train_with_only_decomposed_features, test_with_only_decomposed_features