def test_fastica_simple(add_noise, seed):
    # Test the FastICA algorithm on very simple data.
    rng = np.random.RandomState(seed)
    # scipy.stats uses the global RNG:
    n_samples = 1000
    # Generate two sources:
    s1 = (2 * np.sin(np.linspace(0, 100, n_samples)) > 0) - 1
    s2 = stats.t.rvs(1, size=n_samples)
    s = np.c_[s1, s2].T
    center_and_norm(s)
    s1, s2 = s

    # Mixing angle
    phi = 0.6
    mixing = np.array([[np.cos(phi), np.sin(phi)],
                       [np.sin(phi), -np.cos(phi)]])
    m = np.dot(mixing, s)

    if add_noise:
        m += 0.1 * rng.randn(2, 1000)

    center_and_norm(m)

    # function as fun arg
    def g_test(x):
        return x ** 3, (3 * x ** 2).mean(axis=-1)

    algos = ['parallel', 'deflation']
    nls = ['logcosh', 'exp', 'cube', g_test]
    whitening = [True, False]
    for algo, nl, whiten in itertools.product(algos, nls, whitening):
        if whiten:
            k_, mixing_, s_ = fastica(m.T, fun=nl, algorithm=algo,
                                      random_state=rng)
            with pytest.raises(ValueError):
                fastica(m.T, fun=np.tanh, algorithm=algo)
        else:
            pca = PCA(n_components=2, whiten=True, random_state=rng)
            X = pca.fit_transform(m.T)
            k_, mixing_, s_ = fastica(X, fun=nl, algorithm=algo,
                                      whiten=False, random_state=rng)
            with pytest.raises(ValueError):
                fastica(X, fun=np.tanh, algorithm=algo)
        s_ = s_.T
        # Check that the mixing model described in the docstring holds:
        if whiten:
            assert_almost_equal(s_, np.dot(np.dot(mixing_, k_), m))

        center_and_norm(s_)
        s1_, s2_ = s_
        # Check to see if the sources have been estimated
        # in the wrong order
        if abs(np.dot(s1_, s2)) > abs(np.dot(s1_, s1)):
            s2_, s1_ = s_
        s1_ *= np.sign(np.dot(s1_, s1))
        s2_ *= np.sign(np.dot(s2_, s2))

        # Check that we have estimated the original sources
        if not add_noise:
            assert_almost_equal(np.dot(s1_, s1) / n_samples, 1, decimal=2)
            assert_almost_equal(np.dot(s2_, s2) / n_samples, 1, decimal=2)
        else:
            assert_almost_equal(np.dot(s1_, s1) / n_samples, 1, decimal=1)
            assert_almost_equal(np.dot(s2_, s2) / n_samples, 1, decimal=1)

    # Test FastICA class
    _, _, sources_fun = fastica(m.T, fun=nl, algorithm=algo, random_state=seed)
    ica = FastICA(fun=nl, algorithm=algo, random_state=seed)
    sources = ica.fit_transform(m.T)
    assert ica.components_.shape == (2, 2)
    assert sources.shape == (1000, 2)
    assert_array_almost_equal(sources_fun, sources)
    assert_array_almost_equal(sources, ica.transform(m.T))

    assert ica.mixing_.shape == (2, 2)

    for fn in [np.tanh, "exp(-.5(x^2))"]:
        ica = FastICA(fun=fn, algorithm=algo)
        with pytest.raises(ValueError):
            ica.fit(m.T)

    with pytest.raises(TypeError):
        FastICA(fun=range(10)).fit(m.T)
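The whitened-case identity asserted above (s_ equals mixing_ @ k_ @ m) is definitional rather than a convergence property. A minimal standalone sketch of the same check, assuming only numpy and scikit-learn's public fastica function:

import numpy as np
from sklearn.decomposition import fastica

rng = np.random.RandomState(0)
# two non-Gaussian (Laplace) sources, linearly mixed
X = rng.laplace(size=(1000, 2)).dot(np.array([[1.0, 0.5], [0.5, 1.0]]))
K, W, S = fastica(X, random_state=0)  # K: whitening, W: unmixing, S: sources
# with whitening on, the sources satisfy S.T == W @ K @ (X - mean).T
assert np.allclose(S.T, W.dot(K).dot((X - X.mean(axis=0)).T))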
############################################################
# Add decomposition features
############################################################
n_comp = 12

# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
tsvd_results = tsvd.fit_transform(train_test_p)

# PCA
pca = PCA(n_components=n_comp, random_state=420)
pca_results = pca.fit_transform(train_test_p)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica_results = ica.fit_transform(train_test_p)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results = grp.fit_transform(train_test_p)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results = srp.fit_transform(train_test_p)

# save the column list before adding the decomposition components
usable_columns = train_test_p.columns

print(train_test_p.shape)
print(tsvd_results.shape, type(tsvd_results))
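A hedged sketch of the step the last comment anticipates: appending each decomposition's components as new columns. It assumes train_test_p is a pandas DataFrame whose rows align with the *_results arrays produced above.

for i in range(1, n_comp + 1):
    train_test_p['tsvd_' + str(i)] = tsvd_results[:, i - 1]
    train_test_p['pca_' + str(i)] = pca_results[:, i - 1]
    train_test_p['ica_' + str(i)] = ica_results[:, i - 1]
    train_test_p['grp_' + str(i)] = grp_results[:, i - 1]
    train_test_p['srp_' + str(i)] = srp_results[:, i - 1]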
clusters = [2, 5, 10, 15, 20, 25, 30, 35]
dim = [2, 4, 6, 8, 9]
km = KM(random_state=42)
gmm = GMM(random_state=42)

Score = defaultdict(list)
adjMI = defaultdict(list)
S_homog = defaultdict(list)
S_adjMI = defaultdict(list)
S_vm = defaultdict(list)

for i in dim:
    reduced_X = FastICA(n_components=i, random_state=42).fit_transform(X_scaled)
    k = 10
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)
    km.fit(reduced_X)
    gmm.fit(reduced_X)
    Score['km'].append(km.score(reduced_X))
    Score['gmm'].append(gmm.score(reduced_X))
    S_homog['km'].append(
        metrics.homogeneity_score(labels, km.predict(reduced_X)))
    S_homog['gmm'].append(
        metrics.homogeneity_score(labels, gmm.predict(reduced_X)))
    S_adjMI['km'].append(
        metrics.adjusted_mutual_info_score(labels, km.predict(reduced_X)))
    S_adjMI['gmm'].append(
        metrics.adjusted_mutual_info_score(labels, gmm.predict(reduced_X)))
ADA_ICA_Valid = []
ADA_ICA_Valid_STD = []
ADA_ICA_Test = []
NN_PCA_Valid = []
NN_PCA_Valid_STD = []
NN_PCA_Test = []
NN_ICA_Valid = []
NN_ICA_Valid_STD = []
NN_ICA_Test = []

for comp in Comp_space:
    pca = PCA(n_components=comp, whiten=False)
    ica = FastICA(n_components=comp, whiten=True, max_iter=1000000)

    print("PCA Fit...")
    pca.fit(Data.features_train[:, 5:])
    print("ICA Fit...")
    ica.fit(Data.features_train[:, 5:])

    X_train_pca = pca.transform(Data.features_train[:, 5:])
    X_test_pca = pca.transform(Data.features_test[:, 5:])
    train_pca = np.hstack((X_train_pca, Data.features_train[:, 0:5]))
    test_pca = np.hstack((X_test_pca, Data.features_test[:, 0:5]))

    X_train_ica = ica.transform(Data.features_train[:, 5:])
    X_test_ica = ica.transform(Data.features_test[:, 5:])
    warnings.warn('nilearn must be installed to run CanICA')

canica_dmn = nibabel.load(join(path, 'canica.nii.gz')).get_data()[..., 4]

### Melodic ICA ############################################################
# To have MELODIC results, please use my melodic branch of nilearn
melodic_dmn = nibabel.load(join(path, 'melodic.nii.gz')).get_data()[..., 3]

### FastICA ##################################################################
# Concatenate all the subjects
if not exists(join(path, 'ica.nii.gz')):
    from sklearn.decomposition import FastICA
    X = np.vstack(X)
    ica = FastICA(n_components=n_components, random_state=2)
    t0 = time.time()
    ica.fit(X)
    print('FastICA: %f' % (time.time() - t0))
    ica_components = masking.unmask(ica.components_, mask_img)
    nibabel.save(nibabel.Nifti1Image(ica_components, mask_img.get_affine()),
                 join(path, 'ica.nii.gz'))

ica_dmn = -nibabel.load(join(path, 'ica.nii.gz')).get_data()[..., 1]

### Plots ####################################################################
# Flip the sign to harmonize maps
ica_dmn = -ica_dmn
canica_dmn = -canica_dmn
def dimensionality_ICA(instruction, dataset, target="", y=""):
    global counter

    dataReader = DataReader(dataset)
    if target == "":
        data = dataReader.data_generator()
        data.fillna(0, inplace=True)
        remove = get_similar_column(get_value_instruction(instruction), data)
        data, y, target, full_pipeline = initial_preprocesser(
            data, instruction, True, 0.2, [], 0.2, random_state=49)
        X_train = data['train']
        X_test = data['test']
        y_train = y['train']
        y_test = y['test']

    # the estimator here is FastICA, so name it accordingly
    ica = FastICA(n_components=len(X_train.columns))
    X_train_mod = ica.fit_transform(X_train)
    # transform (not fit_transform) the test set with the mapping learned on train
    X_test_mod = ica.transform(X_test)

    clf = tree.DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    clf_mod = tree.DecisionTreeClassifier()
    clf_mod.fit(X_train_mod, y_train)

    acc = []
    sets = []
    acc.append(accuracy_score(clf_mod.predict(X_test_mod), y_test))
    frame = pd.concat([pd.DataFrame(X_train_mod), pd.DataFrame(X_test_mod)])
    frame[target] = np.r_[y_train, y_test]
    sets.append(frame)

    for i in range(2, len(X_train.columns)):
        ica = FastICA(n_components=i)
        X_train_mod = ica.fit_transform(X_train)
        X_test_mod = ica.transform(X_test)

        frame = pd.concat([pd.DataFrame(X_train_mod), pd.DataFrame(X_test_mod)])
        frame[target] = np.r_[y_train, y_test]
        sets.append(frame)

        clf_mod = tree.DecisionTreeClassifier()
        clf_mod.fit(X_train_mod, y_train)
        acc.append(accuracy_score(clf_mod.predict(X_test_mod), y_test))
    del i

    data_modified = sets[acc.index(max(acc))]
    score = max(acc)

    return data_modified, score, ((len(X_train.columns) + 1) - len(data_modified.columns))
from keras.models import Sequential
from keras.layers import Dense
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import FastICA

ica = FastICA(n_components=5)

base = pd.read_csv('cardio_train.csv')
caracteristicas = ['age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
                   'cholesterol', 'gluc', 'smoke', 'alco', 'active']

# convert age from days to years
age = []
for i in base.age.values:
    i = i // 365
    age.append(i)
base['age'] = age

x = base[caracteristicas].values
y = base.cardio.values

x_treino, x_teste, y_treino, y_teste = train_test_split(x, y, test_size=0.3, random_state=1)

# fit ICA on the training set only, then apply the same mapping to the test set
x_treino = ica.fit_transform(x_treino)
x_teste = ica.transform(x_teste)

model = Sequential()
model.add(Dense(5, input_dim=5, kernel_initializer='normal', activation='relu'))
model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))

# Compile the model. We use the logarithmic loss function and the Adam gradient optimizer.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_treino, y_treino, epochs=100, batch_size=32, verbose=1)

loss_and_metrics = model.evaluate(x_teste, y_teste, batch_size=128)
def fast_ica_transform(fitting_inputs_scaled):
    ica = FastICA()
    ica.fit(fitting_inputs_scaled)
    return ica.transform(fitting_inputs_scaled)
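The fit-then-transform pair above is equivalent to a single fit_transform call. A self-contained usage sketch on random data (the shapes are just for illustration):

import numpy as np

X = np.random.RandomState(0).randn(200, 6)
components = fast_ica_transform(X)
print(components.shape)  # (200, 6): FastICA() keeps all features by default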
train_inv_pca = np.zeros((n_images, INPUT_SIZE*INPUT_SIZE, 3))  # variable inverting pca features for visualization
x_train_ica = np.zeros((n_images, NCOMPONENTS_ICA, 3))  # variable containing ica features
train_inv_ica = np.zeros((n_images, INPUT_SIZE*INPUT_SIZE, 3))  # variable inverting ica features for visualization
x_train_canny = np.zeros((n_images, INPUT_SIZE, INPUT_SIZE, 3))  # variable containing edges detected by canny edge detector
x_train_cornerharris = np.zeros((n_images, INPUT_SIZE, INPUT_SIZE, 3))

for i in range(CHANNELS):
    if USE_PCA:
        pca = PCA(n_components=NCOMPONENTS_PCA)
        x_tr = np.reshape(x_train[:, :, :, i], (n_images, INPUT_SIZE*INPUT_SIZE))
        x_train_pca[:, :, i] = pca.fit_transform(x_tr)
        print(pca.explained_variance_ratio_)
        inv_pca = pca.inverse_transform(x_train_pca[:, :, i])
        train_inv_pca[:, :, i] = inv_pca
    if USE_ICA:
        ica = FastICA(n_components=NCOMPONENTS_ICA)
        x_tr = np.reshape(x_train[:, :, :, i], (n_images, INPUT_SIZE * INPUT_SIZE))
        x_train_ica[:, :, i] = ica.fit_transform(x_tr)
        # (FastICA has no explained_variance_ratio_ attribute, so there is
        # no analogue of the PCA diagnostic print here)
        inv_ica = ica.inverse_transform(x_train_ica[:, :, i])
        train_inv_ica[:, :, i] = inv_ica
    if USE_GENSEL:
        # Genetic feature selection: very slow
        clsfr = linear_model.LogisticRegression()
        gs = GeneticSelectionCV(clsfr)
        print(np.shape(x_tr))
        print(np.shape(y_train))
        print(np.array(np.argmax(y_train, axis=1)))
        gs_features = gs.fit(x_tr, np.array(np.argmax(y_train, axis=1)))
def main():
    async_dict = {
        "dbl": get_binary_target_basic_dbl_trigger_data,
        "dbl_deriv_valid": get_binary_target_deriv_valid_dbl_trigger_data,
        "dbl_deriv_valid_retro": get_binary_target_retrospective_dbl_trigger_data,
        "fa": get_binary_target_fa_patient_data,
        "bs": get_binary_target_bs_patient_data,
        "bs_deriv_valid": get_binary_target_deriv_valid_bs_patient_data,
        "co": get_binary_target_cough_data,
        "su": get_binary_target_suction_data,
        "multi": get_multi_target_derivation_cohort_data,
        "multi_retro": get_multi_target_retrospective_data,
        "multi_retro_first_dbl_bs": get_multi_target_retrospective_data_first_dbl_as_bs,
        "multi_retro_fdb_deriv_val": get_multi_retrospective_data_fdb_deriv_val_cohort,
        "bs_dbl": get_bs_dbl_non_retrospective_data,
        "bs_dbl_retro": get_bs_dbl_retrospective_data,
        "bs_dbl_retro_fdb_deriv_val": get_bs_dbl_retrospective_first_dbl_as_bs_deriv_valid,
        "bs_dbl_retro_fdn_deriv_val": get_bs_dbl_retrospective_first_dbl_as_norm_deriv_valid,
        "bs_dbl_retro_fdb_deriv_cohort": get_bs_dbl_retrospective_data_first_dbl_as_bs,
        "bs_dbl_retro_fdb_val_cohort": get_bs_dbl_retrospective_data_first_dbl_as_bs_val_cohort,
        "multi_binary_retro": get_multi_class_binary_label_retrospective_data,
    }
    feature_dict = {
        "slope": get_slopes_of_pressure_curve_df,
        "integ": get_integrated_pressure_curve_df,
        "v1": get_v1,
        "v2": get_v2,
        "settings": get_vent_settings_features,
        "derived": get_derived_metadata_features,
        "metadata": get_settings_and_derived_features,
        "v2_and_metadata": get_v2_and_metadata,
        "fa_heuristic": get_fa_heuristic,
        "dbl_heuristic": get_dbl_trigger_heuristic_features,
        "dbl_all": get_dbl_trigger_heuristic_and_metadata_features,
        "dbl_chi2": get_dbl_chi2,
        "dbl_retro": get_dbl_trigger_retrospective_plus_metadata_features,
        "dbl_retro_chi2": get_dbl_retro_chi2,
        "dbl_no_retro_curated": get_dbl_curated,
        "bs_heuristic": get_bs_heuristic,
        "bs_all": get_bs_all,
        "bs_chi2": get_bs_chi2,
        "bs_curated": get_bs_curated,
        "co_heuristic": get_co_heuristic,
        "co_all": get_co_all,
        "co_curated": get_co_curated_no_tvi,
        "co_curated_with_tvi": get_co_curated_with_tvi,
        "su_heuristic": get_suction_heuristic,
        "su_all": get_suction_all,
        "su_curated": get_suction_curated,
        "retro_fused_plus_metadata": get_retro_plus_metadata,
        "greg_selection": get_all_greg_selected_features,
        "retro_non_noisy": get_retro_non_noisy,
        "retro_prev_plus_metadata": get_retro_prev_plus_metadata,
        "retro_prev_prev_plus_metadata": get_retro_prev_prev_plus_metadata,
        "retro_stripped_expert_plus_chi2": get_retro_stripped_expert_plus_chi2,
        "retro_stripped_expert_plus_chi2_2": get_retro_stripped_expert_plus_chi2_2,
        "retro_stripped_expert_plus_chi2_3": get_retro_stripped_expert_plus_chi2_3,
        # this is currently the highest performer
        "retro_stripped_expert_plus_chi2_4": get_retro_stripped_expert_plus_chi2_4,
        "retro_stripped_expert_plus_chi2_5": get_retro_stripped_expert_plus_chi2_5,
        "retro_stripped_expert_plus_chi2_6": get_retro_stripped_expert_plus_chi2_6,
        "retro_stripped_expert_plus_chi2_7": get_retro_stripped_expert_plus_chi2_7,
        "retro_stripped_expert_plus_instrr": get_retro_stripped_expert_plus_instrr,
        "retro_stripped_expert_plus_instrr_prev": get_retro_stripped_expert_plus_instrr,
        "retro_stripped_low_prec": get_retro_stripped_lower_prec,
        "retro_stripped_high_prec": get_retro_stripped_higher_prec,
    }
    parser = build_parser(async_dict, feature_dict)
    args = parser.parse_args()
    additional_error_handling(args)

    feature_func = feature_dict.get(args.feature_type)
    gold_stnd_func = async_dict.get(args.async_type)
    x, y, extra_info = get_x_y(feature_func, args.bins,
                               args.pickle_file, args.new_pickling_file,
                               gold_stnd_func, args.new_csv_file)
    generator = args.split_func(x, y, args)
    results = []
    for x_train, x_test, y_train, y_test in generator:
        if args.only_patient:
            if args.only_patient not in x_test.patient.unique():
                continue
            else:
                x_test = x_test[x_test.patient == args.only_patient]
        try:
            del x_train['patient']
        except KeyError:
            pass
        try:
            del x_test['patient']
        except KeyError:
            pass
        x_train = perform_space_replacement(x_train)
        if len(x_test) != 0:
            x_test = perform_space_replacement(x_test)
        if args.selected_features:
            x_train = x_train[args.selected_features]
            x_test = x_test[args.selected_features]
        # I guess I only wanted to winsorize two vars? Maybe I wanted to
        # reduce side effects
        winsorizor = Winsorizor(args.winsorize)
        x_train = winsorizor.fit_transform(
            x_train, ['tve:tvi-ratio', 'tve:tvi-ratio-prev'])
        x_test = winsorizor.transform(x_test)
        scaler = ScalerWrapper(args.scaler, None, x_train, x_test)
        classifier = Classifier(args, scaler)
        x_train = scaler.train_transform()
        x_test = scaler.test_transform()
        x_train, x_test = perform_pca(x_train, y_train, x_test, y_test, args.pca)
        if args.lda:
            lda = LinearDiscriminantAnalysis(solver='svd')
            cols = x_train.columns
            train_index, test_index = x_train.index, x_test.index
            lda.fit(x_train.values, y_train.values)
            x_train = DataFrame(lda.transform(x_train), index=train_index)
            x_test = DataFrame(lda.transform(x_test), index=test_index)
        if args.ica:
            # note: random_state=True is treated as the integer seed 1
            fast_ica = FastICA(n_components=args.ica, whiten=True,
                               random_state=True)
            train_index, test_index = x_train.index, x_test.index
            fast_ica.fit(x_train, y_train)
            x_train = DataFrame(fast_ica.transform(x_train), index=train_index)
            x_test = DataFrame(fast_ica.transform(x_test), index=test_index)
        if args.tsne:
            tsne = TSNE(n_components=args.tsne)
            train_index, test_index = x_train.index, x_test.index
            tsne.fit(x_train, y_train)
            x_train = tsne.fit_transform(x_train)
            import IPython
            IPython.embed()
            # This was the best thing I could do with my given architecture.
        if (args.run_chi2 or args.chi2_pruning) and args.pca:
            raise ValueError(
                "It doesn't make sense to run chi2 when using PCA!")
        if args.run_chi2:
            run_and_print_chi2(x_train, y_train)
        if args.chi2_pruning:
            patients = map(lambda x: x[1], x_test.index.str.split("-"))
            unique = set(patients)
            patient = "-".join(unique)
            x_train, x_test = perform_chi2_feature_pruning(
                x_train, x_test, y_train, args.chi2_pruning,
                args.write_chi2_results, args.pickle_file, patient)
        if not args.with_smote and (args.rfecv or args.l1_selector or args.grid_search):
            patient = map(lambda x: x[1], x_train.index.str.split("-"))
            x_train['patient'] = patient
            x_train.sort_values(by=['patient'], inplace=True)
            x_train = x_train.drop("patient", axis=1)
        if args.rfecv:
            x_train, x_test = classifier.backwards_feature_elimination(
                x_train, y_train, x_test)
        if args.l1_selector:
            x_train, x_test = classifier.l1_selection(x_train, y_train, x_test)
        if len(y_train) != 0:
            get_fa_elements(y_train, "training")
            get_fa_elements(y_test, "testing")
        else:
            get_fa_elements(y_test, "all")
        if len(x_train) != 0 and args.grid_search:
            classifier.grid_search(x_train, y_train)
        elif len(x_train) != 0 and args.cross_validate:
            classifier.cross_validate(x_train, y_train)
        elif len(x_train) != 0:
            classifier.fit(x_train, y_train)
            if args.train_on_all:
                classifier.write_to_file(args.model_file)
                scaler.to_pickle()
                winsorizor.to_pickle()
                return
            else:
                run_results = classifier.analyze_and_print_results(
                    x_test, y_test, y_train, extra_info)
                results.append(run_results)
    if args.write_results:
        write_results(results, args)
def f_extract(X_train, X_test, y_train, y_test, method='26PCA', feature_limit=26):
    def str_split_num(s):
        tail = s.lstrip('0123456789')  # use rstrip if the number is the last part of the string
        head = s[0:-len(tail)]  # negative index to count from the last char
        return int(head), tail

    if method[0].isdigit():
        n_comps, method = str_split_num(method)
    print("Feature extraction using", method)
    if method == 'PCA':
        reducer = PCA(n_components=n_comps, whiten=True, random_state=rand).fit(X_train)
        X_train = reducer.transform(X_train)
        X_test = reducer.transform(X_test)
    if method == 'LDA':
        reducer = LinearDiscriminantAnalysis(n_components=n_comps).fit(X_train, y_train)
        X_train = reducer.transform(X_train)
        X_test = reducer.transform(X_test)
    if method == 'ICA':
        reducer = FastICA(n_components=n_comps, whiten=True, random_state=rand).fit(X_train, y_train)
        X_train = reducer.transform(X_train)
        X_test = reducer.transform(X_test)
    if method == 'LLE':  # too slow
        reducer = LocallyLinearEmbedding(n_components=n_comps, random_state=rand,
                                         n_jobs=threads).fit(X_train, y_train)
        X_train = reducer.transform(X_train)
        X_test = reducer.transform(X_test)
    if method == 'TSNE':
        # NOTE: scikit-learn's TSNE has no transform() for unseen data, so
        # embed train and test together in one fit_transform call and split
        n_train = len(X_train)
        reducer = TSNE(n_components=n_comps, learning_rate=1000, metric='euclidean',
                       n_iter=11, random_state=rand, n_jobs=threads)
        embedded = reducer.fit_transform(np.vstack((X_train, X_test)))
        X_train, X_test = embedded[:n_train], embedded[n_train:]
    if method == 'UMAP':  # too slow ...angular_rp_forest=True
        y_train, y_test = encode_labels(y_train, y_test)
        reducer = UMAP(n_components=n_comps, n_neighbors=15, metric='correlation',
                       random_state=rand, min_dist=0.0, angular_rp_forest=True,
                       n_epochs=15).fit(X_train, y_train)
        X_train = reducer.transform(X_train)
        X_test = reducer.transform(X_test)
    return X_train, X_test
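A small runnable illustration of the method-string convention str_split_num implements (the count prefix is peeled off the method name; _split_demo is just a standalone copy for illustration):

def _split_demo(s):
    tail = s.lstrip('0123456789')
    return int(s[:-len(tail)]), tail

assert _split_demo('26PCA') == (26, 'PCA')
assert _split_demo('8ICA') == (8, 'ICA')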
def run_ica(
    ica_input: np.ndarray,
    iters: int,
    seed: int,
    verbose: bool = False
) -> Union[Tuple[np.ndarray, np.ndarray, bool],
           Tuple[np.ndarray, np.ndarray, bool, bool]]:
    """
    ica_input -- an MxN numpy array; in the context of the decrosstalk
                 problem, N = the number of timesteps; M = 2 (first row
                 is signal; second is crosstalk)

    iters -- an int; the number of iterative loops to try to go through
             to get the off-diagonal elements of the mixing matrix < 0.3

    seed -- an int; the seed of the random number generator that will be
            fed to sklearn.decomposition.FastICA

    verbose -- if True, also returns a flag indicating whether or not the
               ICA output signals had to be swapped (just used for testing)

    Returns
    -------
    ica_output -- an MxN numpy array; ica_output[0, :] is the unmixed
                  signal, ica_output[1, :] is the unmixed crosstalk
                  (in the context of the decrosstalk problem)

    mixing -- the mixing matrix that gets from ica_input to ica_output;
              np.dot(mixing, ica_output) will restore ica_input

    roi_demixed -- a boolean indicating whether or not the iteration to
                   get the off-diagonal elements of the mixing matrix
                   < 0.3 actually worked
    """
    # Whiten observations
    #
    # NOTE: we whiten the data by hand and then call
    # FastICA() with whiten=False to avoid running
    # afoul of this bug in sklearn
    #
    # https://github.com/scikit-learn/scikit-learn/issues/17162
    #
    # after this issue is resolved in sklearn, we can
    # revisit the possibility of using sklearn.FastICA's
    # internal whitening
    Ow, W, m = whiten_data(ica_input.transpose())

    alpha = 1
    beta = 1
    it = 0
    roi_demixed = False
    rng = np.random.RandomState(seed)
    while not roi_demixed and it <= iters:
        if alpha > 0.3 or beta > 0.3 or alpha < 0 or beta < 0:
            # Unmixing
            ica = FastICA(whiten=False, max_iter=10000, random_state=rng)
            ica.fit(Ow)

            # Reconstruct sources
            mixing_raw = ica.mixing_

            # correcting for scale and offset:
            # applying inverse of whitening matrix
            M_hat = np.dot(np.linalg.inv(W), mixing_raw)

            # computing scaling matrix
            scale = np.dot(np.linalg.inv(M_hat), np.array([1, 1]))

            # applying scaling matrix
            mixing = M_hat * scale
        else:
            roi_demixed = True

        alpha = mixing[0, 1]
        beta = mixing[1, 0]
        it += 1

    # recovering outputs using the new mixing matrix
    Sos = np.dot(np.linalg.inv(mixing), ica_input)

    # fixing source assignment ambiguity
    (ica_output, swapped) = fix_source_assignment(ica_input, Sos)
    if swapped:
        new_mixing = np.zeros((2, 2), dtype=float)
        new_mixing[:, 1] = mixing[:, 0]
        new_mixing[:, 0] = mixing[:, 1]
        mixing = new_mixing

    if verbose:
        return ica_output, mixing, roi_demixed, swapped
    return ica_output, mixing, roi_demixed
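whiten_data is not shown in this excerpt. A hypothetical ZCA-whitening stand-in, consistent with how its three return values are used above (whitened observations, an invertible square whitening matrix W, and the channel means), might look like:

import numpy as np

def whiten_data_sketch(x):
    # x: (n_samples, n_channels). Returns (whitened, W, m) such that
    # whitened = (x - m) @ W.T, np.cov(whitened, rowvar=False) ~ identity,
    # and W is square/invertible, as run_ica's un-whitening step requires.
    m = x.mean(axis=0)
    xc = x - m
    evals, evecs = np.linalg.eigh(np.cov(xc, rowvar=False))
    W = evecs @ np.diag(1.0 / np.sqrt(evals)) @ evecs.T  # symmetric ZCA form
    return xc @ W.T, W, m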
    if RP:
        from sklearn.random_projection import GaussianRandomProjection
        model = GaussianRandomProjection(n_components=num_components)
    if FA:
        from sklearn.cluster import FeatureAgglomeration
        model = FeatureAgglomeration(n_clusters=num_components)
    if PCA:
        from sklearn.decomposition import PCA
        model = PCA(n_components=num_components)
    if ICA:
        from sklearn.decomposition import FastICA
        model = FastICA(n_components=num_components)
else:
    # IRIS
    from sklearn.datasets import load_iris
    iris = load_iris()
    data = scale(iris.data)
    # n_samples, n_features = data.shape
    # n_digits = len(np.unique(iris.target))
    labels = iris.target
    num_components = 3

    if RP:
        from sklearn.random_projection import GaussianRandomProjection
        model = GaussianRandomProjection(n_components=num_components)
# NMF
for i in range(10):
    nmf = NMF()
    nmf.fit(digit_mat_array[i] + 1)  # shift values so the input is non-negative
    plt.figure()
    for j in range(9):
        plt.subplot(3, 3, j + 1)  # subplot indices are 1-based
        # plt.gray()
        plt.imshow(nmf.components_[j].reshape(16, 16))
    plt.title("nmf_digit_%s_components" % i, y=-0.5)
    filename = "nmf_digit_%s_components.png" % i
    plt.savefig(filename)

# ICA
for i in range(10):
    ica = FastICA()
    ica.fit(digit_mat_array[i])
    plt.figure()
    for j in range(9):
        plt.subplot(3, 3, j + 1)
        # plt.gray()
        plt.imshow(ica.components_[j].reshape(16, 16))
    plt.title("ica_digit_%s_components" % i, y=-0.5)
    filename = "ica_digit_%s_components.png" % i
    plt.savefig(filename)

factor_array = [10, 20, 50, 250]
for factor in factor_array:
    nmf = NMF(n_components=factor)
    nmf.fit(digit_mat_array[3] + 1)
    plt.figure()
def reduction_ica(self):
    ica = FastICA(n_components=len(self.columns) - 2, random_state=0)
    x_reduced = ica.fit_transform(self.X_train)
    print("ICA: {}".format(x_reduced.shape))
    execfile(home + '/research_code/graicar/load_MEG_data.py')
elif len(sys.argv) > 1:
    subj = sys.argv[1]
    freq_band = None
    res_dir = home + '/data/results/graicar/fmri/'
    execfile(home + '/research_code/graicar/load_fMRI_data.py')
else:
    # subj = 'JOAOCEOG'
    # freq_band = '8-13'
    # res_dir = home + '/data/results/graicar/meg/'
    # execfile(home + '/research_code/graicar/load_MEG_data.py')
    subj = 'subj1'
    freq_band = None
    res_dir = home + '/data/results/graicar/fmri/'
    execfile(home + '/research_code/graicar/load_fMRI_data.py')

nreals = 60
ncomps = 30
rng = np.random.RandomState()
for i in range(nreals):
    print 'Realization %d of %d' % (i + 1, nreals)
    ica = FastICA(n_components=ncomps, random_state=rng)
    # the first dimension of the result is the number of ICs
    ICs = ica.fit_transform(data).T
    if freq_band is not None:
        fname = res_dir + subj + '_' + freq_band + '_R%02d.npz' % i
    else:
        fname = res_dir + subj + '_R%02d.npz' % i
    np.savez(fname, ICs=ICs)
# print('classifiers config:')
for k, reg in reg_scikit.items():
    print('{0}={1}'.format(k, reg.get_params()), flush=True)

# four known clusters
clust = MiniBatchKMeans(n_clusters=n_clust, max_iter=1000, init_size=n_clust * 10)

# decompositions
tfs = {}
tfs['svd'] = TruncatedSVD(n_components=nb_comp['svd'], random_state=seed_tf)
tfs['pca'] = PCA(n_components=nb_comp['pca'], random_state=seed_tf)
tfs['ica'] = FastICA(n_components=nb_comp['ica'], max_iter=250, random_state=seed_tf)
tfs['grp'] = GaussianRandomProjection(n_components=nb_comp['grp'], eps=0.1, random_state=seed_tf)
tfs['srp'] = SparseRandomProjection(n_components=nb_comp['srp'], dense_output=True, random_state=seed_tf)
tfs['nmf'] = NMF(n_components=nb_comp['nmf'], shuffle=True, init='random', random_state=seed_tf)

# embedding
trees, depth, leafs = 25, 8, 32  # 2 ** 8 = 256
embed = RandomTreesEmbedding(n_estimators=trees, max_depth=depth,
digitsY = digits['Class'].copy().values

abalone = pd.read_hdf('./BASE/datasets.hdf', 'abalone')
abaloneX = abalone.drop('Class', axis=1).copy().values
abaloneY = abalone['Class'].copy().values

abaloneX = StandardScaler().fit_transform(abaloneX)
digitsX = StandardScaler().fit_transform(digitsX)

clusters = [2, 5, 10, 15, 20, 25, 30, 35, 40]
dims = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
abalone_dims = range(1, 9)
# raise

#%% data for 1
ica = FastICA(random_state=5)
kurt = {}
for dim in abalone_dims:
    ica.set_params(n_components=dim)
    tmp = ica.fit_transform(abaloneX)
    tmp = pd.DataFrame(tmp)
    tmp = tmp.kurt(axis=0)
    kurt[dim] = tmp.abs().mean()
kurt = pd.Series(kurt)
kurt.to_csv(out + 'abalone scree.csv')

ica = FastICA(random_state=5)
kurt = {}
for dim in dims:
    ica.set_params(n_components=dim)
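The scree statistic above uses kurtosis as a non-Gaussianity measure: independent components worth keeping tend to have large absolute kurtosis, while Gaussian noise sits near zero (Fisher definition). A quick standalone check, assuming scipy is available:

import numpy as np
from scipy.stats import kurtosis

rng = np.random.RandomState(5)
print(kurtosis(rng.randn(100000)))          # ~0 for Gaussian data
print(kurtosis(rng.laplace(size=100000)))   # ~3 for a heavy-tailed Laplace source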
def tmi_run_ica(img_data_trunc, num_comp, masking_array, affine_array,
                variance_threshold=0.9, timeplot=False, timeplot_name=None,
                filetype='nii.gz', outname='ica.nii.gz'):
    ica = FastICA(n_components=int(num_comp), max_iter=1000, tol=0.00001)
    S_ = ica.fit_transform(img_data_trunc).T
    components = ica.components_.T

    # scaling
    fitcomps = np.copy(S_)
    fitcomps = zscaler(fitcomps)
    img_data_trunc = np.copy(fitcomps.T)  # RAM shouldn't be an issue here...
    np.savetxt("ICA_fit.csv", zscaler(components), fmt='%10.8f', delimiter=',')

    # variance explained
    explained_total_var = np.zeros((int(num_comp)))
    explained_var_ratio = np.zeros((int(num_comp)))
    # total variance
    back_projection = ica.inverse_transform(S_.T)
    total_var = back_projection.var()
    for i in range(int(num_comp)):
        tempcomps = np.copy(S_)
        tempcomps[i, :] = 0
        temp_back_proj = ica.inverse_transform(tempcomps.T)
        temp_var = temp_back_proj.var()
        explained_var_ratio[i] = total_var - temp_var
        explained_total_var[i] = (total_var - temp_var) / total_var
        print("ICA # %d; Percent of Total Variance %1.3f" % ((i + 1), explained_total_var[i] * 100))
    explained_var_ratio = explained_var_ratio / explained_var_ratio.sum()

    sum_total_variance_explained = explained_total_var.sum()
    print("Total variance explained by all components = %1.3f" % sum_total_variance_explained)

    print("Re-ordering components")
    sort_mask = (-1 * explained_total_var).argsort()

    if sum_total_variance_explained > variance_threshold:
        # sort the data by explained variance
        np.savetxt("ICA_total_var.csv", explained_total_var[sort_mask], fmt='%1.5f', delimiter=',')
        np.savetxt("ICA_explained_var_ratio.csv", explained_var_ratio[sort_mask], fmt='%1.5f', delimiter=',')
        img_data_trunc = img_data_trunc[:, sort_mask]
        if filetype == 'nii.gz':
            savenifti_v2(img_data_trunc, masking_array[0], outname, affine_array[0])
        else:
            pointer = 0
            position_array = [0]
            for i in range(len(masking_array)):
                pointer += len(masking_array[i][masking_array[i] == True])
                position_array.append(pointer)
            del pointer
            for i in range(len(masking_array)):
                start = position_array[i]
                end = position_array[i + 1]
                savemgh_v2(img_data_trunc[start:end], masking_array[i],
                           "%d_%s" % (i, outname), affine_array[i])

    # save outputs and ica functions for potential ica removal
    if os.path.exists('ICA_temp'):
        print('ICA_temp directory exists')
        exit()
    else:
        os.makedirs('ICA_temp')
    np.save('ICA_temp/signals.npy', S_)
    pickle.dump(ica, open("ICA_temp/icasave.p", "wb"))
    return ica, sort_mask, sum_total_variance_explained
def _fastica(self):
    f = FastICA(random_state=self.random_state)
    return f.fit(self.Z.T).components_
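For a fitted FastICA estimator f with default whitening, components_ is the full unmixing map, so f.transform(X) equals (X - f.mean_) @ f.components_.T. A hedged sanity check of that convention on random data:

import numpy as np
from sklearn.decomposition import FastICA

X = np.random.RandomState(3).randn(500, 4)
f = FastICA(n_components=4, random_state=3, max_iter=1000).fit(X)
assert np.allclose(f.transform(X), (X - f.mean_) @ f.components_.T)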
print(f"Fold #{fold}") print("TRAIN:", index_subjects[train_index], "TEST:", index_subjects[test_index]) # load training and testing data print('Load training data... (view {})'.format(view)) train_data = np.concatenate([load_data(sub, view) for sub in index_subjects[train_index]]) print("Shape of the training data:", train_data.shape) print('Load testdata... (view {})'.format(view)) test_data = np.concatenate([load_data(sub, view) for sub in index_subjects[test_index]]) print("Shape of the test data:", test_data.shape) # Data normalization to range [-1, 1] # print("Data normalization to range [-1, 1]") scaler = MinMaxScaler() normalized_train_data = scaler.fit_transform(train_data) normalized_test_data = scaler.fit_transform(test_data) # intialize ica ica = FastICA(n_components=dim) # fit ica on training set ica.fit(normalized_train_data) # Apply the mapping (transform) to both the training set and the test set X_train_ica = ica.transform(normalized_train_data) X_test_ica = ica.transform(normalized_test_data) print("Original shape: ", normalized_train_data.shape) print("Transformed shape:", X_train_ica.shape) # Reconstruction of training data print("Reconstruction of training data... ") X_train_new = ica.inverse_transform(X_train_ica) print("Reconstructed matrix shape:", X_train_new.shape) mse = mean_squared_error(normalized_train_data, X_train_new)
def detectHeartRate(fps, video, sequence):
    def visualize(x, y, path):
        global IMAGE_IDX
        plt.figure(figsize=(15, 12))
        plt.tick_params(labelsize=23)
        plt.plot(x, y[0], color='red', linewidth=2, linestyle='-')
        plt.plot(x, y[1], color='green', linewidth=2, linestyle='-')
        plt.plot(x, y[2], color='blue', linewidth=2, linestyle='-')
        plt.savefig(path + "/signal" + str(IMAGE_IDX) + '.jpg')
        plt.close()
        IMAGE_IDX += 1

    frame_num = len(sequence)
    print("Frame number with human face:", frame_num)

    psd_path = "./results/psd/" + video[:video.index('/')]
    folder = os.path.exists(psd_path)
    if not folder:
        os.makedirs(psd_path)

    '''Normalize the RGB values'''
    sequence = np.array(sequence)
    x = [i for i in range(len(sequence))]
    sequence = signal.detrend(sequence, axis=0)
    visualize(x, sequence.T, psd_path)
    mean = np.mean(sequence, axis=0)
    std = np.std(sequence, axis=0)
    sequence = (sequence - mean) / std

    '''
    Apply ICA to clean the RGB signals
    input shape:  sequenceLength * 3
    output shape: sequenceLength * 3
    '''
    predictions = []
    n_window_frames = fps * WINDOW_LENGTH
    print("Frame number of sliding window:", n_window_frames)
    print("Number of sliding windows:", len(sequence) / fps - WINDOW_LENGTH)
    for start_idx in range(0, len(sequence) - n_window_frames, fps):
        window = sequence[start_idx:start_idx + min(n_window_frames, len(sequence))]
        x = [i for i in range(min(n_window_frames, len(sequence)))]
        visualize(x, window.T, psd_path)

        '''Apply the ICA method'''
        # print("ICA input shape:", window.shape)
        ica = FastICA(max_iter=2000, tol=0.1)
        transformed = ica.fit_transform(window)
        # print("output shape after ICA transformation:", transformed.shape)
        visualize(x, transformed.T, psd_path)

        '''Apply the FFT and PSD methods'''
        powerSpec = np.abs(np.fft.fft(transformed, axis=0)) ** 2
        maxPwrSrc = np.max(powerSpec, axis=1)
        freqs = np.fft.fftfreq(len(transformed), 1.0 / fps)

        '''Filter the HR signals using the frequency band'''
        valid_idx = np.where((freqs >= MIN_HR_BPM / 60) & (freqs <= MAX_HR_BMP / 60))
        valid_pwr = maxPwrSrc[valid_idx]
        valid_freqs = freqs[valid_idx]
        visualize(valid_freqs, powerSpec[valid_idx].T, psd_path)
        # visualize expects three series, so repeat the single max-power trace
        visualize(valid_freqs, np.tile(valid_pwr, (3, 1)), psd_path)

        '''Predict the heart rate'''
        max_pwr_idx = np.argmax(valid_pwr)
        predictions.append(valid_freqs[max_pwr_idx] * 60.0)

    return predictions
print(pca_model)

sparse_pca = SparsePCA(n_components=50)
# fit/transform with the SparsePCA estimator (the original mistakenly reused `pca` here)
sparse_pca_model = sparse_pca.fit(sparse_pca_data)
sparse_pca_X_new = sparse_pca.fit_transform(X)
joblib.dump(sparse_pca_model, 'sparse_pca_model.pkl')
joblib.dump(sparse_pca_X_new, 'sparse_pca_X_new.pkl')
print(sparse_pca_model)

kernel_pca = KernelPCA(n_components=50)
kernel_pca_model = kernel_pca.fit(kernel_pca_data)
kernel_X_new = kernel_pca.fit_transform(X)
joblib.dump(kernel_pca_model, 'kernel_pca_model.pkl')
joblib.dump(kernel_X_new, 'kernel_X_new.pkl')

fast_ica = FastICA(n_components=None)
fast_ica_start = time.time()
fast_ica_model = fast_ica.fit(fast_ica_data)
fast_ica_end = time.time()
print('fast_ica fit time', fast_ica_end - fast_ica_start)
fast_ica_X_new = fast_ica.transform(X)
joblib.dump(fast_ica_model, 'fast_ica_model.pkl')
joblib.dump(fast_ica_X_new, 'fast_ica_X_new.pkl')
print(fast_ica_model)

'''
nmf = NMF(n_components=None)
nmf_start = time.time()
#nmf_model = nmf.fit(nmf_data)
nmf_X_new = nmf.fit_transform(X)
nmf_end = time.time()
print 'nmf fit time', nmf_end - nmf_start
from sklearn.model_selection import train_test_split

# load the iris data
iris = datasets.load_iris()
data = iris['data']
print(data.shape)

# draw a scatter plot
fig = plt.figure()
plt.plot(data[:, 2], data[:, 3], 'k.')
plt.xlabel("petalLength")
plt.ylabel("petalWidth")
plt.show()

# run ICA
ICA = FastICA(n_components=2, random_state=0)  # build two basis components
X_transformed = ICA.fit_transform(data)

# visualize with one color per class (original features)
features = iris.data[:, [0, 2]]
plt.scatter(*features.T, c=[['orange', 'green', 'blue'][x] for x in iris.target])
plt.show()

# visualize after the transformation
fig = plt.figure()
features = X_transformed[:, [0, 1]]
plt.scatter(*features.T, c=[['orange', 'green', 'blue'][x] for x in iris.target])
plt.show()

print('--------------------------------')
# before modeling, perform some dimensionality reduction
remaining_comp = 15  # number of dimensions the data is reduced to

# PCA
pca = PCA(n_components=remaining_comp, random_state=420)
pca_train = pca.fit_transform(x_train)
pca_test = pca.transform(x_test)

# tSVD
tsvd = TruncatedSVD(n_components=remaining_comp, random_state=420)
tsvd_train = tsvd.fit_transform(x_train)
tsvd_test = tsvd.transform(x_test)

# ICA
ica = FastICA(n_components=remaining_comp, random_state=420)
ica_train = ica.fit_transform(x_train)
ica_test = ica.transform(x_test)

# GRP
grp = GaussianRandomProjection(n_components=remaining_comp, eps=0.1, random_state=420)
grp_train = grp.fit_transform(x_train)
grp_test = grp.transform(x_test)

# SRP
srp = SparseRandomProjection(n_components=remaining_comp, dense_output=True, random_state=420)
srp_train = srp.fit_transform(x_train)
srp_test = srp.transform(x_test)

# NMF
nmf = NMF(n_components=remaining_comp, init='nndsvdar', random_state=420)
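A hedged sketch of a typical follow-up to the block above: stacking the reduced train/test views into single feature matrices. It assumes the arrays share row order with x_train/x_test.

import numpy as np

train_decomp = np.hstack([pca_train, tsvd_train, ica_train, grp_train, srp_train])
test_decomp = np.hstack([pca_test, tsvd_test, ica_test, grp_test, srp_test])
print(train_decomp.shape)  # (n_train_rows, 5 * remaining_comp)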
)
logging.info('Step 1 - .. Done')

X_player, y_player = df[stg.PLAYER_FEATURES], df[stg.PLAYER_TARGET]

logging.info('Step 1 - Impute missing values with median ..')
dump(X_player.median().to_dict(), join(stg.MODELS_DIR, stg.PLAYER_FEATURES_MEDIAN_FILENAME))
dump(X_player.median().to_dict(), join(stg.SUBMISSION_DIR, stg.PLAYER_FEATURES_MEDIAN_FILENAME))
X_player.fillna(X_player.median(), inplace=True)
logging.info('Step 1 - .. Done')

logging.info('Step 1 - Fit and save pipeline to predict players..')
player_pipeline = make_pipeline(
    make_union(FastICA(tol=0.85), FunctionTransformer(copy)),
    ExtraTreesClassifier(n_estimators=75, max_depth=18, bootstrap=False,
                         criterion="gini", max_features=0.1,
                         min_samples_leaf=1, min_samples_split=2))

player_pipeline_light = make_pipeline(
    make_union(FastICA(tol=0.85), FunctionTransformer(copy)),
    ExtraTreesClassifier(n_estimators=75, max_depth=17, bootstrap=False,
                         criterion="gini", max_features=0.1,
                         min_samples_leaf=1,
def project(vector, shape, roimask=None, n_components=None, svd_multiplier=5,
            calc_residuals=True):
    '''
    Apply an ICA decomposition to the first axis of the input vector. If a
    roimask is provided, the flattened roimask will be used to crop the
    vector before decomposition. If n_components is not set, an adaptive
    SVD threshold is used (see approximate_svd_linearity_transition), with
    the hyperparameter svd_multiplier.

    Residuals lost in the ICA projection are captured if calc_residuals is
    True. This represents the signal lost by ICA compression.

    Arguments:
        vector: The (x*y, t) vector to be spatially ICA projected
        shape: The shape of the original movie (t, x, y)
        roimask: The roimask to crop the vectorized movie (x, y)
        n_components: Manually request a set number of ICA components
        svd_multiplier: The hyperparameter for SVD adaptive thresholding
        calc_residuals: Whether to calculate spatial and temporal residuals
            of projection compression

    Returns:
        components: A dictionary containing all the results, metadata, and
            information regarding the filter applied:

            mean: the original video mean
            roimask: the mask applied to the video before decomposing
            shape: the original shape of the movie array
            eig_mix: the ICA mixing matrix
            timecourses: the ICA component time series
            eig_vec: the eigenvectors
            n_components: the number of components in eig_vec (reduced to
                only have 25% of total components as noise)
            project_meta: the metadata for the ICA projection
            expmeta: all metadata created for this class
            lag1: the lag-1 autocorrelation
            noise_components: a vector (n components long) storing a binary
                representation of which components were detected as noise
            cutoff: the signal-noise cutoff value

            If n_components was set automatically, the following additional
            keys are also returned in components:

            svd_cutoff: the number of components originally decomposed
            lag1_full: the lag-1 autocorrelation of the full set of
                components decomposed before cropping to only 25% noise
                components
            svd_multiplier: the SVD multiplier value used to determine the
                cutoff
    '''
    print('\nCalculating Eigenspace\n-----------------------')
    assert (vector.ndim == 2), (
        'vector was not a two-dimensional np array. '
        'If input is a movie, be sure to convert shape to (xy, t)')

    if roimask is not None:
        print('Using roimask to crop video')
        assert roimask.size == vector.shape[0], \
            'Vector was not the same size as the cropped mask'
        print('Original size:', vector.shape)
        maskind = np.where(roimask.flat == 1)
        vector = vector[maskind]
        print('Reduced size:', vector.shape)

    mean = np.mean(vector, 0).flatten()
    vector = vector - mean

    components = {}
    components['mean'] = mean
    components['roimask'] = roimask
    components['shape'] = shape

    if svd_multiplier is None:
        svd_multiplier = 5

    if vector.dtype == np.float16:
        vector = vector.astype('float32', copy=False)

    if n_components is None:
        print('Calculating ICA (with n_component SVD estimator)...')
        t0 = timer()

        try:
            u, ev, _ = linalg.svd(vector, full_matrices=False)
        except ValueError:
            # LAPACK error if matrices are too big
            u, ev, _ = linalg.svd(vector, full_matrices=False,
                                  lapack_driver='gesvd')
        components['svd_eigval'] = ev

        # starting point for the decomposition: svd_multiplier times the
        # approximate point of transition to linearity in the tail of the
        # ev components
        cross_1 = approximate_svd_linearity_transition(ev)
        n_components = cross_1 * svd_multiplier

        components['increased_cutoff'] = 0

        while True:
            print('\nCalculating ICA with', n_components, 'components...')
            w_init = u[:n_components, :n_components].astype('float64')
            ica = FastICA(n_components=n_components, random_state=1000,
                          w_init=w_init)
            eig_vec = ica.fit_transform(vector)
            eig_mix = ica.mixing_

            noise, cutoff = sort_noise(eig_mix.T)
            p_signal = (1 - noise.sum() / noise.size) * 100

            if noise.size == shape[0]:
                # all components are being used
                break
            elif p_signal < 75:
                print('ICA components were under 75% signal ({0}% signal).'
                      .format(p_signal))
                break
            elif n_components >= shape[0]:
                print('ICA components were under 75% signal ({0}% signal).'
                      .format(p_signal))
                print('However, number of components is maxed out.')
                print('Using this decomposition...')
                break
            else:
                print('ICA components were over 75% signal ({0}% signal).'
                      .format(p_signal))
                print('Recalculating with more components...')
                n_components += n_components // 2
                components['increased_cutoff'] += 1

                if n_components > shape[0]:
                    print('\nComponents maxed out!')
                    print('\tAttempted:', n_components)
                    n_components = shape[0]
                    print('\tReduced to:', shape[0])

        components['lag1_full'] = lag_n_autocorr(eig_mix.T, 1)
        components['svd_multiplier'] = svd_multiplier

        print('Cropping excess noise components')
        components['svd_cutoff'] = n_components
        reduced_n_components = int((noise.size - noise.sum()) * 1.25)
        print('reduced_n_components:', reduced_n_components)

        if reduced_n_components < n_components:
            print('Cropping', n_components, 'to', reduced_n_components)
            ev_sort = np.argsort(eig_mix.std(axis=0))
            eig_vec = eig_vec[:, ev_sort][:, ::-1]
            eig_mix = eig_mix[:, ev_sort][:, ::-1]
            noise = noise[ev_sort][::-1]

            eig_vec = eig_vec[:, :reduced_n_components]
            eig_mix = eig_mix[:, :reduced_n_components]
            n_components = reduced_n_components
            noise = noise[:reduced_n_components]
            components['lag1_full'] = components['lag1_full'][ev_sort][::-1]
        else:
            print('Less than 75% signal. Not cropping excess noise.')
        components['noise_components'] = noise
        components['cutoff'] = cutoff

        t = timer() - t0
        print('Independent Component Analysis took: {0} sec'.format(t))

    else:
        print('Calculating ICA (' + str(n_components) + ' components)...')
        t0 = timer()
        ica = FastICA(n_components=n_components, random_state=1000)

        try:
            eig_vec = ica.fit_transform(vector)  # Eigenbrains
        except ValueError:
            print('Calculation exceeded float32 maximum.')
            print('Trying again with float64 vector...')
            # a ValueError is raised if any value exceeds the float32
            # maximum; overcome this by converting to float64
            eig_vec = ica.fit_transform(vector.astype('float64'))

        t = timer() - t0
        print('Independent Component Analysis took: {0} sec'.format(t))

        eig_mix = ica.mixing_

        # sort components by their eigenvalue influence (approximated by
        # timecourse standard deviation)
        ev_sort = np.argsort(eig_mix.std(axis=0))
        eig_vec = eig_vec[:, ev_sort][:, ::-1]
        eig_mix = eig_mix[:, ev_sort][:, ::-1]

        noise, cutoff = sort_noise(eig_mix.T)
        components['noise_components'] = noise
        components['cutoff'] = cutoff

    print('components shape:', eig_vec.shape)

    components['eig_mix'] = eig_mix
    components['timecourses'] = eig_mix.T

    n_components = eig_vec.shape[1]
    components['eig_vec'] = eig_vec
    components['n_components'] = n_components
    components['lag1'] = lag_n_autocorr(components['timecourses'], 1)

    if calc_residuals:
        try:
            vector = vector.astype('float64')
            rebuilt = rebuild(components, artifact_components='none',
                              vector=True).T

            rebuilt -= rebuilt.mean(axis=0)
            vector -= vector.mean(axis=0)

            residuals = np.abs(vector - rebuilt)
            residuals_temporal = residuals.mean(axis=0)

            if roimask is not None:
                residuals_spatial = np.zeros(roimask.shape)
                residuals_spatial.flat[maskind] = residuals.mean(axis=1)
            else:
                residuals_spatial = np.reshape(residuals.mean(axis=1),
                                               (shape[1], shape[2]))

            components['residuals_spatial'] = residuals_spatial
            components['residuals_temporal'] = residuals_temporal
        except Exception as e:
            print('Residual Calculation Failed!!')
            print('\t', e)

    # Save filter metadata about how and when the movie was filtered
    project_meta = {}
    project_meta['time_elapsed'] = t
    project_meta['date'] = datetime.now().strftime('%Y%m%d')[2:]
    fmt = '%Y-%m-%dT%H:%M:%SZ'
    project_meta['tstmp'] = datetime.now().strftime(fmt)
    project_meta['n_components'] = n_components
    components['project_meta'] = project_meta

    print('\n')

    return components
# plt.show()
# Nope, actually health isn't really trending up at all recently.

# How about some nice ICA with your data
# FastICA(n_components=n_components, whiten=True),

# Compute ICA
# ica = FastICA(n_components=3)
# S_ = ica.fit_transform(X)  # Reconstruct signals

###############################################
## Number of components to use in ICA        ##
###############################################
ncomp = 13
###############################################

ica = FastICA(n_components=ncomp, whiten=True)
ica.fit(bigdf)
# icafittrans = ica.fit_transform(bigdf)
icafittrans = ica.transform(bigdf)
print(icafittrans.shape)
icafittrans = pd.DataFrame(icafittrans)
icafittrans.index = dfDateIndex

A_ = ica.mixing_  # Get estimated mixing matrix
# nrows: number of search terms
# ncols: number of components (that we chose)
# so each column is a list of that component's contribution to each search term
# so if we sort a column, that will give us the top search terms for that
# component! (see the sketch below)
# but let's make it a data frame and label everything correctly for convenience
A_ = pd.DataFrame(A_)
# set the row names
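A sketch of the lookup the comments above describe, assuming the rows of A_ get labeled with the search-term names (e.g. A_.index = bigdf.columns): sorting one column of the mixing matrix by absolute weight surfaces that component's top search terms.

component = 0  # hypothetical: inspect the first component
top_terms = A_[component].abs().sort_values(ascending=False).head(10)
print(top_terms)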
if l.strip() == "": continue sp = l.split(',') print(sp) nums[i, 0] = idxs[sp[0].strip()] # TODO i,1 nums[i, 2:] = [int(s) for s in sp[2:]] print(nums) print("PCA...") import matplotlib.pyplot as plt # from sklearn.decomposition import PCA # pca = PCA(n_components=3) from sklearn.decomposition import FastICA pca = FastICA(n_components=3) pca.fit(nums[:, 2:]) # print(pca.explained_variance_ratio_) # print(pca.singular_values_) import matplotlib.pyplot as plt fig = plt.figure() ax = fig.add_subplot(111, projection='3d') import matplotlib.cm as cm colors = cm.rainbow(np.linspace(0, 1, len(names))) for i, xyz in enumerate(pca.transform(nums[:, 2:])): ax.scatter(xyz[0], xyz[1], xyz[2], color=colors[int(nums[i][0])]) ax.set_xlabel('X Label')
def run(self, df, train, test, nbite, reg, alpha=0.001, verbose=True):
    """
    df: pandas dataframe
    nbite: number of iterations
    reg: regularization parameter
    alpha: learning rate
    """
    # per-item mean baseline
    # baseline = tf.constant(np.tile(np.array(df.mean(axis=0)), (df.shape[1], 1)))
    shape = df.shape

    # constant: the full matrix R to reconstruct
    R = tf.constant(df.values)

    # TensorFlow mask variables
    mask_tf_train = tf.Variable(train)
    mask_tf_test = tf.Variable(test)

    # TensorFlow variables
    if self.init == "random":
        # U and I initialized from a normal distribution, normalized by dividing by k
        U = tf.Variable(np.abs(
            np.random.normal(scale=1. / self.k,
                             size=(shape[0], self.k)).astype(np.float64)),
                        name="U")
        I = tf.Variable(np.abs(
            np.random.normal(scale=1. / self.k,
                             size=(self.k, shape[1])).astype(np.float64)),
                        name="I")
    if self.init == "ica":
        matrix = dok_matrix(df.shape, dtype=np.float64)
        for i in range(df.shape[0]):
            for j in range(df.shape[1]):
                if not np.isnan(df.values[i, j]):
                    matrix[i, j] = df.values[i, j]
        ica = FastICA(n_components=self.k)
        U = tf.Variable(np.abs(ica.fit_transform(matrix.toarray())), name="U")
        I = tf.Variable(np.abs(ica.components_), name="I")
    if self.init == "pca":
        matrix = dok_matrix(df.shape, dtype=np.float64)
        for i in range(df.shape[0]):
            for j in range(df.shape[1]):
                if not np.isnan(df.values[i, j]):
                    matrix[i, j] = df.values[i, j]
        pca = PCA(n_components=self.k)
        U = tf.Variable(np.abs(pca.fit_transform(matrix.toarray())), name="U")
        I = tf.Variable(np.abs(pca.components_), name="I")

    R_pred = tf.matmul(U, I)  # embeddings

    # beta: regularization parameter
    beta = tf.constant(reg, dtype=tf.float64, name="beta")
    # L1 regularization
    regularizer = beta * (tf.reduce_sum(U) + tf.reduce_sum(I))

    # cost of the NMF algorithm: matrix norm of R - R_pred on the training mask
    cost = tf.reduce_sum(
        tf.square(
            tf.boolean_mask(R, mask_tf_train) -
            tf.boolean_mask(R_pred, mask_tf_train)))
    cost += regularizer

    # non-negativity constraints on U and I
    clip_U = U.assign(tf.maximum(tf.zeros_like(U), U))
    clip_I = I.assign(tf.maximum(tf.zeros_like(I), I))
    clip = tf.group(clip_U, clip_I)

    # train MSE
    mse_train = tf.reduce_mean(tf.square(
        tf.boolean_mask(R_pred, mask_tf_train) -
        tf.boolean_mask(R, mask_tf_train)),
        name="mse_train")
    # test MSE
    mse_test = tf.reduce_mean(tf.square(
        tf.boolean_mask(R_pred, mask_tf_test) -
        tf.boolean_mask(R, mask_tf_test)),
        name="mse_test")

    # baseline MSE on the test mask
    # baselineMSE = tf.reduce_mean(tf.square(tf.boolean_mask(baseline, mask_tf_test) - tf.boolean_mask(R, mask_tf_test)))

    global_step = tf.Variable(0, trainable=False)
    if self.solver == "adam":
        optimizer = tf.train.AdamOptimizer(alpha).minimize(
            cost, global_step=global_step)
    elif self.solver == "sgd":
        optimizer = tf.train.GradientDescentOptimizer(alpha).minimize(
            cost, global_step=global_step)
    else:
        # RMSProp optimizer
        optimizer = tf.train.RMSPropOptimizer(alpha).minimize(
            cost, global_step=global_step)

    costs = []
    mses_train = []
    mses_test = []

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    for i in range(nbite):
        sess.run(optimizer)
        sess.run(clip)
        if i % 100 == 0:
            prout = sess.run(cost)
            lol = sess.run(mse_train)
            mdr = sess.run(mse_test)
            if verbose:
                print("cost: %f" % prout)
                print("mse train: %f" % lol)
                print("mse test: %f" % mdr)
                print("***************")
            costs.append((i, prout))
            mses_train.append((i, lol))
            mses_test.append((i, mdr))

    learnt_U = sess.run(U)
    learnt_I = sess.run(I)
    # msebaseline = sess.run(baselineMSE)
    # if verbose:
    #     print("baseline: ", msebaseline)
    sess.close()

    return learnt_U, learnt_I, {
        "mse_train": mses_train,
        "mse_test": mses_test,
        "cost": costs
    }