def test_feature_agglomeration():
    n_clusters = 1
    X = np.array([0, 0, 1]).reshape(1, 3)  # (n_samples, n_features)

    agglo_mean = FeatureAgglomeration(n_clusters=n_clusters,
                                      pooling_func=np.mean)
    agglo_median = FeatureAgglomeration(n_clusters=n_clusters,
                                        pooling_func=np.median)
    agglo_mean.fit(X)
    agglo_median.fit(X)
    assert_true(np.size(np.unique(agglo_mean.labels_)) == n_clusters)
    assert_true(np.size(np.unique(agglo_median.labels_)) == n_clusters)
    assert_true(np.size(agglo_mean.labels_) == X.shape[1])
    assert_true(np.size(agglo_median.labels_) == X.shape[1])

    # Test transform
    Xt_mean = agglo_mean.transform(X)
    Xt_median = agglo_median.transform(X)
    assert_true(Xt_mean.shape[1] == n_clusters)
    assert_true(Xt_median.shape[1] == n_clusters)
    assert_true(Xt_mean == np.array([1 / 3.]))
    assert_true(Xt_median == np.array([0.]))

    # Test inverse transform
    X_full_mean = agglo_mean.inverse_transform(Xt_mean)
    X_full_median = agglo_median.inverse_transform(Xt_median)
    assert_true(np.unique(X_full_mean[0]).size == n_clusters)
    assert_true(np.unique(X_full_median[0]).size == n_clusters)

    assert_array_almost_equal(agglo_mean.transform(X_full_mean), Xt_mean)
    assert_array_almost_equal(agglo_median.transform(X_full_median),
                              Xt_median)
def test_ward_agglomeration():
    """
    Check that we obtain the correct solution in a simplistic case
    """
    rnd = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)
    X = rnd.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    assert_warns(DeprecationWarning, WardAgglomeration)
    with ignore_warnings():
        ward = WardAgglomeration(n_clusters=5, connectivity=connectivity)
        ward.fit(X)

    agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity)
    agglo.fit(X)
    assert_array_equal(agglo.labels_, ward.labels_)
    assert_true(np.size(np.unique(agglo.labels_)) == 5)

    X_red = agglo.transform(X)
    assert_true(X_red.shape[1] == 5)
    X_full = agglo.inverse_transform(X_red)
    assert_true(np.unique(X_full[0]).size == 5)
    assert_array_almost_equal(agglo.transform(X_full), X_red)

    # Check that fitting with no samples raises a ValueError
    assert_raises(ValueError, agglo.fit, X[:0])
def test_feature_agglomeration():
    n_clusters = 1
    X = np.array([0, 0, 1]).reshape(1, 3)  # (n_samples, n_features)

    agglo_mean = FeatureAgglomeration(n_clusters=n_clusters,
                                      pooling_func=np.mean)
    agglo_median = FeatureAgglomeration(n_clusters=n_clusters,
                                        pooling_func=np.median)
    assert_no_warnings(agglo_mean.fit, X)
    assert_no_warnings(agglo_median.fit, X)
    assert np.size(np.unique(agglo_mean.labels_)) == n_clusters
    assert np.size(np.unique(agglo_median.labels_)) == n_clusters
    assert np.size(agglo_mean.labels_) == X.shape[1]
    assert np.size(agglo_median.labels_) == X.shape[1]

    # Test transform
    Xt_mean = agglo_mean.transform(X)
    Xt_median = agglo_median.transform(X)
    assert Xt_mean.shape[1] == n_clusters
    assert Xt_median.shape[1] == n_clusters
    assert Xt_mean == np.array([1 / 3.])
    assert Xt_median == np.array([0.])

    # Test inverse transform
    X_full_mean = agglo_mean.inverse_transform(Xt_mean)
    X_full_median = agglo_median.inverse_transform(Xt_median)
    assert np.unique(X_full_mean[0]).size == n_clusters
    assert np.unique(X_full_median[0]).size == n_clusters

    assert_array_almost_equal(agglo_mean.transform(X_full_mean), Xt_mean)
    assert_array_almost_equal(agglo_median.transform(X_full_median),
                              Xt_median)
def test_ward_agglomeration():
    """
    Check that we obtain the correct solution in a simplistic case
    """
    rnd = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)
    X = rnd.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    assert_warns(DeprecationWarning, WardAgglomeration)

    with warnings.catch_warnings(record=True) as warning_list:
        warnings.simplefilter("always", DeprecationWarning)
        if hasattr(np, 'VisibleDeprecationWarning'):
            # Let's not catch the numpy internal DeprecationWarnings
            warnings.simplefilter('ignore', np.VisibleDeprecationWarning)
        ward = WardAgglomeration(n_clusters=5, connectivity=connectivity)
        ward.fit(X)
    assert_equal(len(warning_list), 1)

    agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity)
    agglo.fit(X)
    assert_array_equal(agglo.labels_, ward.labels_)
    assert_true(np.size(np.unique(agglo.labels_)) == 5)

    X_red = agglo.transform(X)
    assert_true(X_red.shape[1] == 5)
    X_full = agglo.inverse_transform(X_red)
    assert_true(np.unique(X_full[0]).size == 5)
    assert_array_almost_equal(agglo.transform(X_full), X_red)

    # Check that fitting with no samples raises a ValueError
    assert_raises(ValueError, agglo.fit, X[:0])
def test_ward_agglomeration():
    """
    Check that we obtain the correct solution in a simplistic case
    """
    rng = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)
    X = rng.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    assert_warns(DeprecationWarning, WardAgglomeration)
    with ignore_warnings():
        ward = WardAgglomeration(n_clusters=5, connectivity=connectivity)
        ward.fit(X)

    agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity)
    agglo.fit(X)
    assert_array_equal(agglo.labels_, ward.labels_)
    assert_true(np.size(np.unique(agglo.labels_)) == 5)

    X_red = agglo.transform(X)
    assert_true(X_red.shape[1] == 5)
    X_full = agglo.inverse_transform(X_red)
    assert_true(np.unique(X_full[0]).size == 5)
    assert_array_almost_equal(agglo.transform(X_full), X_red)

    # Check that fitting with no samples raises a ValueError
    assert_raises(ValueError, agglo.fit, X[:0])
def TrainRFRegression(df1):
    # generate the equation to use for our design matrices
    eqn = build_eqn(df1, 'regressand', ['any_regressand', 'X25', 'X26', 'X29'])
    # build our design matrices
    y, X = dmatrices(eqn, data=df1, return_type='dataframe')

    # employ clustering to reduce our dimensionality
    X_reduction = FeatureAgglomeration(n_clusters=50).fit(X, np.ravel(y))
    reduced_X = X_reduction.transform(X)

    # define our regressor
    mod = RandomForestRegressor(n_estimators=50)
    # fit our data
    res = mod.fit(reduced_X, np.ravel(y))

    # evaluate our fit
    yp = pd.DataFrame({'predicted': res.predict(reduced_X)})
    yp = yp['predicted']
    yt = y['regressand']
    r2 = metrics.r2_score(yt, yp)
    rmse = np.sqrt(metrics.mean_squared_error(yt, yp))

    # save our fitted regressor; note that the feature agglomerator is not
    # pickled here, so the test-time code has to re-fit its own reduction
    with open('RFR_trained_model.pickle', 'wb') as output:
        pickle.dump(res, output, pickle.HIGHEST_PROTOCOL)
    return r2, rmse
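Because the function above only pickles the fitted regressor, the feature agglomeration has to be re-fit at prediction time (as the TestRFRegression snippet later in this collection does). A minimal sketch, assuming numpy and scikit-learn only, of how the reduction and the regressor could be persisted together instead; the file name 'RFR_pipeline.pickle' and the helper name are illustrative, not the author's code.

# Hedged sketch: bundle FeatureAgglomeration and the random forest in one
# Pipeline so a single pickle captures both the learned feature grouping and
# the regressor. Names here are invented for illustration.
import pickle
import numpy as np
from sklearn.cluster import FeatureAgglomeration
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

def train_rf_pipeline(X, y, n_clusters=50, n_estimators=50):
    pipe = Pipeline([
        ("agglo", FeatureAgglomeration(n_clusters=n_clusters)),
        ("rf", RandomForestRegressor(n_estimators=n_estimators)),
    ])
    pipe.fit(X, np.ravel(y))
    with open('RFR_pipeline.pickle', 'wb') as output:
        pickle.dump(pipe, output, pickle.HIGHEST_PROTOCOL)
    return pipe

# At test time the same object reproduces the training-time reduction:
#     with open('RFR_pipeline.pickle', 'rb') as f:
#         pipe = pickle.load(f)
#     predictions = pipe.predict(X_new)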
def feature_agglomeration(voters_data, n, rounding=False):
    featagg = FeatureAgglomeration(n_clusters=n)
    featagg.fit(voters_data)
    condensed = featagg.transform(voters_data)

    feature_groups_map = dict(zip(voters_data.columns, featagg.labels_))
    feature_groups_nos = []
    for feature_group_key in feature_groups_map:
        feature_groups_nos.append(feature_groups_map[feature_group_key])

    group_labels = []
    for feature_group_no in set(feature_groups_nos):
        group_label = ""
        for feature_groups_key in feature_groups_map:
            if feature_groups_map[feature_groups_key] == feature_group_no:
                group_label = group_label + feature_groups_key + ", "
        group_labels.append(group_label[0:-2])

    voters_agglomerated = pd.DataFrame(condensed, columns=group_labels,
                                       index=voters_data.index)
    if rounding:
        voters_agglomerated = voters_agglomerated.applymap(lambda x: round(x))

    print("🔹→💠←🔹 {} features agglomerated into {} hybrid features.".format(
        len(voters_data.columns), len(voters_agglomerated.columns)))
    return voters_agglomerated
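A hedged usage sketch for the helper above; the toy columns are invented for illustration and are not the project's real voter features.

# Invented toy data, only to show the shape of the output and the generated
# hybrid-feature names (each new column is named after the original columns
# it aggregates).
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
toy_voters = pd.DataFrame(
    rng.rand(100, 4),
    columns=["age_scaled", "income_scaled", "turnout_2016", "turnout_2018"],
)
condensed = feature_agglomeration(toy_voters, n=2)
print(condensed.columns.tolist())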
def TestSGDRegression(df1):
    # generate the equation to use for our design matrices
    eqn = build_eqn(df1, 'regressand',
                    ['any_regressand', 'X25', 'X26', 'X29', 'X13'])
    # build our design matrices
    X = dmatrix(eqn.replace('regressand ~ ', '0+'), data=df1,
                return_type='dataframe')

    # load our fitted regressor (the scaler and feature agglomerator are not
    # persisted by TrainSGDRegression, so they are re-fit on the test data below)
    with open('SGD_trained_model.pickle', 'rb') as input:
        res = pickle.load(input)

    # employ clustering to reduce our dimensionality
    # (n_clusters is expected to be defined at module level)
    X_reduction = FeatureAgglomeration(n_clusters=n_clusters).fit(X)
    reduced_X = X_reduction.transform(X)

    # standardize our data
    X_scaler = StandardScaler().fit(reduced_X)
    std_X = X_scaler.transform(reduced_X)

    # predict the interest rates
    yp = res.predict(std_X)
    return yp
class Regressor(BaseEstimator):
    def __init__(self):
        self.clf = Pipeline([
            ("RF", RandomForestRegressor(n_estimators=200, max_depth=15,
                                         n_jobs=N_JOBS))])
        self.scaler = StandardScaler()
        self.agglo = FeatureAgglomeration(n_clusters=500)

    def fit(self, X, y):
        y = y.ravel()
        n_samples, n_lags, n_lats, n_lons = X.shape
        self.scaler.fit(X[:, -1].reshape(n_samples, -1))
        X = X.reshape(n_lags * n_samples, -1)
        connectivity = grid_to_graph(n_lats, n_lons)
        self.agglo.connectivity = connectivity
        X = self.scaler.transform(X)
        X = self.agglo.fit_transform(X)
        X = X.reshape(n_samples, -1)
        self.clf.fit(X, y)

    def predict(self, X):
        n_samples, n_lags, n_lats, n_lons = X.shape
        X = X.reshape(n_lags * n_samples, -1)
        X = self.scaler.transform(X)
        X = self.agglo.transform(X)
        X = X.reshape(n_samples, -1)
        return self.clf.predict(X)
class FeatureAgglomerationDecomposer(Transformer):
    type = 11

    def __init__(self, n_clusters=2, affinity='euclidean', linkage='ward',
                 pooling_func='mean', random_state=1):
        super().__init__("feature_agglomeration_decomposer")
        self.input_type = [NUMERICAL, DISCRETE, CATEGORICAL]
        self.compound_mode = 'only_new'
        self.output_type = NUMERICAL

        self.n_clusters = n_clusters
        self.affinity = affinity
        self.linkage = linkage
        self.pooling_func = pooling_func
        self.random_state = random_state

        self.pooling_func_mapping = dict(mean=np.mean, median=np.median,
                                         max=np.max)

    @ease_trans
    def operate(self, input_datanode, target_fields=None):
        from sklearn.cluster import FeatureAgglomeration

        X, y = input_datanode.data

        if self.model is None:
            self.n_clusters = int(self.n_clusters)
            n_clusters = min(self.n_clusters, X.shape[1])
            if not callable(self.pooling_func):
                self.pooling_func = self.pooling_func_mapping[self.pooling_func]

            self.model = FeatureAgglomeration(
                n_clusters=n_clusters, affinity=self.affinity,
                linkage=self.linkage, pooling_func=self.pooling_func)
            self.model.fit(X)

        X_new = self.model.transform(X)
        return X_new

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None,
                                        optimizer='smac'):
        cs = ConfigurationSpace()
        n_clusters = UniformIntegerHyperparameter("n_clusters", 2, 400,
                                                  default_value=25)
        affinity = CategoricalHyperparameter(
            "affinity", ["euclidean", "manhattan", "cosine"],
            default_value="euclidean")
        linkage = CategoricalHyperparameter(
            "linkage", ["ward", "complete", "average"], default_value="ward")
        pooling_func = CategoricalHyperparameter(
            "pooling_func", ["mean", "median", "max"], default_value="mean")
        cs.add_hyperparameters([n_clusters, affinity, linkage, pooling_func])

        # ward linkage is only defined for euclidean distances
        affinity_and_linkage = ForbiddenAndConjunction(
            ForbiddenInClause(affinity, ["manhattan", "cosine"]),
            ForbiddenEqualsClause(linkage, "ward"))
        cs.add_forbidden_clause(affinity_and_linkage)
        return cs
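The forbidden clause in the search space above mirrors a constraint enforced by scikit-learn itself: ward linkage only works with euclidean distances. A minimal, hedged sketch of that constraint using FeatureAgglomeration directly; it uses the `affinity` keyword to match the class above, although recent scikit-learn releases rename it to `metric`.

# Hedged sketch (assumes a scikit-learn version that still accepts `affinity`):
# the ward + manhattan combination is rejected at fit time, which is why the
# configuration space forbids it up front.
import numpy as np
from sklearn.cluster import FeatureAgglomeration

X = np.random.RandomState(0).rand(20, 10)

ok = FeatureAgglomeration(n_clusters=3, affinity="euclidean", linkage="ward")
ok.fit(X)  # fine

try:
    bad = FeatureAgglomeration(n_clusters=3, affinity="manhattan",
                               linkage="ward")
    bad.fit(X)
except ValueError as exc:
    print("rejected as expected:", exc)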
def featureagglomeration(data_train, data_test, label_train, label_test, args):
    print('feature agglomeration')
    FA = FeatureAgglomeration(n_clusters=10).fit(data_train)
    transformation = FA.transform(data_test)
    agglomeration = find_highest(transformation)
    print('feature agglomeration done')
    compare_class(agglomeration, label_test)
    if args.create_mean:
        create_images_from_rows('fa', mean_image(agglomeration, data_test))
def test_ward_agglomeration():
    # Check that we obtain the correct solution in a simplistic case
    rng = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)
    X = rng.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity)
    agglo.fit(X)
    assert np.size(np.unique(agglo.labels_)) == 5

    X_red = agglo.transform(X)
    assert X_red.shape[1] == 5
    X_full = agglo.inverse_transform(X_red)
    assert np.unique(X_full[0]).size == 5
    assert_array_almost_equal(agglo.transform(X_full), X_red)

    # Check that fitting with no samples raises a ValueError
    with pytest.raises(ValueError):
        agglo.fit(X[:0])
def main(): # Parameters data_directory = '../../data/generated-data-r-10-n-6-4/' features_path = '../../data/features-generated-data-r-10-n-6-4' booking_file = '../../data/booking.csv' users_file = '../../data/user.csv' rating_thresholds = [] true_objects_indexes = [0, 1, 2, 3, 4, 5] false_objects_indexes = [6, 7, 8, 9] file_names = os.listdir(data_directory) img_ids_vector = [int(name.split('-')[0]) for name in file_names] ratings_vector = [int(name.split('-')[-2]) for name in file_names] name_vector = [data_directory + name for name in file_names] images_indexes = [name.split('-')[3].split('.')[0] for name in file_names] ratings_matrix, images_indexes_for_id, ids_indexes, users_matrix = load_data( data_directory, booking_file, users_file, rating_thresholds) features = get_features(features_path, name_vector) fa = FeatureAgglomeration(n_clusters=50) fa.fit(features) features = fa.transform(features) scores_auc = [] scores_rmse = [] for i in range(10): cv_results_file = '../results/cv-generated-data-r-10-n-6-4-rf-fa-' + str( i) + '.csv' selection = ObjectSelection(show_selection_results=False, selection_algorithm='rf') selection.transform(ids=img_ids_vector, features=features, ratings=ratings_vector, users_ratings=ratings_matrix, users=users_matrix, cv_results_file=cv_results_file, images_indexes=images_indexes, true_objects_indexes=true_objects_indexes, false_objects_indexes=false_objects_indexes, paths=name_vector, z_score=False) selection.evaluate(evaluation_metric='auc') selection.evaluate(evaluation_metric='rmse') print('\n\n-----\n\n') score_auc, score_rmse = selection.evaluate(evaluation_metric='auc') scores_auc.append(score_auc) scores_rmse.append(score_rmse) results_file = '../scores/generated-data-r-10-n-6-4-rf-fa-auc.csv' save_scores(scores_auc, results_file) results_file = '../scores/generated-data-r-10-n-6-4-rf-fa-rmse.csv' save_scores(scores_rmse, results_file)
def perform_feature_agglomeration(train_X, train_Y, test_X, test_Y): n_clusters = [32] fagg_model_accuracies = pd.DataFrame() for n_cluster in n_clusters: agglo = FeatureAgglomeration(connectivity=None, n_clusters=n_cluster) agglo.fit(train_X) train_X_reduced = agglo.transform(train_X) test_X_reduced = agglo.transform(test_X) svc_acc_val = perform_svc(train_X_reduced, train_Y, test_X_reduced, test_Y) rfc_acc_val = perform_rfc(train_X_reduced, train_Y, test_X_reduced, test_Y) knn_acc_val = perform_knn(train_X_reduced, train_Y, test_X_reduced, test_Y) lr_acc_val = perform_linear_regression(train_X_reduced, train_Y, test_X_reduced, test_Y) lc_acc_val = perform_linear_lasso(train_X_reduced, train_Y, test_X_reduced, test_Y) rr_acc_val = perform_ridge_regression(train_X_reduced, train_Y, test_X_reduced, test_Y) enet_acc_val = perform_elastinet_regression(train_X_reduced, train_Y, test_X_reduced, test_Y) fagg_model_accuracies = fagg_model_accuracies.append([ svc_acc_val, rfc_acc_val, knn_acc_val, lr_acc_val, lc_acc_val, rr_acc_val, enet_acc_val ]) cols = list(fagg_model_accuracies.columns.values) cols = cols[-1:] + cols[:-1] fagg_model_accuracies = fagg_model_accuracies[cols] fagg_model_accuracies = fagg_model_accuracies.sort_values( by='r2_score') return fagg_model_accuracies
def _ward_fit_transform(all_subjects_data, fit_samples_indices, connectivity,
                        n_parcels, offset_labels):
    """Ward clustering algorithm on a subsample and apply to the whole dataset.

    Computes a brain parcellation using Ward's clustering algorithm on some
    images, then averages the signal within parcels in order to reduce the
    dimension of the images of the whole dataset.

    This function is used with Randomized Parcellation Based Inference, so we
    need to save the labels to further perform the inverse transformation
    operation. The function therefore needs an offset to be applied on the
    labels so that they are unique across parcellations.

    Parameters
    ----------
    all_subjects_data : array_like, shape=(n_samples, n_voxels)
      Masked subject images as an array.

    fit_samples_indices : array-like,
      Indices of the samples used to compute the parcellation.

    connectivity : scipy.sparse.coo_matrix,
      Graph representing the spatial structure of the images (i.e. connections
      between voxels).

    n_parcels : int,
      Number of parcels for the parcellations.

    offset_labels : int,
      Offset for labels numbering. The purpose is to have different labels in
      all the parcellations that can be built by multiple calls to the current
      function.

    Returns
    -------
    parcelled_data : numpy.ndarray, shape=(n_samples, n_parcels)
      Average signal within each parcel for each subject.

    labels : np.ndarray, shape=(n_voxels,)
      Labels giving the correspondence between voxels and parcels.

    """
    # fit part
    data_fit = all_subjects_data[fit_samples_indices]
    ward = FeatureAgglomeration(n_clusters=n_parcels,
                                connectivity=connectivity)
    ward.fit(data_fit)

    # transform part
    labels = ward.labels_ + offset_labels  # unique labels across parcellations
    parcelled_data = ward.transform(all_subjects_data)

    return parcelled_data, labels
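A hedged, self-contained sketch of how the helper above can be exercised on synthetic data; the shapes and the 50% subsample are illustrative, not taken from the original pipeline.

# Synthetic example: a flat 10 x 10 "image" (100 voxels), 30 samples, and a
# parcellation estimated on half of the samples but applied to all of them.
import numpy as np
from sklearn.feature_extraction.image import grid_to_graph

rng = np.random.RandomState(0)
n_x, n_y = 10, 10
n_samples, n_parcels = 30, 5
all_subjects_data = rng.randn(n_samples, n_x * n_y)
connectivity = grid_to_graph(n_x, n_y)

fit_samples_indices = rng.choice(n_samples, size=n_samples // 2, replace=False)
parcelled_data, labels = _ward_fit_transform(
    all_subjects_data, fit_samples_indices, connectivity,
    n_parcels=n_parcels, offset_labels=0)

print(parcelled_data.shape)   # (30, 5)
print(np.unique(labels))      # parcel ids, shifted by offset_labels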
def test_feature_agglomeration():
    n_clusters = 1
    X = np.array([0, 0, 1]).reshape(1, 3)  # (n_samples, n_features)

    agglo_mean = FeatureAgglomeration(n_clusters=n_clusters,
                                      pooling_func=np.mean)
    agglo_median = FeatureAgglomeration(n_clusters=n_clusters,
                                        pooling_func=np.median)

    with pytest.warns(None) as record:
        agglo_mean.fit(X)
    assert not [w.message for w in record]
    with pytest.warns(None) as record:
        agglo_median.fit(X)
    assert not [w.message for w in record]

    assert np.size(np.unique(agglo_mean.labels_)) == n_clusters
    assert np.size(np.unique(agglo_median.labels_)) == n_clusters
    assert np.size(agglo_mean.labels_) == X.shape[1]
    assert np.size(agglo_median.labels_) == X.shape[1]

    # Test transform
    Xt_mean = agglo_mean.transform(X)
    Xt_median = agglo_median.transform(X)
    assert Xt_mean.shape[1] == n_clusters
    assert Xt_median.shape[1] == n_clusters
    assert Xt_mean == np.array([1 / 3.0])
    assert Xt_median == np.array([0.0])

    # Test inverse transform
    X_full_mean = agglo_mean.inverse_transform(Xt_mean)
    X_full_median = agglo_median.inverse_transform(Xt_median)
    assert np.unique(X_full_mean[0]).size == n_clusters
    assert np.unique(X_full_median[0]).size == n_clusters

    assert_array_almost_equal(agglo_mean.transform(X_full_mean), Xt_mean)
    assert_array_almost_equal(agglo_median.transform(X_full_median),
                              Xt_median)
def data_compression(fmri_masked, mask_img, mask_np, output_size):
    """
    fmri_masked : array_like
        A matrix of shape (`N`, `V`) with `N` timepoints and `V` voxels:
        the functional dataset that needs to be reduced.
    mask_img : a nibabel image object of the mask
    mask_np : a numpy array of the mask
    output_size : integer
        The number of elements that the data should be reduced to.
    """
    # Transform nifti files to a data matrix with the NiftiMasker
    import time
    from nilearn import input_data

    datacompressiontime = time.time()
    nifti_masker = input_data.NiftiMasker(mask_img=mask_img,
                                          memory='nilearn_cache',
                                          mask_strategy='background',
                                          memory_level=1,
                                          standardize=False)

    # Perform Ward clustering
    from sklearn.feature_extraction import image
    shape = mask_np.shape
    connectivity = image.grid_to_graph(n_x=shape[0], n_y=shape[1],
                                       n_z=shape[2], mask=mask_np)

    from sklearn.cluster import FeatureAgglomeration
    start = time.time()
    ward = FeatureAgglomeration(n_clusters=output_size,
                                connectivity=connectivity,
                                linkage='ward')
    ward.fit(fmri_masked)
    # print("Ward agglomeration compressing voxels into clusters: %.2fs"
    #       % (time.time() - start))

    labels = ward.labels_

    # print('Extracting reduced Dimension Data')
    data_reduced = ward.transform(fmri_masked)
    fmri_masked = []
    # print('Data compression took ', (time.time() - datacompressiontime),
    #       ' seconds')

    return {'data': data_reduced, 'labels': labels}
def data_compression(fmri_masked, mask_img, mask_np, compression_dim):
    # TODO @AKI update doc
    """
    Compress the masked functional data with Ward feature agglomeration.

    Parameters
    ----------
    fmri_masked : np.ndarray[ndim=2]
        A matrix of shape (`N`, `V`) with `N` timepoints and `V` voxels:
        the functional dataset that needs to be reduced.
    mask_img : an nibabel img object of the mask
    mask_np : a numpy array of the mask
    compression_dim : integer
        The number of elements that the data should be reduced to.

    Returns
    -------
    A dictionary with the fitted compressor, the compressed data and the
    voxel-to-cluster labels.
    """
    from sklearn.feature_extraction import image
    from sklearn.cluster import FeatureAgglomeration

    # Perform Ward clustering
    shape = mask_np.shape
    connectivity = image.grid_to_graph(n_x=shape[0], n_y=shape[1],
                                       n_z=shape[2], mask=mask_np)

    ward = FeatureAgglomeration(n_clusters=compression_dim,
                                connectivity=connectivity,
                                linkage='ward')
    ward.fit(fmri_masked)

    labels = ward.labels_
    data_reduced = ward.transform(fmri_masked)

    return {
        'compressor': ward,
        'compressed': data_reduced,
        'labels': labels,
    }
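A hedged, synthetic usage sketch of data_compression above; the toy volume and dimensions are invented, and since this version of the function only reads mask_np, mask_img is passed as None purely for illustration.

# Toy 4x4x4 "brain" (64 voxels), 20 timepoints, compressed to 10 clusters.
import numpy as np

rng = np.random.RandomState(0)
mask_np = np.ones((4, 4, 4), dtype=bool)          # 64 voxels
fmri_masked = rng.randn(20, mask_np.sum())        # 20 timepoints x 64 voxels

result = data_compression(fmri_masked, None, mask_np, compression_dim=10)
print(result['compressed'].shape)                 # (20, 10)
print(np.unique(result['labels']).size)           # 10 clusters of voxels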
def TrainSGDRegression(df1):
    # generate the equation to use for our design matrices
    eqn = build_eqn(df1, 'regressand',
                    ['any_regressand', 'X25', 'X26', 'X29', 'X13'])
    # build our design matrices
    y, X = dmatrices(eqn, data=df1, return_type='dataframe')

    # employ clustering to reduce our dimensionality
    # (n_clusters is expected to be defined at module level)
    X_reduction = FeatureAgglomeration(n_clusters=n_clusters).fit(X)
    reduced_X = X_reduction.transform(X)

    # standardize our data
    X_scaler = StandardScaler().fit(reduced_X)
    std_X = X_scaler.transform(reduced_X)
    # y_scaler = StandardScaler().fit(y)
    # std_y = y_scaler.transform(y)

    # define our regressor (n_iterations is expected to be defined at module level)
    mod = SGDRegressor(loss='epsilon_insensitive', penalty='elasticnet',
                       alpha=0.0014, epsilon=0.32, n_iter=n_iterations)

    # fit our data
    # res = mod.fit(std_X, np.ravel(std_y))
    res = mod.fit(std_X, np.ravel(y))

    # evaluate our fit
    yp = res.predict(std_X)
    yp = pd.DataFrame({'predicted': yp})
    yp = yp['predicted']
    yt = y['regressand']
    r2 = metrics.r2_score(yt, yp)
    rmse = np.sqrt(metrics.mean_squared_error(yt, yp))

    # save our fitted regressor (the scaler and agglomerator are not pickled)
    with open('SGD_trained_model.pickle', 'wb') as output:
        pickle.dump(res, output, pickle.HIGHEST_PROTOCOL)
    return r2, rmse
def run_evaluation():
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', help="Image folder.", default="faces")
    parser.add_argument('--output', help="Statistics output folder.",
                        default="stats")
    args = parser.parse_args()

    # load embeddings
    emb_1 = load_embeddings('embeddings_matthias.pkl')
    emb_2 = load_embeddings('embeddings_laia.pkl')
    emb_3 = load_embeddings('embeddings_elias.pkl')
    emb_lfw = load_embeddings('embeddings_lfw.pkl')

    if emb_1 is None or emb_2 is None:
        print("--- embeddings could not be loaded. Aborting...")
        return

    # ------------------- EVALUATION ON ORIGINAL VECTORS
    ph = PlotHandler()

    # ==== 1. PCA DIMENSION REDUCTION
    # ph.PlotVarianceContribution(emb_lfw)
    # # reduce dimensionality
    # basis, mean = ExtractSubspace(emb_lfw, 0.999)
    # # dump_to_hd("lfw_99.9_subspace.pkl", (basis, mean))
    # # reduced_data = ProjectOntoSubspace(emb_lfw, mean, basis)
    # ph.SetTitle("Component Variance Contribution on Subspace")
    # ph.PlotVarianceContribution(reduced_data)
    # ph.Show()

    # ==== 2. FEATURE AGGLOMERATION
    agglo = FeatureAgglomeration(n_clusters=20)
    agglo.fit(emb_lfw)
    X_reduced = agglo.transform(emb_1)
    print(np.shape(X_reduced))
def test_random_feature_agglomeration_encoder_load():
    train_data = np.random.rand(2000, input_dim)
    from sklearn.cluster import FeatureAgglomeration
    model = FeatureAgglomeration(n_clusters=target_output_dim)
    filename = 'feature_agglomeration_model.model'
    pickle.dump(model.fit(train_data), open(filename, 'wb'))

    encoder = TransformEncoder(model_path=filename)

    test_data = np.random.rand(10, input_dim)
    encoded_data = encoder.encode(test_data)
    transformed_data = model.transform(test_data)

    assert encoded_data.shape == (test_data.shape[0], target_output_dim)
    assert type(encoded_data) == np.ndarray
    np.testing.assert_almost_equal(transformed_data, encoded_data)

    save_and_load(encoder, False)
    save_and_load_config(encoder, False, train_data)
    rm_files([encoder.save_abspath, encoder.config_abspath, filename])
def TestRFRegression(df1):
    # generate the equation to use for our design matrices
    eqn = build_eqn(df1, 'regressand', ['any_regressand', 'X25', 'X26', 'X29'])
    # build our design matrices
    X = dmatrix(eqn.replace('regressand ~ ', ''), data=df1,
                return_type='dataframe')

    # load our fitted regressor (the feature agglomerator is not persisted by
    # TrainRFRegression, so the reduction is re-fit on the test data below)
    with open('RFR_trained_model.pickle', 'rb') as input:
        res = pickle.load(input)

    # employ clustering to reduce our dimensionality
    X_reduction = FeatureAgglomeration(n_clusters=50).fit(X)
    reduced_X = X_reduction.transform(X)

    # predict the interest rates
    yp = pd.DataFrame({'predicted': res.predict(reduced_X)})
    return yp
data1_X_ica = ica1.fit_transform(data1_X_train)
data1_X_ica_test = ica1.transform(data1_X_test)
ica2 = FastICA(n_components=90)
data2_X_ica = ica2.fit_transform(data2_X_train)
data2_X_ica_test = ica2.transform(data2_X_test)

grp1 = GaussianRandomProjection(n_components=20)
data1_X_grp = grp1.fit_transform(data1_X_train)
data1_X_grp_test = grp1.transform(data1_X_test)
grp2 = GaussianRandomProjection(n_components=90)
data2_X_grp = grp2.fit_transform(data2_X_train)
data2_X_grp_test = grp2.transform(data2_X_test)

fa1 = FeatureAgglomeration(n_clusters=20)
data1_X_fa = fa1.fit_transform(data1_X_train)
data1_X_fa_test = fa1.transform(data1_X_test)
fa2 = FeatureAgglomeration(n_clusters=90)
data2_X_fa = fa2.fit_transform(data2_X_train)
data2_X_fa_test = fa2.transform(data2_X_test)

''' clustering '''
clusters = np.logspace(0.5, 2, num=10, endpoint=True, base=10.0, dtype=None)
for i in range(0, len(clusters)):
    clusters[i] = int(clusters[i])
print(clusters)
from sklearn.linear_model import SGDRegressor
from DataTransformations import *
import numpy as np

df1 = transform_data(sm.load('full_chain_data.pickle'))

base_parameters = {'alpha': [.00001, .0001, .001, .01, .1],
                   'epsilon': [.1, .2, .3],
                   'penalty': ['l2', 'elasticnet'],
                   'loss': ['huber', 'epsilon_insensitive']}

eqn = build_eqn(df1, 'regressand', ['any_regressand', 'X25', 'X26'])
print(eqn)
y, X = dmatrices(eqn, data=df1, return_type='dataframe')
print('design matrices generated')

X_reduction = FeatureAgglomeration(n_clusters=100).fit(X, np.ravel(y))
reduced_X = X_reduction.transform(X)
X_scaler = StandardScaler().fit(reduced_X)
std_X = X_scaler.transform(reduced_X)
y_scaler = StandardScaler().fit(y)
std_y = y_scaler.transform(y)
print(std_y.shape)

# svr = SGDRegressor(n_iter=20, penalty='elasticnet', loss='epsilon_insensitive')
svr = SGDRegressor(n_iter=30, penalty='elasticnet',
                   loss='epsilon_insensitive', alpha=.0014, epsilon=.32)
# parameters = {'alpha': np.arange(.001, .002, .0001),
#               'epsilon': np.arange(.25, .35, .01)}
parameters = {'l1_ratio': np.arange(.1, .6, .02)}
SGD_clf = GridSearchCV(svr, parameters, verbose=True)
SGD_clf.fit(std_X, np.ravel(std_y))
fulldata = np.concatenate((X, test), axis=0)

# agg = FeatureAgglomeration(n_clusters=2000)
# print("fitting")
# agg.fit(fulldata)
# print("transform")
# fulldata_agg = agg.transform(fulldata)

first500 = fulldata[:, :500]
second = fulldata[:, 500:]

agg = FeatureAgglomeration(n_clusters=200)
print("fitting")
agg.fit(first500)
first500_agg = agg.transform(first500)

agg = FeatureAgglomeration(n_clusters=1900)
print("fitting")
agg.fit(second)
second_agg = agg.transform(second)

# new_X = fulldata[:9501, :]
# new_test = fulldata[9501:, :]
new_X = np.concatenate((first500_agg[:9501, :], second_agg[:9501, :]), axis=1)
new_test = np.concatenate((first500_agg[9501:, :], second_agg[9501:, :]), axis=1)

print("saving data...")
np.savetxt("../Data/train_agg_2100.csv", new_X, delimiter=",")
print("saving data...")
def train_and_test(alpha, predictors, predictor_params, x_filename, y_filename, n_users, percTest, featureset_to_use, diff_weighting, phi, force_balanced_classes, do_scaling, optimise_predictors, report, conf_report=None): # all_X = numpy.loadtxt(x_filename, delimiter=",") all_X = numpy.load(x_filename + ".npy") all_y = numpy.loadtxt(y_filename, delimiter=",") print("loaded X and y files", x_filename, y_filename) if numpy.isnan(all_X.any()): print("nan in", x_filename) exit() if numpy.isnan(all_y.any()): print("nan in", y_filename) exit() #print("selecting balanced subsample") print("t t split") X_train, X_test, y_train, y_test = train_test_split(all_X, all_y, test_size=percTest, random_state=666) # feature extraction # test = SelectKBest(score_func=chi2, k=100) # kb = test.fit(X_train, y_train) # # summarize scores # numpy.set_printoptions(precision=3) # print(kb.scores_) # features = kb.transform(X_train) # mask = kb.get_support() # # summarize selected features # print(features.shape) # X_train = X_train[:,mask] # X_test = X_test[:,mask] scaler = StandardScaler() rdim = FeatureAgglomeration(n_clusters=100) if do_scaling: # input(X_train.shape) X_train = rdim.fit_transform(X_train) X_test = rdim.transform(X_test) X_train = scaler.fit_transform(X_train) X_test = scaler.transform(X_test) with open('../../../isaac_data_files/qutor_scaler.pkl', 'wb') as output: pickle.dump(scaler, output, pickle.HIGHEST_PROTOCOL) with open('../../../isaac_data_files/qutor_rdim.pkl', 'wb') as output: pickle.dump(rdim, output, pickle.HIGHEST_PROTOCOL) # print("feature reduction...") # pc = PCA(n_components=100) # X_train = pc.fit_transform(X_train) # X_test = pc.transform(X_test) classes = numpy.unique(y_train) sample_weights = None if (force_balanced_classes): X_train, y_train = balanced_subsample(X_train, y_train, 1.0) #0.118) print("X_train shape:", X_train.shape) print("X_test shape:", X_test.shape) print("tuning classifier ...") for ix, p in enumerate(predictors): print(type(p)) print(p.get_params().keys()) if optimise_predictors == True and len(predictor_params[ix]) > 1: pbest = run_random_search(p, X_train, y_train, predictor_params[ix]) else: pbest = p.fit(X_train, y_train) predictors[ix] = pbest print("pickling classifier ...") for ix, p in enumerate(predictors): p_name = predictor_params[ix]['name'] with open( '../../../isaac_data_files/p_{}_{}_{}.pkl'.format( p_name, alpha, phi), 'wb') as output: pickle.dump(p, output, pickle.HIGHEST_PROTOCOL) print("done!") # report.write("* ** *** |\| \` | | |) /; `|` / |_| *** ** *\n") # report.write("* ** *** | | /_ |^| |) || | \ | | *** ** *\n") #report.write("RUNS,P,FB,WGT,ALPHA,PHI,SCL,0p,0r,0F,0supp,1p,1r,1F,1supp,avg_p,avg_r,avg_F,#samples\n") for ix, p in enumerate(predictors): report.write(",".join( map(str, (all_X.shape[0], str(p).replace(",", ";").replace( "\n", ""), force_balanced_classes, diff_weighting, alpha, phi, do_scaling)))) y_pred_tr = p.predict(X_train) y_pred = p.predict(X_test) # for x,y,yp in zip(X_train, y_test, y_pred): if conf_report: conf_report.write( str(p).replace(",", ";").replace("\n", "") + "\n") conf_report.write(str(alpha) + "," + str(phi) + "\n") conf_report.write(str(confusion_matrix(y_test, y_pred)) + "\n") conf_report.write("\n") # p = precision_score(y_test, y_pred, average=None, labels=classes) # r = recall_score(y_test, y_pred, average=None, labels=classes) # F = f1_score(y_test, y_pred, average=None, labels=classes) p, r, F, s = precision_recall_fscore_support(y_test, y_pred, labels=classes, average=None, 
warn_for=('precision', 'recall', 'f-score')) avp, avr, avF, _ = precision_recall_fscore_support( y_test, y_pred, labels=classes, average='weighted', warn_for=('precision', 'recall', 'f-score')) for ix, c in enumerate(classes): report.write(",{},{},{},{},{},".format(c, p[ix], r[ix], F[ix], s[ix])) report.write("{},{},{},{}\n".format(avp, avr, avF, numpy.sum(s))) # report.write(classification_report(y_test, y_pred)+"\n") # report.write("------END OF CLASSIFIER------\n") report.flush() return X_train, X_test, y_pred_tr, y_pred, y_test, scaler
DT1 = tree.DecisionTreeClassifier(criterion='gini', min_samples_leaf=5,
                                  max_depth=None)

# note: despite the variable names, these are accuracies (fraction correct)
error_rate_train_DT_1 = sum(
    DT1.fit(data1_X_train, data1_y_train).predict(data1_X_train)
    == data1_y_train) * 1.0 / data1_y_train.shape[0]
print("error_rate_train_DT_1", error_rate_train_DT_1)

error_rate_test_DT_1 = sum(
    DT1.fit(data1_X_train, data1_y_train).predict(data1_X_test)
    == data1_y_test) * 1.0 / data1_y_test.shape[0]
print("error_rate_test_DT_1", error_rate_test_DT_1)

for i in range(0, np.shape(data1_X_train)[1]):
    print(i)
    start_time = time.time()
    fa.set_params(n_clusters=i + 1)
    data1_X_train_fa = fa.fit_transform(data1_X_train)
    data1_X_test_fa = fa.transform(data1_X_test)

    error_rate_train_1[i] = sum(
        DT1.fit(data1_X_train_fa, data1_y_train).predict(data1_X_train_fa)
        == data1_y_train) * 1.0 / data1_y_train.shape[0]
    print("error_rate_train_1[%f]" % i, error_rate_train_1[i])

    error_rate_test_1[i] = sum(
        DT1.fit(data1_X_train_fa, data1_y_train).predict(data1_X_test_fa)
        == data1_y_test) * 1.0 / data1_y_test.shape[0]
    print("error_rate_test_1[%f]" % i, error_rate_test_1[i])

    print("time consumed:", time.time() - start_time)

file_2.write("FA_error_rate_train_1")
for i in range(0, len(error_rate_train_1)):
    file_2.write(";")
    file_2.write("%1.9f" % error_rate_train_1[i])
#print(goodness) goods = goods + [goodness] goods = pd.DataFrame(goods) avg = pd.concat([avg,goods],axis=1) print(avg) ''' ''' fa = FeatureAgglomeration(n_clusters=7).fit(X) newdata = fa.fit_transform(X) newdata = pd.DataFrame(newdata) print(X.head(10)) print(newdata.head(10)) ''' fa = FeatureAgglomeration(n_clusters=5).fit(X) newdata = fa.transform(X) recon = fa.inverse_transform(newdata) recon = pd.DataFrame(recon) print(reconError(X, recon)) print(pd.DataFrame(fa.labels_)) print(fa.n_leaves_) print(fa.n_components) print(pd.DataFrame(fa.children_)) ''' #Finds the K that maximizes AR score goods = [] for i in range(2,20): labels = KMeans(n_clusters=i).fit(newdata).labels_ labels_true = Y.tolist() goodness = metrics.adjusted_rand_score(labels_true,labels) goods.append([i,goodness])
""" class sklearn.cluster.FeatureAgglomeration(n_clusters = 2, affinity = "euclidean", memory = None, connectivity = None, compute_full_tree = "auto", linkage = "ward", pooling_func = <function mean>) """ # ======================================================================== # data # ======================================================================== digits = datasets.load_digits() images = digits.images X = np.reshape(images, (len(images), -1)) print(X.shape) # ======================================================================== # 降维 # ======================================================================== agglo = FeatureAgglomeration(n_clusters=32) agglo.fit(X) print(agglo.labels_) print(agglo.n_leaves_) print(agglo.n_components_) print(agglo.children_) X_reduced = agglo.transform(X) print(X_reduced.shape)
for component in range(1, len(X_train[0])+1): grp = GaussianRandomProjection(n_components=component, random_state=1) X_train_reduced = grp.fit_transform(X_train) X_test_reduced = grp.transform(X_test) knn = KNeighborsClassifier(n_neighbors=3) knn.fit(X_train_reduced, y_train) train_scores.append(knn.score(X_train_reduced, y_train)) test_scores.append(knn.score(X_test_reduced, y_test)) if dataset_name=='spam': drawMultiple([train_scores, test_scores], 'KNN Accuracy over Randomized Projected Components (Spam dataset)', 'Number of Projected Components', 'Accuracy', ['projected train score','projected test score'], range(1, len(X_train[0])+1)) elif dataset_name=='letter': drawMultiple([train_scores, test_scores], 'KNN Accuracy over Randomized Projected Components (Letter Recognition dataset)', 'Number of Projected Components', 'Accuracy', ['projected train score','projected test score'], range(1, len(X_train[0])+1)) #FA train_scores=[] test_scores=[] for component in range(1, len(X_train[0])+1): fa = FeatureAgglomeration(n_clusters=component) X_train_reduced = fa.fit_transform(X_train) X_test_reduced = fa.transform(X_test) knn = KNeighborsClassifier(n_neighbors=3) knn.fit(X_train_reduced, y_train) train_scores.append(knn.score(X_train_reduced, y_train)) test_scores.append(knn.score(X_test_reduced, y_test)) if dataset_name=='spam': drawMultiple([train_scores, test_scores], 'KNN Accuracy over Feature Agglomeration Components (Spam dataset)', 'Number of Agglomerated Components', 'Accuracy', ['Agglomerated train score','Agglomerated test score'], range(1, len(X_train[0])+1)) elif dataset_name=='letter': drawMultiple([train_scores, test_scores], 'KNN Accuracy over Feature Agglomeration Components (Letter Recognition dataset)', 'Number of Agglomerated Components', 'Accuracy', ['Agglomerated train score','Agglomerated test score'], range(1, len(X_train[0])+1))
def main(): # Parameters data_directory = '../data/generated-data-r-10-n-8-2/' features_path = '../data/features-generated-data-r-10-n-8-2' booking_file = '../data/booking.csv' users_file = '../data/user.csv' rating_thresholds = [] true_objects_indexes = [0, 1, 2, 3, 4, 5, 6, 7] false_objects_indexes = [8, 9] file_names = os.listdir(data_directory) img_ids_vector = [int(name.split('-')[0]) for name in file_names] ratings_vector = [int(name.split('-')[-2]) for name in file_names] name_vector = [data_directory + name for name in file_names] images_indexes = [name.split('-')[3].split('.')[0] for name in file_names] ratings_matrix, images_indexes_for_id, ids_indexes, users_matrix = load_data( data_directory, booking_file, users_file, rating_thresholds) features = get_features(features_path, name_vector) fa = FeatureAgglomeration(n_clusters=50) fa.fit(features) features = fa.transform(features) scores = [] cv_results_file = './results/bf_real.csv' #ratings_matrix = ratings_matrix[:30, :30] #selection = BasicFactorization(show_selection_results=False, selection_algorithm='random') #selection.transform(ids=img_ids_vector, features=features, ratings=ratings_vector, users_ratings=ratings_matrix, # users=users_matrix, cv_results_file=cv_results_file, images_indexes=images_indexes, # true_objects_indexes=true_objects_indexes, false_objects_indexes=false_objects_indexes, # paths=name_vector, z_score=True) #score, score_rmse = selection.evaluate(evaluation_metric='auc') #scores.append(score) #exit() # K Nearest Neighbors #cv_results_file = './results/cv-generated-data-nr-2-n-02-l-100-knn.csv' scores_auc = [] scores_rmse = [] for i in range(1): cv_results_file = './results/xxp1-cv-generated-data-r-10-n-8-2-random-' + str( i) + '.csv' selection = ObjectSelection(show_selection_results=False, selection_algorithm='random') selection.transform(ids=img_ids_vector, features=features, ratings=ratings_vector, users_ratings=ratings_matrix, users=users_matrix, cv_results_file=cv_results_file, images_indexes=images_indexes, true_objects_indexes=true_objects_indexes, false_objects_indexes=false_objects_indexes, paths=name_vector, z_score=False) selection.evaluate(evaluation_metric='auc') selection.evaluate(evaluation_metric='rmse') print('\n\n-----\n\n') score_auc, score_rmse = selection.evaluate(evaluation_metric='auc') scores_auc.append(score_auc) scores_rmse.append(score_rmse) results_file = './scores/v-generated-data-r-10-n-8-2-random-fa-auc.csv' save_scores(scores_auc, results_file) results_file = './scores/v-generated-data-r-10-n-8-2-random-fa-rmse.csv' save_scores(scores_rmse, results_file) exit() for i in range(10): print() for _ in range(0): selection = ObjectSelection(show_selection_results=False, selection_algorithm='random') # selection.transform(ids=img_ids_vector, features=features, ratings=ratings_vector, users_ratings=ratings_matrix, users=users_matrix, cv_results_file=cv_results_file) selection.transform(ids=img_ids_vector, features=features, ratings=ratings_vector, users_ratings=ratings_matrix, users=users_matrix, cv_results_file=cv_results_file, images_indexes=images_indexes, true_objects_indexes=true_objects_indexes, false_objects_indexes=false_objects_indexes, paths=name_vector, z_score=True) print('\n\n-----\n\n') score_auc, score_rmse = selection.evaluate(evaluation_metric='auc') scores.append(score_auc) for i in range(10): print() for _ in range(10): selection = BasicFactorization(show_selection_results=False, selection_algorithm='random') selection.transform(ids=img_ids_vector, 
features=features, ratings=ratings_vector, users_ratings=ratings_matrix, users=users_matrix, cv_results_file=cv_results_file, images_indexes=images_indexes, true_objects_indexes=true_objects_indexes, false_objects_indexes=false_objects_indexes, paths=name_vector) score = selection.evaluate(evaluation_metric='auc') scores.append(score) exit() # Parameters #data_directory = '../data/experience-6/' #features_path = '../data/features-experience-6' data_directory = '../data/generated-data-r-2-n-8-2/' features_path = '../data/features-generated-data-r-2-n-8-2' booking_file = '../data/booking.csv' users_file = '../data/user.csv' cv_results_file = 'results/cv-generated-data-r-2-n-8-2-x.csv' true_objects_indexes = [0, 1, 2, 3, 4, 5, 6, 7] false_objects_indexes = [8, 9] #file_to_delete = data_directory + '.DS_Store' #os.remove(file_to_delete) file_names = os.listdir(data_directory) img_ids_vector = [int(name.split('-')[0]) for name in file_names] ratings_vector = [int(name.split('-')[-2]) for name in file_names] name_vector = [data_directory + name for name in file_names] images_indexes = [name.split('-')[3].split('.')[0] for name in file_names] rating_thresholds = [1, 2] #rating_thresholds = [] ratings_matrix, images_indexes_for_id, ids_indexes, users_matrix = load_data( data_directory, booking_file, users_file, rating_thresholds, binary=True) features = get_features(features_path, name_vector) cv_results_file = './results/cv-generated-data-r-2-n-8-2-knn-y.csv' selection = ObjectSelection(show_selection_results=False, selection_algorithm='random') selection.transform(ids=img_ids_vector, features=features, ratings=ratings_vector, users_ratings=ratings_matrix, users=users_matrix, cv_results_file=cv_results_file, images_indexes=images_indexes, true_objects_indexes=true_objects_indexes, false_objects_indexes=false_objects_indexes, paths=name_vector, use_user_data=True) selection.evaluate(evaluation_metric='auc') exit() selection = BasicFactorizationNmf(show_selection_results=True, selection_algorithm='random') selection.transform(ids=img_ids_vector, features=features, ratings=ratings_vector, users_ratings=ratings_matrix, users=users_matrix, cv_results_file=cv_results_file, images_indexes=images_indexes, true_objects_indexes=true_objects_indexes, false_objects_indexes=false_objects_indexes, paths=name_vector) selection.evaluate(evaluation_metric='auc')
first_plot = plot_roi(labels_img, mean_func_img, title="Ward parcellation",
                      display_mode='xz')

# labels_img is a Nifti1Image object, it can be saved to file with the
# following code:
labels_img.to_filename('parcellation.nii')

# Display the original data
plot_epi(nifti_masker.inverse_transform(fmri_masked[0]),
         cut_coords=first_plot.cut_coords,
         title='Original (%i voxels)' % fmri_masked.shape[1],
         display_mode='xz')

# A reduced dataset can be created by taking the parcel-level average:
# Note that, as with many objects in scikit-learn, the ward object exposes
# a transform method that modifies input features. Here it reduces their
# dimension
fmri_reduced = ward.transform(fmri_masked)

# Display the corresponding data compressed using the parcellation
fmri_compressed = ward.inverse_transform(fmri_reduced)
compressed_img = nifti_masker.inverse_transform(fmri_compressed[0])

plot_epi(compressed_img, cut_coords=first_plot.cut_coords,
         title='Compressed representation (2000 parcels)',
         display_mode='xz')

plt.show()
        (clases_historial_alim == clase_target_inicial[0]).mean(), 2)
    print(
        '\n\nPercentage of days with a diet inside the nutritional target '
        'class using branded foods:',
        porcentaje_hist_clase_target, '%')

else:
    #############################################################################################
    ##  START OF THE AI ALGORITHM FOR CHOOSING FOODS BASED ON THE RECOMMENDED INTAKE           ##
    #############################################################################################
    feature_agglom = FeatureAgglomeration(n_clusters=cluster_v[-1])
    feature_agglom.fit(nut_data)
    features_reduced = feature_agglom.transform(nut_data)

    aux_features_max = np.max(features_reduced, axis=0)
    aux_features_min = np.min(features_reduced, axis=0)
    target_nut_norm = ((feature_agglom.transform(
        np.expand_dims(target_nut / masa_inicial_comidas_g * 100, axis=0))
        - aux_features_min) / (aux_features_max - aux_features_min))[0]

    pendiente_aprendizaje_v = []
    recompensa_m = []

    for N_alimentos in N_alimentos_v:
        # Create the folder where the figures and data of each food set are
        # saved separately
# Second, we illustrate the effect that the clustering has on the
# signal. We show the original data, and the approximation provided by
# the clustering by averaging the signal on each parcel.
#
# As you can see below, this approximation is very good, although there
# are only 2000 parcels, instead of the original 60000 voxels

# Display the original data
plot_epi(nifti_masker.inverse_transform(fmri_masked[0]),
         cut_coords=cut_coords,
         title='Original (%i voxels)' % fmri_masked.shape[1],
         vmax=fmri_masked.max(), vmin=fmri_masked.min(),
         display_mode='xz')

# A reduced dataset can be created by taking the parcel-level average:
# Note that, as with many objects in scikit-learn, the ward object exposes
# a transform method that modifies input features. Here it reduces their
# dimension
fmri_reduced = ward.transform(fmri_masked)

# Display the corresponding data compressed using the parcellation
fmri_compressed = ward.inverse_transform(fmri_reduced)
compressed_img = nifti_masker.inverse_transform(fmri_compressed[0])

plot_epi(compressed_img, cut_coords=cut_coords,
         title='Compressed representation (2000 parcels)',
         vmax=fmri_masked.max(), vmin=fmri_masked.min(),
         display_mode='xz')

show()