def TrainRFRegression(df1):
    # generate the equation to use for our design matrices
    eqn = build_eqn(df1, 'regressand', ['any_regressand', 'X25', 'X26', 'X29'])
    # build our design matrices
    y, X = dmatrices(eqn, data=df1, return_type='dataframe')
    # employ clustering to reduce our dimensionality
    X_reduction = FeatureAgglomeration(n_clusters=50).fit(X, np.ravel(y))
    reduced_X = X_reduction.transform(X)
    # define our regressor
    mod = RandomForestRegressor(n_estimators=50)
    # fit our data
    res = mod.fit(reduced_X, np.ravel(y))
    # evaluate our fit
    yp = pd.DataFrame({'predicted': res.predict(reduced_X)})
    yp = yp['predicted']
    yt = y['regressand']
    r2 = metrics.r2_score(yt, yp)
    rmse = metrics.mean_squared_error(yt, yp) ** 0.5
    # save our fitted regressor
    with open('RFR_trained_model.pickle', 'wb') as output:
        pickle.dump(res, output, pickle.HIGHEST_PROTOCOL)
    return r2, rmse
def test_feature_agglomoration(self, n, data, corpus, docvecs):
    print("Using feature agglomeration to reduce the matrix's dimensionality...")
    affinities = ["euclidean", "l1", "l2", "manhattan", "cosine", "precomputed"]
    linkages = ["ward", "complete", "average"]
    agglos = []
    for linkage in linkages:
        if linkage == "ward":
            # ward linkage only supports euclidean distances
            agglos.append(FeatureAgglomeration(n_clusters=n, affinity="euclidean", linkage=linkage))
        else:
            for affinity in affinities:
                agglos.append(FeatureAgglomeration(n_clusters=n, affinity=affinity, linkage=linkage))
    for agglo in agglos:
        print(agglo.get_params())
        reduced_vectors = agglo.fit_transform(data)
        clusters_kmeans = self.cluster_kmeans(docvecs, reduced_vectors=reduced_vectors, feature_names=corpus)
        labels_db = self.cluster_dbscan(reduced_vectors)
        #labels_hdb = self.cluster_hdbscan(reduced_vectors)
        clusters_db = self.get_clusters(corpus, labels_db)
        #clusters_hdb = self.get_clusters(corpus, labels_hdb)
    #agglo = FeatureAgglomeration(n_clusters=n, affinity="euclidean", linkage="ward")
    #return agglo.fit_transform(data)
    return
class Regressor(BaseEstimator):
    def __init__(self):
        self.clf = Pipeline([
            ("RF", RandomForestRegressor(n_estimators=200, max_depth=15,
                                         n_jobs=N_JOBS))])
        self.scaler = StandardScaler()
        self.agglo = FeatureAgglomeration(n_clusters=500)

    def fit(self, X, y):
        y = y.ravel()
        n_samples, n_lags, n_lats, n_lons = X.shape
        self.scaler.fit(X[:, -1].reshape(n_samples, -1))
        X = X.reshape(n_lags * n_samples, -1)
        connectivity = grid_to_graph(n_lats, n_lons)
        self.agglo.connectivity = connectivity
        X = self.scaler.transform(X)
        X = self.agglo.fit_transform(X)
        X = X.reshape(n_samples, -1)
        self.clf.fit(X, y)

    def predict(self, X):
        n_samples, n_lags, n_lats, n_lons = X.shape
        X = X.reshape(n_lags * n_samples, -1)
        X = self.scaler.transform(X)
        X = self.agglo.transform(X)
        X = X.reshape(n_samples, -1)
        return self.clf.predict(X)
def FeatureSelection(df, numeric_cols, corrCoefThres=0.9):
    numdf = df[numeric_cols]
    # absolute pairwise correlations between the numeric features
    r_in_x = numdf.corr().abs()
    # convert correlation to a distance: highly correlated features are "close"
    distance_in_x = 1 / r_in_x
    for i in range(r_in_x.shape[0]):
        distance_in_x.iloc[i, i] = 1e10  # effectively infinite self-distance
    cpdist = distance_in_x.copy()
    cpdist = cpdist.fillna(cpdist.max().max())

    from sklearn.cluster import FeatureAgglomeration

    corrcoefmin = corrCoefThres
    fa = FeatureAgglomeration(n_clusters=None, affinity="precomputed",
                              compute_full_tree=True, linkage="average",
                              distance_threshold=1 / corrcoefmin)
    fa.fit(cpdist)
    fadf = pd.DataFrame({"feature": numdf.columns.values, "label": fa.labels_})
    # keep one representative feature per cluster
    selectedFeatures = fadf.groupby("label").head(1)["feature"].values
    return selectedFeatures
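# --- Illustrative usage (added sketch, not from the original source) ---
# A minimal, hedged example of calling the FeatureSelection helper above on a
# toy DataFrame; the column names, data, and threshold are assumptions made
# for the demo only.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
a = rng.normal(size=200)
demo = pd.DataFrame({
    "a": a,
    "b": a + 0.01 * rng.normal(size=200),  # nearly a duplicate of "a"
    "c": rng.normal(size=200),             # independent feature
})
kept = FeatureSelection(demo, ["a", "b", "c"], corrCoefThres=0.9)
print(kept)  # expected: one representative of the {a, b} cluster, plus "c"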
def test_feature_agglomeration():
    n_clusters = 1
    X = np.array([0, 0, 1]).reshape(1, 3)  # (n_samples, n_features)

    agglo_mean = FeatureAgglomeration(n_clusters=n_clusters, pooling_func=np.mean)
    agglo_median = FeatureAgglomeration(n_clusters=n_clusters, pooling_func=np.median)
    assert_no_warnings(agglo_mean.fit, X)
    assert_no_warnings(agglo_median.fit, X)
    assert np.size(np.unique(agglo_mean.labels_)) == n_clusters
    assert np.size(np.unique(agglo_median.labels_)) == n_clusters
    assert np.size(agglo_mean.labels_) == X.shape[1]
    assert np.size(agglo_median.labels_) == X.shape[1]

    # Test transform
    Xt_mean = agglo_mean.transform(X)
    Xt_median = agglo_median.transform(X)
    assert Xt_mean.shape[1] == n_clusters
    assert Xt_median.shape[1] == n_clusters
    assert Xt_mean == np.array([1 / 3.])
    assert Xt_median == np.array([0.])

    # Test inverse transform
    X_full_mean = agglo_mean.inverse_transform(Xt_mean)
    X_full_median = agglo_median.inverse_transform(Xt_median)
    assert np.unique(X_full_mean[0]).size == n_clusters
    assert np.unique(X_full_median[0]).size == n_clusters

    assert_array_almost_equal(agglo_mean.transform(X_full_mean), Xt_mean)
    assert_array_almost_equal(agglo_median.transform(X_full_median), Xt_median)
def feature_agglomeration(voters_data, n, rounding=False):
    featagg = FeatureAgglomeration(n_clusters=n)
    featagg.fit(voters_data)
    condensed = featagg.transform(voters_data)

    # map each original feature to the cluster it was assigned to
    feature_groups_map = dict(zip(voters_data.columns, featagg.labels_))
    feature_groups_nos = []
    for feature_group_key in feature_groups_map:
        feature_groups_nos.append(feature_groups_map[feature_group_key])

    # build a readable label for each cluster by joining its member features
    group_labels = []
    for feature_group_no in set(feature_groups_nos):
        group_label = ""
        for feature_groups_key in feature_groups_map:
            if feature_groups_map[feature_groups_key] == feature_group_no:
                group_label = group_label + feature_groups_key + ", "
        group_labels.append(group_label[0:-2])

    voters_agglomerated = pd.DataFrame(condensed, columns=group_labels,
                                       index=voters_data.index)
    if rounding:
        voters_agglomerated = voters_agglomerated.applymap(lambda x: round(x))

    print("{} features agglomerated into {} hybrid features.".format(
        len(voters_data.columns), len(voters_agglomerated.columns)))
    return voters_agglomerated
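# --- Illustrative usage (added sketch, not from the original source) ---
# Calls the feature_agglomeration() helper above on a small synthetic
# "voters" DataFrame; the column names and n=2 are assumptions for the demo,
# and pandas / FeatureAgglomeration are assumed to be importable for the helper.
import numpy as np
import pandas as pd
from sklearn.cluster import FeatureAgglomeration

rng = np.random.default_rng(42)
voters = pd.DataFrame(rng.integers(0, 2, size=(100, 4)),
                      columns=["issue_a", "issue_b", "issue_c", "issue_d"])
condensed = feature_agglomeration(voters, n=2, rounding=True)
print(condensed.head())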
def cluster(self, cluster_images, cluster_db_path, diffnet_path='model_checkpoints/', save_csv=False):
    compressions = []

    # Finding features
    diffnet = DiffNet(self.db, db_path=self.db_path)
    diffnet.restore(diffnet_path)
    print("Calculating features for", len(cluster_images), "images")
    for img in cluster_images:
        print("Finding features for:", img)
        one_hot = diffnet.feedforward(img, cluster_db_path)
        output = self.sess.run(self.compressed,
                               feed_dict={self.Q: [one_hot], self.keep_prob: 1.})
        compressions.append(output[0])

    # Clustering
    print("Performing clustering...")
    compressions = np.array(compressions)
    fa = FeatureAgglomeration(n_clusters=30)
    X_clusters = fa.fit_transform(compressions)

    print("Collecting data...")
    csv_dict_arr = []
    for i, img in enumerate(cluster_images):
        csv_dict_arr.append({'1.img': img,
                             '2.class': np.argmax(X_clusters[i]),
                             '3.features': compressions[i]})

    # Saving
    if save_csv:
        print("Saving data to csv...")
        keys = load_label_list(csv_dict_arr[0])
        with open('cluster_result.csv', 'w') as output_file:
            dict_writer = csv.DictWriter(output_file, keys, delimiter=';')
            dict_writer.writeheader()
            dict_writer.writerows(csv_dict_arr)

    return csv_dict_arr
def _feature_agglomeration_fit_method(data, n_parcels, connectivity, linkage):
    """Feature agglomeration algorithm to fit on the data.

    Parameters
    ----------
    data : array_like, shape=(n_samples, n_voxels)
        Masked subjects data.

    n_parcels : int
        Number of parcels to parcellate.

    connectivity : ndarray
        Connectivity matrix. Defines for each feature the neighbouring
        features following a given structure of the data.

    linkage : str
        Which linkage criterion to use: 'ward', 'complete' or 'average'.

    Returns
    -------
    labels : ndarray
        Labels to the data.
    """
    ward = FeatureAgglomeration(n_clusters=n_parcels,
                                connectivity=connectivity, linkage=linkage)
    ward.fit(data)
    return ward.labels_
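# --- Illustrative usage (added sketch, not from the original source) ---
# One plausible way to call _feature_agglomeration_fit_method above on
# synthetic data, with grid_to_graph providing the connectivity matrix;
# the 8x8 grid and 10 parcels are arbitrary choices for the demo.
import numpy as np
from sklearn.cluster import FeatureAgglomeration
from sklearn.feature_extraction.image import grid_to_graph

rng = np.random.RandomState(0)
demo_data = rng.randn(20, 8 * 8)                 # 20 samples, 64 "voxels"
demo_connectivity = grid_to_graph(n_x=8, n_y=8)  # spatial neighbourhood of the grid
demo_labels = _feature_agglomeration_fit_method(demo_data, n_parcels=10,
                                                connectivity=demo_connectivity,
                                                linkage='ward')
print(np.unique(demo_labels))  # 10 distinct parcel labels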
def TestSGDRegression(df1, n_clusters=50):
    # n_clusters must match the value used when the SGD model was trained
    # generate the equation to use for our design matrices
    eqn = build_eqn(df1, 'regressand', ['any_regressand', 'X25', 'X26', 'X29', 'X13'])
    # build our design matrices
    X = dmatrix(eqn.replace('regressand ~ ', '0+'), data=df1, return_type='dataframe')
    # load our trained SGD model
    with open('SGD_trained_model.pickle', 'rb') as input:
        res = pickle.load(input)
    # employ clustering to reduce our dimensionality
    X_reduction = FeatureAgglomeration(n_clusters=n_clusters).fit(X)
    reduced_X = X_reduction.transform(X)
    # standardize our data
    X_scaler = StandardScaler().fit(reduced_X)
    std_X = X_scaler.transform(reduced_X)
    # predict the interest rates
    yp = res.predict(std_X)
    return yp
def feature_agglomeration(X, args={}):
    """Cluster the features with hierarchical (agglomerative) clustering,
    then use the clustering to reduce the feature dimensionality."""
    from sklearn.cluster import FeatureAgglomeration
    fam = FeatureAgglomeration(**args)
    fam.fit(X)
    return fam
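# --- Illustrative usage (added sketch, not from the original source) ---
# Runs the feature_agglomeration() wrapper above on random data; n_clusters=5
# is an arbitrary choice for the demo.
import numpy as np

X_demo = np.random.RandomState(0).randn(100, 30)
fam = feature_agglomeration(X_demo, args={"n_clusters": 5})
print(fam.transform(X_demo).shape)  # (100, 5)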
class FeatureAgglomerationDecomposer(Transformer):
    type = 11

    def __init__(self, n_clusters=2, affinity='euclidean', linkage='ward',
                 pooling_func='mean', random_state=1):
        super().__init__("feature_agglomeration_decomposer")
        self.input_type = [NUMERICAL, DISCRETE, CATEGORICAL]
        self.compound_mode = 'only_new'
        self.output_type = NUMERICAL

        self.n_clusters = n_clusters
        self.affinity = affinity
        self.linkage = linkage
        self.pooling_func = pooling_func
        self.random_state = random_state

        self.pooling_func_mapping = dict(mean=np.mean, median=np.median, max=np.max)

    @ease_trans
    def operate(self, input_datanode, target_fields=None):
        from sklearn.cluster import FeatureAgglomeration

        X, y = input_datanode.data

        if self.model is None:
            self.n_clusters = int(self.n_clusters)
            n_clusters = min(self.n_clusters, X.shape[1])
            if not callable(self.pooling_func):
                self.pooling_func = self.pooling_func_mapping[self.pooling_func]

            self.model = FeatureAgglomeration(
                n_clusters=n_clusters, affinity=self.affinity,
                linkage=self.linkage, pooling_func=self.pooling_func)
            self.model.fit(X)

        X_new = self.model.transform(X)
        return X_new

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None, optimizer='smac'):
        cs = ConfigurationSpace()
        n_clusters = UniformIntegerHyperparameter("n_clusters", 2, 400, default_value=25)
        affinity = CategoricalHyperparameter(
            "affinity", ["euclidean", "manhattan", "cosine"], default_value="euclidean")
        linkage = CategoricalHyperparameter(
            "linkage", ["ward", "complete", "average"], default_value="ward")
        pooling_func = CategoricalHyperparameter(
            "pooling_func", ["mean", "median", "max"], default_value="mean")
        cs.add_hyperparameters([n_clusters, affinity, linkage, pooling_func])

        # ward linkage is only valid with euclidean affinity
        affinity_and_linkage = ForbiddenAndConjunction(
            ForbiddenInClause(affinity, ["manhattan", "cosine"]),
            ForbiddenEqualsClause(linkage, "ward"))
        cs.add_forbidden_clause(affinity_and_linkage)
        return cs
def untangle(X: Iterable, y: Iterable, n_clusters: int = None,
             get_connectivity: bool = True, compute_distances: bool = True,
             kind: str = 'correlation',
             agglo_kws: Union[dict, Bunch] = None) -> FeatureAgglomeration:
    from nilearn.connectome import ConnectivityMeasure as CM
    from sklearn.cluster import FeatureAgglomeration
    from sklearn.covariance import LedoitWolf
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import mutual_info_classif

    agglo_defs = dict(affinity='euclidean', compute_full_tree='auto',
                      linkage='ward', pooling_func=np.mean,
                      distance_threshold=None,
                      compute_distances=compute_distances)

    if get_connectivity is True:
        connect_mat = CM(LedoitWolf(), kind=kind).fit_transform([X.values])[0]
    else:
        connect_mat = None

    if n_clusters is None:
        n_clusters = divmod(X.shape[1], 2)[0] - 1
    if n_clusters == 0:
        n_clusters = 1

    if agglo_kws is None:
        agglo_kws = {}
    agglo_defs.update(agglo_kws)

    agglo = FeatureAgglomeration(n_clusters=n_clusters,
                                 connectivity=connect_mat,
                                 **agglo_defs)

    if not isinstance(y, pd.Series):
        y = pd.Series(y)
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)

    agglo.fit(X, y)
    setattr(
        agglo, 'cluster_indexes_',
        pd.DataFrame(zip(agglo.labels_, agglo.feature_names_in_),
                     columns=['cluster', 'feature']).groupby('cluster').feature)

    skb = SelectKBest(k=1, score_func=mutual_info_classif)
    factor_leaders_ = [
        skb.fit(X[itm[1]], y).get_feature_names_out()[0]
        for itm in tuple(agglo.cluster_indexes_)
    ]
    setattr(agglo, 'factor_leaders_', factor_leaders_)
    return agglo
def featureagglomeration(data_train, data_test, label_train, label_test, args):
    print('feature agglomeration')
    FA = FeatureAgglomeration(n_clusters=10).fit(data_train)
    transformation = FA.transform(data_test)
    agglomeration = find_highest(transformation)
    print('feature agglomeration done')
    compare_class(agglomeration, label_test)
    if args.create_mean:
        create_images_from_rows('fa', mean_image(agglomeration, data_test))
def get_encoder(metas, train_data, target_output_dim):
    tmpdir = metas['workspace']
    model_path = os.path.join(tmpdir, 'feature_agglomeration.model')
    model = FeatureAgglomeration(n_clusters=target_output_dim)
    model.fit(train_data)
    pickle.dump(model, open(model_path, 'wb'))
    return FeatureAgglomerationEncoder(model_path=model_path)
def do_feature_agglomoration(self, data):
    print("Using feature agglomeration to reduce the matrix's dimensionality...")
    if self.k:
        n = self.k
    else:
        n = 20
    agglo = FeatureAgglomeration(n_clusters=n, affinity="cosine", linkage="complete")
    return agglo.fit_transform(data)
def token_cluster(self, n_clusters=300):
    from scipy import sparse
    from sklearn.cluster import FeatureAgglomeration

    # use the n_clusters argument rather than a hard-coded value
    FA = FeatureAgglomeration(n_clusters=n_clusters)
    self.bow_corpus = FA.fit_transform(self.bow_corpus)
    self.bow_corpus = sparse.csr_matrix(self.bow_corpus)
def get_clusters(X: pd.DataFrame, n_clusters: int):
    clt = FeatureAgglomeration(n_clusters=n_clusters)
    clt.fit(X)
    clusters = []
    for i in range(n_clusters):
        clusters.append(X.columns[clt.labels_ == i].tolist())
    return clusters  # list of lists of column names, one list per cluster
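# --- Illustrative usage (added sketch, not from the original source) ---
# Groups the columns of a toy DataFrame with get_clusters() above; the column
# names and n_clusters=2 are assumptions made for the demo.
import numpy as np
import pandas as pd
from sklearn.cluster import FeatureAgglomeration

rng = np.random.default_rng(1)
base = rng.normal(size=150)
df_demo = pd.DataFrame({
    "x1": base + 0.05 * rng.normal(size=150),
    "x2": base + 0.05 * rng.normal(size=150),
    "x3": rng.normal(size=150),
})
print(get_clusters(df_demo, n_clusters=2))
# likely output: [['x1', 'x2'], ['x3']] (cluster order may vary)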
def dim_reduction_FA(data, distance_threshold=0.45):
    """
    Params:
        data: ndarray of shape (n_samples, n_features)
        distance_threshold: optimal threshold value for the similarity measure

    Returns:
        (reducedDimData, nReducedComponents)
    """
    agglo = FeatureAgglomeration(n_clusters=None,
                                 distance_threshold=distance_threshold)
    reducedDimData = agglo.fit_transform(data)
    return reducedDimData, agglo.n_clusters_
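# --- Illustrative usage (added sketch, not from the original source) ---
# Runs dim_reduction_FA() above on standardized random data; the threshold of
# 25.0 is an arbitrary value for the demo, since a useful threshold depends on
# the scale and correlation structure of the data.
import numpy as np
from sklearn.cluster import FeatureAgglomeration
from sklearn.preprocessing import StandardScaler

X_demo = StandardScaler().fit_transform(np.random.RandomState(0).randn(200, 12))
X_red, n_components = dim_reduction_FA(X_demo, distance_threshold=25.0)
print(X_red.shape, n_components)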
def apply_feature_agglomeration(table, features, label, n_components):
    from sklearn.cluster import FeatureAgglomeration
    from paje import feature_file_processor

    x, y = feature_file_processor.split_features_target(table, features, label)
    fa = FeatureAgglomeration(n_clusters=n_components, linkage='ward')
    pc = fa.fit_transform(x)

    return feature_file_processor.generate_data_frame(pc, table[[label]])
def test_feature_agglomeration_feature_names_out():
    """Check `get_feature_names_out` for `FeatureAgglomeration`."""
    X, _ = make_blobs(n_features=6, random_state=0)
    agglo = FeatureAgglomeration(n_clusters=3)
    agglo.fit(X)
    n_clusters = agglo.n_clusters_

    names_out = agglo.get_feature_names_out()
    assert_array_equal(
        [f"featureagglomeration{i}" for i in range(n_clusters)], names_out
    )
def main():
    # Parameters
    data_directory = '../../data/generated-data-r-10-n-6-4/'
    features_path = '../../data/features-generated-data-r-10-n-6-4'
    booking_file = '../../data/booking.csv'
    users_file = '../../data/user.csv'
    rating_thresholds = []
    true_objects_indexes = [0, 1, 2, 3, 4, 5]
    false_objects_indexes = [6, 7, 8, 9]

    file_names = os.listdir(data_directory)
    img_ids_vector = [int(name.split('-')[0]) for name in file_names]
    ratings_vector = [int(name.split('-')[-2]) for name in file_names]
    name_vector = [data_directory + name for name in file_names]
    images_indexes = [name.split('-')[3].split('.')[0] for name in file_names]

    ratings_matrix, images_indexes_for_id, ids_indexes, users_matrix = load_data(
        data_directory, booking_file, users_file, rating_thresholds)

    features = get_features(features_path, name_vector)

    fa = FeatureAgglomeration(n_clusters=50)
    fa.fit(features)
    features = fa.transform(features)

    scores_auc = []
    scores_rmse = []
    for i in range(10):
        cv_results_file = '../results/cv-generated-data-r-10-n-6-4-rf-fa-' + str(i) + '.csv'
        selection = ObjectSelection(show_selection_results=False, selection_algorithm='rf')
        selection.transform(ids=img_ids_vector, features=features, ratings=ratings_vector,
                            users_ratings=ratings_matrix, users=users_matrix,
                            cv_results_file=cv_results_file, images_indexes=images_indexes,
                            true_objects_indexes=true_objects_indexes,
                            false_objects_indexes=false_objects_indexes,
                            paths=name_vector, z_score=False)
        selection.evaluate(evaluation_metric='auc')
        selection.evaluate(evaluation_metric='rmse')
        print('\n\n-----\n\n')
        score_auc, score_rmse = selection.evaluate(evaluation_metric='auc')
        scores_auc.append(score_auc)
        scores_rmse.append(score_rmse)

    results_file = '../scores/generated-data-r-10-n-6-4-rf-fa-auc.csv'
    save_scores(scores_auc, results_file)
    results_file = '../scores/generated-data-r-10-n-6-4-rf-fa-rmse.csv'
    save_scores(scores_rmse, results_file)
def getDR(dt_all, n_comp=12):
    # cols
    cols_encode_label = dt_all.filter(regex="Encode_Label").columns.values.tolist()
    cols_cat = dt_all.drop("ID", axis=1).select_dtypes(include=["object"]).columns.tolist()

    # standardize
    dt_all_norm = MinMaxScaler().fit_transform(
        dt_all.drop(["y", "Fold"] + cols_cat + cols_encode_label, axis=1))

    # tSVD
    tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
    tsvd_results = tsvd.fit_transform(dt_all_norm)

    # PCA
    pca = PCA(n_components=n_comp, random_state=420)
    pca_results = pca.fit_transform(dt_all_norm)

    # ICA
    ica = FastICA(n_components=n_comp, max_iter=5000, random_state=420)
    ica_results = ica.fit_transform(dt_all_norm)

    # GRP
    grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
    grp_results = grp.fit_transform(dt_all_norm)

    # SRP
    srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
    srp_results = srp.fit_transform(dt_all_norm)

    # NMF
    nmf = NMF(n_components=n_comp, init='nndsvdar', random_state=420)
    nmf_results = nmf.fit_transform(dt_all_norm)

    # FeatureAgglomeration (FAG)
    fag = FeatureAgglomeration(n_clusters=n_comp, linkage='ward')
    fag_results = fag.fit_transform(dt_all_norm)

    # Append decomposition components to datasets
    for i in range(1, n_comp + 1):
        dt_all['DR_TSVD_' + str(i)] = tsvd_results[:, i - 1]
        dt_all['DR_PCA_' + str(i)] = pca_results[:, i - 1]
        dt_all['DR_ICA_' + str(i)] = ica_results[:, i - 1]
        dt_all['DR_GRP_' + str(i)] = grp_results[:, i - 1]
        dt_all['DR_SRP_' + str(i)] = srp_results[:, i - 1]
        dt_all['DR_NMF_' + str(i)] = nmf_results[:, i - 1]
        dt_all['DR_FAG_' + str(i)] = fag_results[:, i - 1]

    return dt_all
def setup(self, keywords={}):
    """
    Setup the algorithms
    """
    for p in keywords.keys():
        setattr(self, p, keywords[p])

    if self.method == "agglomerative":
        self.obj = AgglomerativeClustering(n_clusters=self.n_clusters,
                                           linkage=self.linkage,
                                           affinity=self.affinity)
    if self.method == "feature":
        self.obj = FeatureAgglomeration(n_clusters=self.n_clusters,
                                        linkage=self.linkage,
                                        affinity=self.affinity,
                                        distance_threshold=self.distance_threshold)
    return
def variable_clustering(self, X_cat, woe_iv_df, n_clusters=15):
    X_transformed = mt.BinWoe().transform_x_all(X_cat, woe_iv_df)
    agglo = FeatureAgglomeration(n_clusters=n_clusters)
    # fit on a subsample to keep the clustering tractable
    if len(X_transformed) > 20000:
        X_agglo = X_transformed.sample(20000)
    else:
        X_agglo = X_transformed.copy()
    agglo.fit(X_agglo)
    vars_clusters = (pd.DataFrame(data={'feature': X_transformed.columns.tolist(),
                                        'cluster': list(agglo.labels_)})
                     .sort_values('cluster'))
    return vars_clusters, X_transformed
def cont_feature_clusters_sklearn(self, n_clusters=5):
    """
    Cluster continuous features with scikit-learn's FeatureAgglomeration.

    This only works for continuous variables. Eventually expand this to
    categorical variables using a Cramer's V covariance matrix, similar to
    the R approach based on the iclust package.
    """
    # Import the library
    from sklearn.cluster import FeatureAgglomeration

    Cluster = FeatureAgglomeration(n_clusters=n_clusters)
    Cluster.fit(self._dataset.iloc[:, self._cont_index_predictors])

    df = pd.DataFrame({'Variable': self._dataset.columns[self._cont_index_predictors],
                       'Cluster': Cluster.labels_})
    return df.sort_values(by='Cluster')
def fa_dim_red(x_train_scaled, dataset_name, features_num=2):
    z = 0
    losses = []
    # reconstruction loss for every possible number of clusters
    for k in range(1, x_train_scaled.shape[1] + 1):
        fa = FeatureAgglomeration(n_clusters=k)
        fa_result = fa.fit_transform(x_train_scaled)
        x_projected_fa = fa.inverse_transform(fa_result)
        loss = ((x_train_scaled - x_projected_fa) ** 2).mean()
        losses.append(loss)

    np_feature_losses_percent = np.multiply(100, losses / np.sum(losses))
    print('num of clusters < 10% loss')
    for i in range(len(np_feature_losses_percent)):
        z = z + np_feature_losses_percent[i]
        if z > 90:
            print(i + 1)
            break
    print(np_feature_losses_percent)

    plt.bar(list(range(1, len(np_feature_losses_percent) + 1)), np_feature_losses_percent)
    plt.title("FeatureAgglomeration Projection Losses % (" + str(dataset_name) + ")")
    plt.ylabel("Mean Squared Error (% of Total)")
    plt.xlabel("Features")
    plt.savefig(str(dataset_name) + ' fa analysis.png')
    plt.show()

    # final reduction with the requested number of clusters
    fa = FeatureAgglomeration(n_clusters=features_num)
    fa_result = fa.fit_transform(x_train_scaled)
    print(fa_result.shape)
    x_projected_fa = fa.inverse_transform(fa_result)
    print(x_projected_fa.shape)
    print(x_train_scaled.shape)
    loss = ((x_train_scaled - x_projected_fa) ** 2).mean()
    print('loss')
    print(loss)
    return fa_result, x_projected_fa
def _ward_fit_transform(all_subjects_data, fit_samples_indices, connectivity,
                        n_parcels, offset_labels):
    """Ward clustering algorithm on a subsample and apply to the whole dataset.

    Computes a brain parcellation using Ward's clustering algorithm on some
    images, then averages the signal within parcels in order to reduce the
    dimension of the images of the whole dataset. This function is used with
    Randomized Parcellation Based Inference, so we need to save the labels to
    further perform the inverse transformation operation. The function
    therefore needs an offset to be applied on the labels so that they are
    unique across parcellations.

    Parameters
    ----------
    all_subjects_data : array_like, shape=(n_samples, n_voxels)
        Masked subject images as an array.

    fit_samples_indices : array-like
        Indices of the samples used to compute the parcellation.

    connectivity : scipy.sparse.coo_matrix
        Graph representing the spatial structure of the images
        (i.e. connections between voxels).

    n_parcels : int
        Number of parcels for the parcellations.

    offset_labels : int
        Offset for labels numbering. The purpose is to have different labels
        in all the parcellations that can be built by multiple calls to the
        current function.

    Returns
    -------
    parcelled_data : numpy.ndarray, shape=(n_samples, n_parcels)
        Average signal within each parcel for each subject.

    labels : np.ndarray, shape=(n_voxels,)
        Labels giving the correspondence between voxels and parcels.
    """
    # fit part
    data_fit = all_subjects_data[fit_samples_indices]
    ward = FeatureAgglomeration(n_clusters=n_parcels, connectivity=connectivity)
    ward.fit(data_fit)

    # transform part
    labels = ward.labels_ + offset_labels  # unique labels across parcellations
    parcelled_data = ward.transform(all_subjects_data)

    return parcelled_data, labels
def test_feature_agglomeration():
    n_clusters = 1
    X = np.array([0, 0, 1]).reshape(1, 3)  # (n_samples, n_features)

    agglo_mean = FeatureAgglomeration(n_clusters=n_clusters, pooling_func=np.mean)
    agglo_median = FeatureAgglomeration(n_clusters=n_clusters, pooling_func=np.median)
    assert_no_warnings(agglo_mean.fit, X)
    assert_no_warnings(agglo_median.fit, X)
    assert_true(np.size(np.unique(agglo_mean.labels_)) == n_clusters)
    assert_true(np.size(np.unique(agglo_median.labels_)) == n_clusters)
    assert_true(np.size(agglo_mean.labels_) == X.shape[1])
    assert_true(np.size(agglo_median.labels_) == X.shape[1])

    # Test transform
    Xt_mean = agglo_mean.transform(X)
    Xt_median = agglo_median.transform(X)
    assert_true(Xt_mean.shape[1] == n_clusters)
    assert_true(Xt_median.shape[1] == n_clusters)
    assert_true(Xt_mean == np.array([1 / 3.]))
    assert_true(Xt_median == np.array([0.]))

    # Test inverse transform
    X_full_mean = agglo_mean.inverse_transform(Xt_mean)
    X_full_median = agglo_median.inverse_transform(Xt_median)
    assert_true(np.unique(X_full_mean[0]).size == n_clusters)
    assert_true(np.unique(X_full_median[0]).size == n_clusters)

    assert_array_almost_equal(agglo_mean.transform(X_full_mean), Xt_mean)
    assert_array_almost_equal(agglo_median.transform(X_full_median), Xt_median)
def cluster_sentences(sentences, nb_of_clusters=5):
    tfidf_vectorizer = TfidfVectorizer(tokenizer=word_tokenizer,
                                       stop_words=stopwords.words('english'),
                                       max_df=0.9,
                                       min_df=0.05,
                                       lowercase=True)
    # builds a tf-idf matrix for the sentences
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences).todense()
    # note: FeatureAgglomeration groups the tf-idf features (columns), so the
    # labels below index vocabulary terms rather than sentences
    agglo = FeatureAgglomeration(n_clusters=nb_of_clusters)
    agglo.fit(tfidf_matrix)
    clusters = collections.defaultdict(list)
    for i, label in enumerate(agglo.labels_):
        clusters[label].append(i)
    return dict(clusters)
def data_compression(fmri_masked, mask_img, mask_np, output_size):
    """
    data : array_like
        A matrix of shape (`V`, `N`) with `V` voxels and `N` timepoints.
        The functional dataset that needs to be reduced.
    mask : a numpy array of the mask
    output_size : integer
        The number of elements that the data should be reduced to.
    """
    # Transform nifti files to a data matrix with the NiftiMasker
    import time
    from nilearn import input_data

    datacompressiontime = time.time()
    nifti_masker = input_data.NiftiMasker(mask_img=mask_img,
                                          memory='nilearn_cache',
                                          mask_strategy='background',
                                          memory_level=1,
                                          standardize=False)

    # Perform Ward clustering
    from sklearn.feature_extraction import image
    shape = mask_np.shape
    connectivity = image.grid_to_graph(n_x=shape[0], n_y=shape[1],
                                       n_z=shape[2], mask=mask_np)

    from sklearn.cluster import FeatureAgglomeration
    start = time.time()
    ward = FeatureAgglomeration(n_clusters=output_size,
                                connectivity=connectivity, linkage='ward')
    ward.fit(fmri_masked)
    #print("Ward agglomeration compressing voxels into clusters: %.2fs" % (time.time() - start))

    labels = ward.labels_

    #print('Extracting reduced Dimension Data')
    data_reduced = ward.transform(fmri_masked)
    fmri_masked = []
    #print('Data compression took ', (time.time() - datacompressiontime), ' seconds')

    return {'data': data_reduced, 'labels': labels}
def test_linkage_misc():
    # Misc tests on linkage
    rng = np.random.RandomState(42)
    X = rng.normal(size=(5, 5))
    with pytest.raises(ValueError):
        AgglomerativeClustering(linkage='foo').fit(X)

    with pytest.raises(ValueError):
        linkage_tree(X, linkage='foo')

    with pytest.raises(ValueError):
        linkage_tree(X, connectivity=np.ones((4, 4)))

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    # test hierarchical clustering on a precomputed distances matrix
    dis = cosine_distances(X)
    res = linkage_tree(dis, affinity="precomputed")
    assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])

    # test hierarchical clustering on a precomputed distances matrix
    res = linkage_tree(X, affinity=manhattan_distances)
    assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0])
def comput_coefs(self, X, y, size):
    cv = KFold(2)  # cross-validation generator for model selection
    ridge = BayesianRidge()
    cachedir = tempfile.mkdtemp()
    mem = Memory(cachedir=cachedir, verbose=1)

    # Ward agglomeration followed by BayesianRidge
    connectivity = grid_to_graph(n_x=size, n_y=size)
    ward = FeatureAgglomeration(n_clusters=10, connectivity=connectivity,
                                memory=mem)
    clf = Pipeline([('ward', ward), ('ridge', ridge)])
    # Select the optimal number of parcels with grid search
    clf = GridSearchCV(clf, {'ward__n_clusters': [10, 20, 30]}, n_jobs=1, cv=cv)
    clf.fit(X, y)  # set the best parameters
    coef_ = clf.best_estimator_.steps[-1][1].coef_
    coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_)
    coef_agglomeration_ = coef_.reshape(size, size)

    # Anova univariate feature selection followed by BayesianRidge
    f_regression = mem.cache(feature_selection.f_regression)  # caching function
    anova = feature_selection.SelectPercentile(f_regression)
    clf = Pipeline([('anova', anova), ('ridge', ridge)])
    # Select the optimal percentage of features with grid search
    clf = GridSearchCV(clf, {'anova__percentile': [5, 10, 20]}, cv=cv)
    clf.fit(X, y)  # set the best parameters
    coef_ = clf.best_estimator_.steps[-1][1].coef_
    coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_.reshape(1, -1))
    coef_selection_ = coef_.reshape(size, size)

    return dict(
        coef_selection_=coef_selection_,
        coef_agglomeration_=coef_agglomeration_,
        cachedir=cachedir
    )
def createPipe(embed, classif, nmca, aggregation, nsubs):
    # Dimension Reduction
    n_comp = 20 if nsubs > 70 else 15
    if embed == "pca":
        emb = ('pca', PCA(n_components=n_comp))
    else:
        emb = ('fa', FeatureAgglomeration(n_clusters=n_comp))

    # Classifiers
    neib = int(nmca * nsubs * 0.1) if aggregation == "mega" else int(nsubs * 0.1)
    clfs = {
        'svc': ('svc', SVC(class_weight="balanced", probability=True, max_iter=1e6)),
        'knn': ('knn', KNeighborsClassifier(n_neighbors=neib)),
        'rfc': ('rfc', RandomForestClassifier(class_weight="balanced")),
        'ada': ('ada', AdaBoostClassifier()),
        'lrc': ('lrc', LogisticRegression(class_weight="balanced",
                                          solver='liblinear', max_iter=1e6))
    }

    pipe = Pipeline(steps=[emb, clfs[classif]])
    return pipe
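# --- Illustrative usage (added sketch, not from the original source) ---
# Builds one pipeline with createPipe() above; the argument values are
# arbitrary choices for the demo, and the estimators referenced by createPipe
# are assumed to be imported at module level as below.
from sklearn.decomposition import PCA
from sklearn.cluster import FeatureAgglomeration
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

demo_pipe = createPipe(embed="fa", classif="svc", nmca=2, aggregation="mega", nsubs=80)
print(demo_pipe)  # FeatureAgglomeration(n_clusters=20) followed by SVC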
def test_ward_agglomeration():
    """
    Check that we obtain the correct solution in a simplistic case
    """
    rnd = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=np.bool)
    X = rnd.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    assert_warns(DeprecationWarning, WardAgglomeration)

    with warnings.catch_warnings(record=True) as warning_list:
        warnings.simplefilter("always", DeprecationWarning)
        if hasattr(np, 'VisibleDeprecationWarning'):
            # Let's not catch the numpy internal DeprecationWarnings
            warnings.simplefilter('ignore', np.VisibleDeprecationWarning)
        ward = WardAgglomeration(n_clusters=5, connectivity=connectivity)
        ward.fit(X)
    assert_equal(len(warning_list), 1)

    agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity)
    agglo.fit(X)
    assert_array_equal(agglo.labels_, ward.labels_)
    assert_true(np.size(np.unique(agglo.labels_)) == 5)

    X_red = agglo.transform(X)
    assert_true(X_red.shape[1] == 5)
    X_full = agglo.inverse_transform(X_red)
    assert_true(np.unique(X_full[0]).size == 5)
    assert_array_almost_equal(agglo.transform(X_full), X_red)

    # Check that fitting with no samples raises a ValueError
    assert_raises(ValueError, agglo.fit, X[:0])
def test_ward_agglomeration():
    """
    Check that we obtain the correct solution in a simplistic case
    """
    rnd = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=np.bool)
    X = rnd.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    assert_warns(DeprecationWarning, WardAgglomeration)

    with ignore_warnings():
        ward = WardAgglomeration(n_clusters=5, connectivity=connectivity)
        ward.fit(X)

    agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity)
    agglo.fit(X)
    assert_array_equal(agglo.labels_, ward.labels_)
    assert_true(np.size(np.unique(agglo.labels_)) == 5)

    X_red = agglo.transform(X)
    assert_true(X_red.shape[1] == 5)
    X_full = agglo.inverse_transform(X_red)
    assert_true(np.unique(X_full[0]).size == 5)
    assert_array_almost_equal(agglo.transform(X_full), X_red)

    # Check that fitting with no samples raises a ValueError
    assert_raises(ValueError, agglo.fit, X[:0])
def select_train_predict(X, Y, Z, feature_list, selection_method, estimator_method,
                         n_features, selection_args, estimator_args):
    W = []
    features = []

    if selection_method != '2step_kbest':
        n_features = min(n_features, len(feature_list))

    if estimator_method == 'svm' and selection_method == 'rfe':
        estimator_args['kernel'] = 'linear'

    estimator = ESTIMATORS[estimator_method](**estimator_args)

    if selection_method == 'cluster':
        agglom = FeatureAgglomeration(n_clusters=n_features, affinity='cosine',
                                      linkage='average')
        clusters = agglom.fit_predict(X).tolist()
        # keep the first column of each cluster as its representative
        sample = [clusters.index(i) for i in range(n_features)]
        X = X[:, sample]
        Z = Z[:, sample]
        selection_method = None

    if selection_method is None:
        for i, y in enumerate(Y):
            estimator.fit(X, y)
            w = estimator.predict(Z)
            W.append(w)
            if (i + 1) % (len(Y) // 10) == 0:
                print('.', end=' ')

    if selection_method == 'rfe':
        selector = RFE(estimator=estimator, n_features_to_select=n_features,
                       **selection_args)
        for i, y in enumerate(Y):
            selector = selector.fit(X, y)
            features.append(feature_list[selector.support_])
            w = selector.predict(Z)
            W.append(w)
            if (i + 1) % (len(Y) // 10) == 0:
                print('.', end=' ')

    if selection_method == 'myrfe':
        selector = MyRFE(estimator=estimator, n_features=n_features,
                         **selection_args)
        for i, y in enumerate(Y):
            selector.fit(X, y)
            features.append(feature_list[selector.support])
            w = selector.predict(Z)
            W.append(w)
            if (i + 1) % (len(Y) // 10) == 0:
                print('.', end=' ')

    if selection_method == 'kbest':
        selector = SelectKBest(f_regression, k=n_features, **selection_args)
        for i, y in enumerate(Y):
            X2 = selector.fit_transform(X, y)
            Z2 = selector.transform(Z)
            features.append(feature_list[selector.get_support()])
            estimator.fit(X2, y)
            w = estimator.predict(Z2)
            W.append(w)
            if (i + 1) % (len(Y) // 10) == 0:
                print('.', end=' ')

    print()
    return W, features
def __init__(self):
    self.clf = Pipeline([
        ("RF", RandomForestRegressor(n_estimators=200, max_depth=15,
                                     n_jobs=N_JOBS))])
    self.scaler = StandardScaler()
    self.agglo = FeatureAgglomeration(n_clusters=500)
# Transpose and scale parameters
featListOriginal[[0, 1, 5, 7, 9, 10], :] = featListOriginal[[0, 1, 5, 7, 9, 10], :] * 0.8
featListOriginal[[2, 3, 4], :] = featListOriginal[[2, 3, 4], :] * 0.8 * 0.2
featListOriginal[6, :] = featListOriginal[6, :] * 0.8 * 0.8
featListOriginal = NP.transpose(featListOriginal)

## STANDARDIZE FEATURES ###################################################
# Don't standardize the centroids
for k in range(2, numFeat):
    featList[k] = (featList[k] - NP.mean(featList[k])) / NP.sqrt(NP.var(featList[k]))

# Transpose the feature list to use in clustering
featList = NP.transpose(featList)

feat_aggl = FeatureAgglomeration(2)
feat_aggl.fit(featList[:, 2:])

## AGGLOMERATIVE CLUSTERING ###############################################
aggl_all = AgglomerativeClustering(2)
X_All = featList[:, 2:]
y2 = aggl_all.fit_predict(X_All)

## PCA ####################################################################
pca_model = PCA(2)
X_PCA = pca_model.fit_transform(X_All)
print(pca_model.explained_variance_ratio_)

## SPLIT INTO numBINS #####################################################
def FeatureAgglomeration(self, clist, numClusters=2):
    FEATAGGL = FeatureAgglomeration(numClusters)
    FEATAGGL.fit(self.featList[:, clist])
    self.featureTree = FEATAGGL.children_
    self.featureLabels = FEATAGGL.labels_
    self.featureCList = clist
from sklearn.naive_bayes import BernoulliNB
from sklearn.cluster import FeatureAgglomeration
from sklearn.metrics import roc_curve, auc
from sklearn.cross_validation import KFold
from sklearn import metrics

###############################################################################
# Data IO and generation

# import some data to play with
#file = "/home/kbhalla/Desktop/Data/day_samp_new.npy"
file = "/home/rmendoza/Documents/Data/day_samp_new_0604.npy"
with open(file, "r") as file_in:
    matrix = smio.load_sparse_csr(file_in)

X = matrix[:, :-1]
FA = FeatureAgglomeration(n_clusters=250)
print(np.shape(X))
y = matrix[:, -1]
X = FA.fit_transform(X, y)
n_samples, n_features = X.shape
k = int(0.8 * n_samples)
#random_state = np.random.RandomState(0)
#X = np.c_[X, random_state.randn(n_samples, 2*n_features)]
X_test, y_test = X[k:, :], y[k:]
X, y = X[:k, :], y[:k]
sm = SMOTE(ratio=0.95)
X, y = sm.fit_sample(X, y)
print(np.shape(X))
start = time.time()
##################################################################
# Then we use FeatureAgglomeration from scikit-learn. Indeed, the voxels
# are the features of the data matrix.
#
# In addition, we use caching. As a result, the clustering doesn't have
# to be recomputed later.

# Computing the ward for the first time, this is long...
from sklearn.cluster import FeatureAgglomeration
# If you have scikit-learn older than 0.14, you need to import
# WardAgglomeration instead of FeatureAgglomeration
import time
start = time.time()
ward = FeatureAgglomeration(n_clusters=1000, connectivity=connectivity,
                            linkage='ward', memory='nilearn_cache')
ward.fit(fmri_masked)
print("Ward agglomeration 1000 clusters: %.2fs" % (time.time() - start))

# Compute the ward with more clusters, should be faster as we are using
# the caching mechanism
start = time.time()
ward = FeatureAgglomeration(n_clusters=2000, connectivity=connectivity,
                            linkage='ward', memory='nilearn_cache')
ward.fit(fmri_masked)
print("Ward agglomeration 2000 clusters: %.2fs" % (time.time() - start))

##################################################################
# Visualize results
# ------------------
#
def find_cluster(childrens, start, height, sample_size):
    res = start
    for i in range(height - 1):
        res = find_feature(childrens, res + sample_size)
    cluster = rec_cluster(childrens, childrens[res], sample_size)
    cluster.sort()
    return cluster


def find_feature_cluster(children, feature, height, sample_size):
    return find_cluster(children, find_feature(children, feature), height, sample_size)


BENCH_DATA = genfromtxt('Sequential_Application_SATUNSAT_track_wo_names.csv', delimiter=',')
#BENCH_DATA = BENCH_DATA.transpose()
print(BENCH_DATA.shape)
#print(np.isnan(BENCH_DATA).any())

ward = FeatureAgglomeration(linkage='average')
#print(ward.fit_predict(BENCH_DATA))
ward.fit(BENCH_DATA)
#print(ward.children_)
#print(find_feature_cluster(ward.children_, 0, 2, 300))

plt.title('SAT Feature_Agglomeration')
plot_dendrogram(ward, leaf_font_size=12)
#plt.savefig('SAT_Feature_Agglomeration.png')
plt.show()