def fa_dim_red(x_train_scaled, dataset_name, features_num=2):
    """Analyse and apply FeatureAgglomeration dimensionality reduction.

    For every cluster count from 1 to the number of columns, the data is
    agglomerated and projected back, and the mean squared reconstruction
    error is recorded. The errors are printed and plotted as a percentage of
    their sum, and the plot is saved to ``'<dataset_name> fa analysis.png'``
    before being shown. Finally the data is reduced to ``features_num``
    clusters.

    Parameters
    ----------
    x_train_scaled : array-like exposing ``shape``
        Data to reduce. Presumably already scaled; this is not checked here.
    dataset_name : object
        Converted with ``str`` for the plot title and the output file name.
    features_num : int, default 2
        Number of clusters used for the final reduction.

    Returns
    -------
    tuple
        ``(fa_result, x_projected_fa)``: the reduced data and its projection
        back into the original feature space.
    """
    losses = []
    for k in range(1, x_train_scaled.shape[1] + 1):
        fa = FeatureAgglomeration(n_clusters=k)
        reduced = fa.fit_transform(x_train_scaled)
        restored = fa.inverse_transform(reduced)
        losses.append(((x_train_scaled - restored) ** 2).mean())

    np_feature_losses_percent = np.multiply(100, losses / np.sum(losses))

    # Report the first cluster count at which the cumulative share of the
    # summed losses exceeds 90%.
    print('num of clustrs < 10% loss')
    cumulative = 0
    for i, percent in enumerate(np_feature_losses_percent):
        cumulative = cumulative + percent
        if cumulative > 90:
            print(i + 1)
            break
    print(np_feature_losses_percent)

    plt.bar(list(range(1, len(np_feature_losses_percent) + 1)),
            np_feature_losses_percent)
    plt.title("FeatureAgglomeration Projection Losses % (" + str(dataset_name) + ")")
    plt.ylabel("Mean Squared Error (% of Total)")
    plt.xlabel("Features")
    plt.savefig((str(dataset_name)) + ' fa analysis.png')
    plt.show()

    fa = FeatureAgglomeration(n_clusters=features_num)
    # The original passed an undefined ``y_train`` here; no target is
    # available inside this function, so fit on the features alone.
    fa_result = fa.fit_transform(x_train_scaled)
    print(fa_result.shape)
    x_projected_fa = fa.inverse_transform(fa_result)
    # The original printed the shape of an undefined ``x_projected_ica``.
    print(x_projected_fa.shape)
    print(x_train_scaled.shape)
    loss = ((x_train_scaled - x_projected_fa) ** 2).mean()
    print('loss')
    print(loss)
    return fa_result, x_projected_fa
def test_feature_agglomeration():
    """Check fit, transform and inverse_transform for mean and median pooling."""
    n_clusters = 1
    X = np.array([0, 0, 1]).reshape(1, 3)  # (n_samples, n_features)

    # Each pooling function is paired with the pooled value asserted for X.
    for pooling, pooled_value in ((np.mean, 1 / 3.), (np.median, 0.)):
        agglo = FeatureAgglomeration(n_clusters=n_clusters, pooling_func=pooling)
        assert_no_warnings(agglo.fit, X)
        assert np.size(np.unique(agglo.labels_)) == n_clusters
        assert np.size(agglo.labels_) == X.shape[1]

        # Test transform
        Xt = agglo.transform(X)
        assert Xt.shape[1] == n_clusters
        assert Xt == np.array([pooled_value])

        # Test inverse transform
        X_full = agglo.inverse_transform(Xt)
        assert np.unique(X_full[0]).size == n_clusters
        assert_array_almost_equal(agglo.transform(X_full), Xt)
def test_feature_agglomoration(self, n, data, corpus, docvecs):
    """Try several FeatureAgglomeration settings and cluster each result.

    One estimator is built for ``"ward"`` linkage with ``"euclidean"``
    affinity, and one for every combination of the remaining linkages with
    every listed affinity. Each estimator reduces ``data`` and the reduced
    vectors are passed to ``self.cluster_kmeans``, ``self.cluster_dbscan``
    and ``self.get_clusters``.

    Parameters
    ----------
    n : int
        Number of clusters for every estimator.
    data : array-like
        Matrix to reduce.
    corpus :
        Forwarded as ``feature_names`` to ``cluster_kmeans`` and as the first
        argument of ``get_clusters``.
    docvecs :
        Forwarded to ``cluster_kmeans``.

    Returns
    -------
    None
    """
    print("Using feature agglomoration to reduce the matrix' dimensionality...")
    affinities = ["euclidean", "l1", "l2", "manhattan", "cosine", "precomputed"]
    linkages = ["ward", "complete", "average"]

    agglos = []
    for linkage in linkages:
        # Compare by value: ``is`` on a string literal only works by accident
        # of interning and triggers a SyntaxWarning on Python 3.8+.
        if linkage == "ward":
            agglos.append(FeatureAgglomeration(n_clusters=n, affinity="euclidean",
                                               linkage=linkage))
        else:
            agglos.extend(
                FeatureAgglomeration(n_clusters=n, affinity=affinity, linkage=linkage)
                for affinity in affinities
            )

    for agglo in agglos:
        # Call get_params so the parameters are printed, not the bound method.
        print(agglo.get_params())
        reduced_vectors = agglo.fit_transform(data)
        # Return values are not used here; the calls are kept because the
        # helpers are defined outside this block and may have side effects.
        self.cluster_kmeans(docvecs, reduced_vectors=reduced_vectors,
                            feature_names=corpus)
        labels_db = self.cluster_dbscan(reduced_vectors)
        self.get_clusters(corpus, labels_db)
    return
def TrainRFRegression(df1):
    """Train a random-forest regressor on agglomerated design-matrix features.

    The design matrices are built from ``df1``, the features are reduced to
    50 clusters with FeatureAgglomeration, a 50-tree RandomForestRegressor is
    fitted, and the fitted forest is pickled to ``'RFR_trained_model.pickle'``.

    Parameters
    ----------
    df1 : DataFrame
        Source data containing the ``regressand`` column.

    Returns
    -------
    tuple
        ``(r2, mae)``: R^2 and mean absolute error of the predictions on the
        training data itself. The second value was historically held in a
        variable named ``rmse`` but has always been the mean absolute error.
    """
    # generate the equation to use for our design matrices
    eqn = build_eqn(df1, 'regressand', ['any_regressand', 'X25', 'X26', 'X29'])

    # build our design matrices
    y, X = dmatrices(eqn, data=df1, return_type='dataframe')
    # ``pd.np`` was removed from pandas; flatten via the DataFrame itself.
    target = y.values.ravel()

    # employ clustering to reduce our dimensionality
    X_reduction = FeatureAgglomeration(n_clusters=50).fit(X, target)
    reduced_X = X_reduction.transform(X)

    # define and fit our regressor
    mod = RandomForestRegressor(n_estimators=50)
    res = mod.fit(reduced_X, target)

    # evaluate our fit (on the same data the model was trained on)
    yp = pd.DataFrame({'predicted': res.predict(reduced_X)})['predicted']
    yt = y['regressand']
    r2 = metrics.r2_score(yt, yp)
    mae = metrics.mean_absolute_error(yt, yp)

    # save our model
    # NOTE(review): only the fitted forest is pickled; the feature
    # agglomerator ``X_reduction`` is not saved with it.
    with open('RFR_trained_model.pickle', 'wb') as output:
        pickle.dump(res, output, pickle.HIGHEST_PROTOCOL)
    return r2, mae
def test_linkage_misc():
    """Miscellaneous checks on linkage handling and precomputed affinities."""
    X = np.random.RandomState(42).normal(size=(5, 5))

    # An unknown linkage name or a (4, 4) connectivity for 5 samples must
    # raise ValueError.
    failing_calls = (
        (AgglomerativeClustering(linkage='foo').fit, {}),
        (linkage_tree, {'linkage': 'foo'}),
        (linkage_tree, {'connectivity': np.ones((4, 4))}),
    )
    for func, kwargs in failing_calls:
        assert_raises(ValueError, func, X, **kwargs)

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    # Deprecation of Ward class: exactly one warning is recorded
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always", DeprecationWarning)
        Ward().fit(X)
    assert_equal(len(caught), 1)

    # Hierarchical clustering on a precomputed distance matrix matches the
    # result of naming the metric directly.
    precomputed_tree = linkage_tree(cosine_distances(X), affinity="precomputed")
    assert_array_equal(precomputed_tree[0],
                       linkage_tree(X, affinity="cosine")[0])

    # The same holds when the affinity is given as a callable.
    callable_tree = linkage_tree(X, affinity=manhattan_distances)
    assert_array_equal(callable_tree[0],
                       linkage_tree(X, affinity="manhattan")[0])
def test_ward_agglomeration():
    """
    Check that we obtain the correct solution in a simplistic case
    """
    rnd = np.random.RandomState(0)
    # Use the builtin ``bool``: the ``np.bool`` alias was deprecated in
    # NumPy 1.20 and later removed, while ``bool`` works on every version.
    mask = np.ones([10, 10], dtype=bool)
    X = rnd.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    assert_warns(DeprecationWarning, WardAgglomeration)

    with warnings.catch_warnings(record=True) as warning_list:
        warnings.simplefilter("always", DeprecationWarning)
        if hasattr(np, 'VisibleDeprecationWarning'):
            # Let's not catch the numpy internal DeprecationWarnings
            warnings.simplefilter('ignore', np.VisibleDeprecationWarning)
        ward = WardAgglomeration(n_clusters=5, connectivity=connectivity)
        ward.fit(X)
    assert_equal(len(warning_list), 1)

    # FeatureAgglomeration must produce the same labels as the deprecated class
    agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity)
    agglo.fit(X)
    assert_array_equal(agglo.labels_, ward.labels_)
    assert_true(np.size(np.unique(agglo.labels_)) == 5)

    X_red = agglo.transform(X)
    assert_true(X_red.shape[1] == 5)
    X_full = agglo.inverse_transform(X_red)
    assert_true(np.unique(X_full[0]).size == 5)
    assert_array_almost_equal(agglo.transform(X_full), X_red)

    # Check that fitting with no samples raises a ValueError
    assert_raises(ValueError, agglo.fit, X[:0])
def test_linkage_misc():
    """Miscellaneous checks on linkage handling, warnings and precomputed input."""
    X = np.ones((5, 5))

    # An unknown linkage name or a (4, 4) connectivity for 5 samples must
    # raise ValueError.
    failing_calls = (
        (AgglomerativeClustering(linkage='foobar').fit, {}),
        (linkage_tree, {'linkage': 'foobar'}),
        (linkage_tree, {'connectivity': np.ones((4, 4))}),
    )
    for func, kwargs in failing_calls:
        assert_raises(ValueError, func, X, **kwargs)

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    with warnings.catch_warnings(record=True) as ward_warnings:
        warnings.simplefilter("always", UserWarning)
        # Use the copy argument, to raise a warning
        Ward(copy=True).fit(X)
    # We should be getting 2 warnings: one for using Ward that is
    # deprecated, one for using the copy argument
    assert_equal(len(ward_warnings), 2)

    with warnings.catch_warnings(record=True) as tree_warnings:
        warnings.simplefilter("always", UserWarning)
        # Use the copy argument, to raise a warning
        ward_tree(X, copy=True)
    # We should be getting 1 warning: for using the copy argument
    assert_equal(len(tree_warnings), 1)

    # Hierarchical clustering on a precomputed distance matrix matches the
    # result of naming the metric directly.
    precomputed_tree = linkage_tree(cosine_distances(X), affinity="precomputed")
    assert_array_equal(precomputed_tree[0],
                       linkage_tree(X, affinity="cosine")[0])
def test_ward_agglomeration():
    """
    Check that we obtain the correct solution in a simplistic case
    """
    rng = np.random.RandomState(0)
    # Use the builtin ``bool``: the ``np.bool`` alias was deprecated in
    # NumPy 1.20 and later removed, while ``bool`` works on every version.
    mask = np.ones([10, 10], dtype=bool)
    X = rng.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    assert_warns(DeprecationWarning, WardAgglomeration)

    with ignore_warnings():
        ward = WardAgglomeration(n_clusters=5, connectivity=connectivity)
        ward.fit(X)

    # FeatureAgglomeration must produce the same labels as the deprecated class
    agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity)
    agglo.fit(X)
    assert_array_equal(agglo.labels_, ward.labels_)
    assert_true(np.size(np.unique(agglo.labels_)) == 5)

    X_red = agglo.transform(X)
    assert_true(X_red.shape[1] == 5)
    X_full = agglo.inverse_transform(X_red)
    assert_true(np.unique(X_full[0]).size == 5)
    assert_array_almost_equal(agglo.transform(X_full), X_red)

    # Check that fitting with no samples raises a ValueError
    assert_raises(ValueError, agglo.fit, X[:0])
def Reduction(X_train, alg_name):
    """Score a network on data reduced to every possible component count.

    For each component count from 1 to ``len(X_train[0])`` the selected
    reducer is fitted on ``X_train`` and applied to ``X_test``, then ``NN``
    is called on the reduced pair and its run time is recorded.

    ``X_test`` and ``NN`` are not defined in this function; they must be
    available from the enclosing scope.

    Parameters
    ----------
    X_train : sequence of sequences
        Training data; the length of the first row sets the loop bound.
    alg_name : {'pca', 'ica', 'rp', 'fa'}
        Reducer to use. Any other value stops the loop on the first pass, so
        three empty lists are returned.

    Returns
    -------
    tuple of lists
        ``(train_scores, test_scores, times)``, one entry per component count.
    """
    train_scores, test_scores, times = [], [], []
    print('algorithm:', alg_name)

    for component in range(1, len(X_train[0]) + 1):
        # Progress marker every tenth component count
        if not component % 10:
            print(component)

        if alg_name == 'fa':
            alg = FeatureAgglomeration(n_clusters=component)
        elif alg_name == 'rp':
            alg = GaussianRandomProjection(n_components=component, random_state=1)
        elif alg_name == 'ica':
            alg = FastICA(random_state=1, n_components=component)
        elif alg_name == 'pca':
            alg = PCA(n_components=component, random_state=1)
        else:
            break

        X_train_reduced = alg.fit_transform(X_train)
        X_test_reduced = alg.transform(X_test)

        # Only the NN call is timed, not the reduction itself
        started = time.time()
        train_score, test_score = NN(X_train_reduced, X_test_reduced)
        elapsed = time.time() - started

        times.append(elapsed)
        train_scores.append(train_score)
        test_scores.append(test_score)

    return train_scores, test_scores, times
def feature_agglomeration(voters_data, n, rounding=False):
    """Agglomerate the columns of a DataFrame into ``n`` hybrid features.

    Each output column is named after the input columns pooled into it,
    joined with ``", "``.

    Parameters
    ----------
    voters_data : DataFrame
        Input data. Column names are joined as strings, so they are assumed
        to be ``str``.
    n : int
        Number of clusters (output columns).
    rounding : bool, default False
        When true, every value of the result is passed through ``round``.

    Returns
    -------
    DataFrame
        The agglomerated data, indexed like ``voters_data``.
    """
    featagg = FeatureAgglomeration(n_clusters=n)
    featagg.fit(voters_data)
    condensed = featagg.transform(voters_data)

    # Collect the member columns of every cluster id.
    feature_groups_map = dict(zip(voters_data.columns, featagg.labels_))
    members = {}
    for feature, group_no in feature_groups_map.items():
        members.setdefault(group_no, []).append(feature)

    # Emit labels in ascending cluster-id order explicitly instead of relying
    # on set iteration order, so each label lines up with the transformed
    # column it describes.
    group_labels = [", ".join(members[group_no]) for group_no in sorted(members)]

    voters_agglomerated = pd.DataFrame(condensed, columns=group_labels,
                                       index=voters_data.index)
    if rounding:
        voters_agglomerated = voters_agglomerated.applymap(lambda x: round(x))

    print("🔹→💠←🔹 {} features agglomerated into {} hybrid features.".format(
        len(voters_data.columns), len(voters_agglomerated.columns)))
    return voters_agglomerated
def TestSGDRegression(df1):
    """Predict with the pickled SGD model on agglomerated, standardized features.

    ``n_clusters`` is not defined in this function; it must be available
    from the enclosing scope.

    Parameters
    ----------
    df1 : DataFrame
        Data from which the design matrix is built.

    Returns
    -------
    The output of the loaded model's ``predict`` on the prepared features.
    """
    # generate the equation to use for our design matrix
    eqn = build_eqn(df1, 'regressand',
                    ['any_regressand', 'X25', 'X26', 'X29', 'X13'])

    # build the design matrix from the right-hand side only, without intercept
    rhs = eqn.replace('regressand ~ ', '0+')
    X = dmatrix(rhs, data=df1, return_type='dataframe')

    # load our model
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use with a trusted 'SGD_trained_model.pickle'.
    with open('SGD_trained_model.pickle', 'rb') as model_file:
        res = pickle.load(model_file)

    # NOTE(review): the agglomerator and the scaler are both fitted here on
    # the data being predicted, not restored from training.
    reduced_X = FeatureAgglomeration(n_clusters=n_clusters).fit(X).transform(X)
    std_X = StandardScaler().fit(reduced_X).transform(reduced_X)

    # predict with the loaded model
    return res.predict(std_X)
def to_sklearn(self, n_samples: int = 0, n_features: int = 0, **kwargs):
    """Build the scikit-learn ``FeatureAgglomeration`` described by this object.

    When ``self.distance_threshold`` is set, ``n_clusters`` is ``None`` and
    ``self.compute_full_tree`` is overwritten with ``True``. Otherwise an
    ``int`` ``self.n_clusters_factor`` is used directly, and any other value
    is resolved with ``resolve_factor`` and clamped to at most
    ``n_features - 1`` and at least 2.

    Parameters
    ----------
    n_samples : int
        Not used by this method.
    n_features : int
        Feature count used when resolving a non-integer cluster factor.
    **kwargs
        Not used by this method.

    Raises
    ------
    ValueError
        If ``self.pooling_func`` is not ``"mean"``, ``"median"`` or ``"max"``.
    """
    from sklearn.cluster import FeatureAgglomeration

    pooling_func = None
    for name, func in (("mean", np.mean), ("median", np.median), ("max", np.max)):
        if self.pooling_func == name:
            pooling_func = func
            break
    if pooling_func is None:
        raise ValueError(f"Unknown pooling function '{self.pooling_func}'")

    if self.distance_threshold is not None:
        n_clusters = None
        self.compute_full_tree = True
    elif isinstance(self.n_clusters_factor, int):
        n_clusters = self.n_clusters_factor
    else:
        resolved = resolve_factor(self.n_clusters_factor, n_features,
                                  default=2, cs_default=1.)
        n_clusters = max(min(resolved, (n_features - 1)), 2)

    return FeatureAgglomeration(
        n_clusters=n_clusters,
        affinity=self.affinity,
        compute_full_tree=self.compute_full_tree,
        linkage=self.linkage,
        distance_threshold=self.distance_threshold,
        pooling_func=pooling_func,
    )
def makeSpeakerGridPlots(sarcasmDf, bertFeats=None, show=False):
    """Plot every stored transform of the features for both label orderings.

    When ``bertFeats`` is given, five transforms of it are computed and
    pickled to ``'./data/transformData.pkl'``; otherwise the transforms are
    loaded from that file. For each of the orderings
    ``('speaker', 'sarcasm')`` and ``('sarcasm', 'speaker')`` and each
    transform, a grid is built with ``makeDataPlots``, optionally shown, and
    saved under ``imgDir`` using the grid's window title as the file name.

    ``imgDir``, ``makeDataPlots`` and ``saveGrid`` are not defined in this
    function; they must be available from the enclosing scope.
    """
    tformFile = './data/transformData.pkl'

    if bertFeats is not None:
        print('Regenerating transform data...')
        dataMap = {
            'PCA': PCA().fit_transform(bertFeats),
            'TSNE': TSNE().fit_transform(bertFeats),
            'Agglomeration': FeatureAgglomeration().fit_transform(bertFeats),
            'Gaussian Projection':
                random_projection.GaussianRandomProjection(2).fit_transform(bertFeats),
            'Sparse Projection':
                random_projection.SparseRandomProjection(2).fit_transform(bertFeats),
        }
        with open(tformFile, 'wb') as ofile:
            pkl.dump(dataMap, ofile)
    else:
        # NOTE(review): unpickling executes arbitrary code; the cache file
        # must be trusted.
        with open(tformFile, 'rb') as ifile:
            dataMap = pkl.load(ifile)

    orderings = (('speaker', 'sarcasm'), ('sarcasm', 'speaker'))
    for combo in orderings:
        for tform in dataMap:
            grid = makeDataPlots(dataMap[tform], sarcasmDf, *combo, tform)
            if show:
                grid.show()
            saveGrid(grid, imgDir / f'{grid.windowTitle()}.jpg')
def cluster(self, cluster_images, cluster_db_path, diffnet_paht='model_checkpoints/', save_csv=False):
    """Compute compressed features for images and agglomerate them.

    Each image is fed through a restored ``DiffNet``, its output is run
    through ``self.compressed`` in ``self.sess``, and the collected
    compressions are reduced with a 30-cluster FeatureAgglomeration.

    Parameters
    ----------
    cluster_images : sequence
        Images to process; each is passed to ``DiffNet.feedforward``.
    cluster_db_path :
        Forwarded to ``DiffNet.feedforward``.
    diffnet_paht : str, default 'model_checkpoints/'
        Location passed to ``DiffNet.restore``. The misspelt name is kept
        because callers may pass it by keyword.
    save_csv : bool, default False
        When true, the rows are also written to ``'cluster_result.csv'``
        with ``';'`` as delimiter.

    Returns
    -------
    list of dict
        One row per image with keys ``'1.img'``, ``'2.class'`` (argmax of
        the image's reduced vector) and ``'3.features'`` (its compression).
    """
    compressions = []

    # Finding features
    diffnet = DiffNet(self.db, db_path=self.db_path)
    diffnet.restore(diffnet_paht)

    print("Calculating features for", len(cluster_images), "images")
    for img in cluster_images:
        print("Finding features for:", img)
        one_hot = diffnet.feedforward(img, cluster_db_path)
        output = self.sess.run(self.compressed,
                               feed_dict={self.Q: [one_hot], self.keep_prob: 1.})
        compressions.append(output[0])

    # Clustering
    print("Performing clustering...")
    compressions = np.array(compressions)
    fa = FeatureAgglomeration(n_clusters=30)
    X_clusters = fa.fit_transform(compressions)

    print("Collecting data...")
    csv_dict_arr = [
        {'1.img': img,
         '2.class': np.argmax(X_clusters[i]),
         '3.features': compressions[i]}
        for i, img in enumerate(cluster_images)
    ]

    # Saving
    if save_csv:
        print("Saving data to csv...")
        keys = load_label_list(csv_dict_arr[0])
        # newline='' lets the csv module control line endings; without it
        # extra blank rows appear on platforms that translate '\n'.
        with open('cluster_result.csv', 'w', newline='') as output_file:
            dict_writer = csv.DictWriter(output_file, keys, delimiter=';')
            dict_writer.writeheader()
            dict_writer.writerows(csv_dict_arr)

    return csv_dict_arr
def get_transform(algorithm):
    """
    Instantiate the transform object registered under the given name.

    Parameters
    ----------
    algorithm : {'pca', 'kpca', 'grp', 'fa', 'k_best'}
        Name of the transform to create.

    Returns
    ----------
    transform : object
        Newly created transform; ``'k_best'`` yields a ``SelectKBest``
        scoring with ``mutual_info_regression``, the others are built with
        default arguments.

    Raises
    ----------
    Exception
        If ``algorithm`` is not one of the names above.
    """
    # Factories are wrapped in lambdas so a class is only looked up and
    # instantiated for the name that was actually requested.
    factories = (
        ('pca', lambda: PCA()),
        ('kpca', lambda: KernelPCA()),
        ('grp', lambda: GaussianRandomProjection()),
        ('fa', lambda: FeatureAgglomeration()),
        ('k_best', lambda: SelectKBest(mutual_info_regression)),
    )
    for name, factory in factories:
        if algorithm == name:
            return factory()

    raise Exception(
        'No selection algorithm defined for {0}'.format(algorithm))
def FeatureSelection(df, numeric_cols, corrCoefThres=0.9):
    """Pick one representative per group of highly correlated numeric columns.

    The distance between two columns is ``1 / |correlation|``. Columns are
    clustered with average-linkage FeatureAgglomeration on that precomputed
    distance, cutting the tree at ``1 / corrCoefThres``, and the first
    column of every cluster is kept.

    Parameters
    ----------
    df : DataFrame
        Source data.
    numeric_cols : list
        Names of the columns to consider.
    corrCoefThres : float, default 0.9
        Absolute correlation used to derive the distance threshold.

    Returns
    -------
    numpy.ndarray
        Names of the selected columns.
    """
    from sklearn.cluster import FeatureAgglomeration

    numdf = df[numeric_cols]
    r_in_x = numdf.corr().abs()
    distance_in_x = 1 / r_in_x

    # Put a very large value on the diagonal. The original wrote ``10 ^ 10``,
    # which is bitwise XOR and evaluates to 0, not ten to the tenth power.
    for i in range(r_in_x.shape[0]):
        distance_in_x.iloc[i, i] = 10 ** 10

    # Undefined correlations (NaN) are replaced with the largest distance
    # present in the matrix.
    cpdist = distance_in_x.fillna(distance_in_x.max().max())

    fa = FeatureAgglomeration(n_clusters=None, affinity="precomputed",
                              compute_full_tree=True, linkage="average",
                              distance_threshold=1 / corrCoefThres)
    fa.fit(cpdist)

    fadf = pd.DataFrame({"feature": numdf.columns.values, "label": fa.labels_})
    selectedFeatures = fadf.groupby("label").head(1)["feature"].values
    return selectedFeatures
def feature_agg(self, Data, size=5):
    """Group the feature names of ``Data`` into ``size`` agglomeration clusters.

    ``Data`` is scaled with ``scale`` before fitting. The result is stored on
    the instance: ``self.features_name`` holds ``list(Data)`` and
    ``self.features_clusters`` holds, for each cluster id from 0 to
    ``size - 1``, an array of the names assigned to that cluster.
    """
    scaled = np.array(scale(Data))
    clus = FeatureAgglomeration(n_clusters=size).fit(scaled)

    self.features_name = list(Data)
    names = np.array(self.features_name)
    self.features_clusters = [
        names[np.where(clus.labels_ == cluster_id)]
        for cluster_id in range(size)
    ]
def createPipe(embed, classif, nmca, aggregation, nsubs):
    """Build a two-step pipeline: dimension reduction followed by a classifier.

    Parameters
    ----------
    embed : str
        ``"pca"`` selects PCA; any other value selects FeatureAgglomeration.
    classif : {'svc', 'knn', 'rfc', 'ada', 'lrc'}
        Key of the classifier to use.
    nmca : number
        Multiplier applied to the neighbour count when ``aggregation`` is
        ``"mega"``.
    aggregation : str
        ``"mega"`` scales the neighbour count by ``nmca``.
    nsubs : number
        Drives both the number of components (20 above 70, otherwise 15) and
        the neighbour count (10% of it).

    Returns
    -------
    Pipeline

    Raises
    ------
    KeyError
        If ``classif`` is not one of the supported keys.
    """
    # Dimension Reduction
    n_comp = 20 if nsubs > 70 else 15
    if embed == "pca":
        emb = ('pca', PCA(n_components=n_comp))
    else:
        emb = ('fa', FeatureAgglomeration(n_clusters=n_comp))

    # Classifiers
    neib = int(nmca * nsubs * 0.1) if aggregation == "mega" else int(nsubs * 0.1)
    # A neighbour count of 0 is invalid for k-NN; keep at least one.
    neib = max(neib, 1)
    # max_iter must be an integer: recent scikit-learn validates the
    # parameter type and rejects the float 1e6.
    max_iter = 1000000

    clfs = {
        'svc': ('svc', SVC(class_weight="balanced", probability=True,
                           max_iter=max_iter)),
        'knn': ('knn', KNeighborsClassifier(n_neighbors=neib)),
        'rfc': ('rfc', RandomForestClassifier(class_weight="balanced")),
        'ada': ('ada', AdaBoostClassifier()),
        'lrc': ('lrc', LogisticRegression(class_weight="balanced",
                                          solver='liblinear',
                                          max_iter=max_iter)),
    }

    pipe = Pipeline(steps=[emb, clfs[classif]])
    return pipe
def build_impl(self):
    """Create ``self.model`` as a FeatureAgglomeration from a copy of ``self.config``.

    The copy is adjusted before use: ``'affinity'`` is forced to
    ``'euclidean'`` when the linkage is ``'ward'``, and the
    ``'n_components'`` entry is renamed to ``'n_clusters'``.
    """
    params = self.config.copy()

    if params['linkage'] == 'ward':
        params['affinity'] = 'euclidean'

    # Replace key name.
    n_clusters = params.pop('n_components')
    params['n_clusters'] = n_clusters

    self.model = FeatureAgglomeration(**params)
def comput_coefs(self, X, y, size):
    """Estimate coefficient maps with two feature-reduction strategies.

    Both strategies feed a BayesianRidge and are tuned by grid search with
    2-fold cross-validation:

    * Ward feature agglomeration on a ``size`` x ``size`` grid, choosing the
      number of clusters from 10, 20 and 30;
    * ANOVA percentile selection, choosing the percentile from 5, 10 and 20.

    The coefficients of each best estimator are mapped back to feature space
    with the first pipeline step's ``inverse_transform`` and reshaped to
    ``(size, size)``.

    Parameters
    ----------
    X, y :
        Training data and targets passed to ``GridSearchCV.fit``.
    size : int
        Side length of the square feature grid.

    Returns
    -------
    dict
        Keys ``coef_selection_``, ``coef_agglomeration_`` and ``cachedir``
        (the temporary cache directory created here; it is not removed by
        this method).
    """
    cv = KFold(2)  # cross-validation generator for model selection
    ridge = BayesianRidge()
    cachedir = tempfile.mkdtemp()
    # Pass the cache directory positionally: the keyword was renamed from
    # ``cachedir`` to ``location`` in joblib, but it has always been the
    # first positional parameter.
    mem = Memory(cachedir, verbose=1)

    # Ward agglomeration followed by BayesianRidge
    connectivity = grid_to_graph(n_x=size, n_y=size)
    ward = FeatureAgglomeration(n_clusters=10, connectivity=connectivity,
                                memory=mem)
    clf = Pipeline([('ward', ward), ('ridge', ridge)])
    # Select the optimal number of parcels with grid search
    clf = GridSearchCV(clf, {'ward__n_clusters': [10, 20, 30]}, n_jobs=1, cv=cv)
    clf.fit(X, y)  # set the best parameters
    coef_ = clf.best_estimator_.steps[-1][1].coef_
    coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_)
    coef_agglomeration_ = coef_.reshape(size, size)

    # Anova univariate feature selection followed by BayesianRidge
    f_regression = mem.cache(feature_selection.f_regression)  # caching function
    anova = feature_selection.SelectPercentile(f_regression)
    clf = Pipeline([('anova', anova), ('ridge', ridge)])
    # Select the optimal percentage of features with grid search
    clf = GridSearchCV(clf, {'anova__percentile': [5, 10, 20]}, cv=cv)
    clf.fit(X, y)  # set the best parameters
    coef_ = clf.best_estimator_.steps[-1][1].coef_
    # The selector's inverse_transform is given a 2-D row vector here
    coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_.reshape(1, -1))
    coef_selection_ = coef_.reshape(size, size)

    return dict(
        coef_selection_=coef_selection_,
        coef_agglomeration_=coef_agglomeration_,
        cachedir=cachedir
    )
def test_linkage_misc():
    """Miscellaneous checks on linkage handling and precomputed affinities."""
    X = np.random.RandomState(42).normal(size=(5, 5))

    # An unknown linkage name or a (4, 4) connectivity for 5 samples must
    # raise ValueError.
    failing_calls = (
        (AgglomerativeClustering(linkage='foo').fit, {}),
        (linkage_tree, {'linkage': 'foo'}),
        (linkage_tree, {'connectivity': np.ones((4, 4))}),
    )
    for func, kwargs in failing_calls:
        with pytest.raises(ValueError):
            func(X, **kwargs)

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    # Hierarchical clustering on a precomputed distance matrix matches the
    # result of naming the metric directly.
    precomputed_tree = linkage_tree(cosine_distances(X), affinity="precomputed")
    assert_array_equal(precomputed_tree[0],
                       linkage_tree(X, affinity="cosine")[0])

    # The same holds when the affinity is given as a callable.
    callable_tree = linkage_tree(X, affinity=manhattan_distances)
    assert_array_equal(callable_tree[0],
                       linkage_tree(X, affinity="manhattan")[0])
def feature_agglomeration(X, args=None):
    """
    Cluster the features with hierarchical clustering for dimensionality
    reduction.

    Parameters
    ----------
    X : array-like
        Data the agglomerator is fitted on.
    args : dict, optional
        Keyword arguments forwarded to ``FeatureAgglomeration``. ``None``
        (the default) means no extra arguments.

    Returns
    -------
    FeatureAgglomeration
        The fitted estimator.
    """
    from sklearn.cluster import FeatureAgglomeration

    # Avoid a shared mutable default argument.
    if args is None:
        args = {}
    fam = FeatureAgglomeration(**args)
    fam.fit(X)
    return fam
def untangle(X: Iterable, y: Iterable, n_clusters: int = None, get_connectivity: bool = True, compute_distances: bool = True, kind: str = 'correlation', agglo_kws: Union[dict, Bunch] = None) -> FeatureAgglomeration:
    """Fit a FeatureAgglomeration and pick a leading feature per cluster.

    Parameters
    ----------
    X : DataFrame or array-like
        Feature matrix; converted to a DataFrame when it is not one already.
        NOTE(review): ``feature_names_in_`` is read after fitting, which
        presumably requires string column names - confirm against callers.
    y : Series or array-like
        Target; converted to a Series when it is not one already.
    n_clusters : int, optional
        Number of clusters. Defaults to half the number of columns minus
        one, but never less than 1.
    get_connectivity : bool, default True
        When ``True``, a connectivity matrix is computed from ``X`` with a
        ``ConnectivityMeasure`` using ``LedoitWolf`` and ``kind``.
    compute_distances : bool, default True
        Forwarded to ``FeatureAgglomeration``.
    kind : str, default 'correlation'
        Connectivity kind forwarded to ``ConnectivityMeasure``.
    agglo_kws : dict or Bunch, optional
        Overrides for the default ``FeatureAgglomeration`` keyword arguments.

    Returns
    -------
    FeatureAgglomeration
        The fitted estimator with two extra attributes: ``cluster_indexes_``
        (feature names grouped by cluster label) and ``factor_leaders_``
        (per cluster, the feature selected by ``SelectKBest(k=1)`` with
        ``mutual_info_classif``).
    """
    from nilearn.connectome import ConnectivityMeasure as CM
    from sklearn.cluster import FeatureAgglomeration
    from sklearn.covariance import LedoitWolf
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import mutual_info_classif

    # Normalise the inputs first: ``X.values`` and ``X.shape`` are needed
    # below, before the estimator is fitted.
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)
    if not isinstance(y, pd.Series):
        y = pd.Series(y)

    agglo_defs = dict(affinity='euclidean',
                      compute_full_tree='auto',
                      linkage='ward',
                      pooling_func=np.mean,
                      distance_threshold=None,
                      compute_distances=compute_distances)

    if get_connectivity is True:
        connect_mat = CM(LedoitWolf(), kind=kind).fit_transform([X.values])[0]
    else:
        connect_mat = None

    if n_clusters is None:
        # Clamp to 1: with fewer than two columns the raw value is -1, which
        # the previous ``== 0`` check did not catch.
        n_clusters = max(X.shape[1] // 2 - 1, 1)

    if agglo_kws is None:
        agglo_kws = {}
    agglo_defs.update(agglo_kws)

    agglo = FeatureAgglomeration(n_clusters=n_clusters,
                                 connectivity=connect_mat,
                                 **agglo_defs)
    agglo.fit(X, y)

    setattr(
        agglo, 'cluster_indexes_',
        pd.DataFrame(zip(agglo.labels_, agglo.feature_names_in_),
                     columns=['cluster', 'feature']).groupby('cluster').feature)

    # Iterating the groupby yields (cluster label, feature names) pairs.
    skb = SelectKBest(k=1, score_func=mutual_info_classif)
    factor_leaders_ = [
        skb.fit(X[itm[1]], y).get_feature_names_out()[0]
        for itm in tuple(agglo.cluster_indexes_)
    ]
    setattr(agglo, 'factor_leaders_', factor_leaders_)
    return agglo
def featureagglomeration(data_train, data_test, label_train, label_test, args):
    """Fit a 10-cluster agglomeration on the training data and evaluate on test.

    The test data is transformed, passed through ``find_highest`` and
    compared against ``label_test`` with ``compare_class``. When
    ``args.create_mean`` is truthy, mean images are also created with the
    ``'fa'`` tag. ``label_train`` is not used.
    """
    print('feature agglomeration')
    fitted = FeatureAgglomeration(n_clusters=10).fit(data_train)
    agglomeration = find_highest(fitted.transform(data_test))
    print('feature agglomeration done')

    compare_class(agglomeration, label_test)

    if not args.create_mean:
        return
    create_images_from_rows('fa', mean_image(agglomeration, data_test))
def token_cluster(self, n_clusters=300):
    """Agglomerate the columns of ``self.bow_corpus`` into ``n_clusters`` features.

    ``self.bow_corpus`` is replaced by the reduced matrix, stored as a SciPy
    CSR sparse matrix.

    Parameters
    ----------
    n_clusters : int, default 300
        Number of feature clusters to keep. Earlier versions ignored this
        argument and always used a hard-coded 3000.
    """
    from scipy import sparse
    from sklearn.cluster import FeatureAgglomeration

    FA = FeatureAgglomeration(n_clusters=n_clusters)
    reduced = FA.fit_transform(self.bow_corpus)
    self.bow_corpus = sparse.csr_matrix(reduced)
def get_encoder(metas, train_data, target_output_dim):
    """Train a FeatureAgglomeration model, persist it and wrap it in an encoder.

    Parameters
    ----------
    metas : mapping
        Must contain ``'workspace'``, the directory the model is written to.
    train_data : array-like
        Data the model is fitted on.
    target_output_dim : int
        Number of clusters.

    Returns
    -------
    FeatureAgglomerationEncoder
        Encoder constructed with the path of the pickled model
        (``<workspace>/feature_agglomeration.model``).
    """
    model_path = os.path.join(metas['workspace'], 'feature_agglomeration.model')

    model = FeatureAgglomeration(n_clusters=target_output_dim)
    model.fit(train_data)

    # Use a context manager so the file is flushed and closed before the
    # encoder is handed the path.
    with open(model_path, 'wb') as model_file:
        pickle.dump(model, model_file)

    return FeatureAgglomerationEncoder(model_path=model_path)
def do_feature_agglomoration(self, data):
    """Reduce ``data`` with cosine-affinity, complete-linkage agglomeration.

    The number of clusters is ``self.k`` when that is truthy, otherwise 20.
    Returns the transformed data.
    """
    print("Using feature agglomoration to reduce the matrix' dimensionality...")
    n = self.k or 20
    reducer = FeatureAgglomeration(n_clusters=n, affinity="cosine",
                                   linkage="complete")
    return reducer.fit_transform(data)
def agglo_fn(X, expected_shape=(7501, 6)):
    """Agglomerate all features of ``X`` into a single cluster.

    Parameters
    ----------
    X : array-like exposing ``shape``
        Input data. If its shape differs from ``expected_shape`` it is
        transposed before fitting.
    expected_shape : tuple, default (7501, 6)
        Shape for which ``X`` is used as is. The default reproduces the
        previously hard-coded value.

    Returns
    -------
    The output of ``FeatureAgglomeration(n_clusters=1).fit_transform``.
    """
    # pandas and matplotlib were imported here but never used; only the
    # estimator is needed.
    from sklearn.cluster import FeatureAgglomeration

    if X.shape != expected_shape:
        X = np.transpose(X)
    agglo = FeatureAgglomeration(n_clusters=1).fit_transform(X)
    return agglo
def test_feature_agglomeration_feature_names_out():
    """Check `get_feature_names_out` for `FeatureAgglomeration`."""
    X, _ = make_blobs(n_features=6, random_state=0)
    agglo = FeatureAgglomeration(n_clusters=3).fit(X)

    # One generated name per fitted cluster, prefixed with the lower-cased
    # class name.
    expected_names = [
        f"featureagglomeration{i}" for i in range(agglo.n_clusters_)
    ]
    assert_array_equal(expected_names, agglo.get_feature_names_out())
def makePipeline(self, classifier, n_clusters):
    """Chain a feature-agglomeration step in front of the given classifier.

    The pipeline has two steps: ``'reduce_dim'``, a euclidean-affinity
    FeatureAgglomeration with ``n_clusters`` clusters, and
    ``'main_classifier'``, the classifier passed in.
    """
    reducer = FeatureAgglomeration(n_clusters=n_clusters, affinity='euclidean')
    return Pipeline([
        ('reduce_dim', reducer),
        ('main_classifier', classifier),
    ])