def test_list_feature_union_transform(X):
    """Compare a ``ListFeatureUnion`` of single-column projections with one
    multi-column ``Projection``.

    Concatenating the outputs of the union along axis 1 must give (almost)
    the same array as projecting onto both columns at once.
    """
    columns = [0, 1]

    # One named single-column projection per entry of ``columns``.
    named_projections = []
    for col in columns:
        named_projections.append(("proj" + str(col), Projection(columns=col)))
    union = ListFeatureUnion(named_projections)
    joint = Projection(columns=columns)

    joint.fit(X)
    union.fit(X)

    expected = joint.transform(X)
    # The union's transform output is concatenated column-wise before the
    # comparison.
    stacked = np.concatenate(union.transform(X), axis=1)
    assert_almost_equal(expected, stacked)
def _runMapper(self):
    """
    Create one Mapper graph per component from ``self.data``.

    For each ``k`` in ``range(self.n_components)`` a Mapper pipeline is built
    whose filter function is ``self.rep`` (registered as the ``'pca'`` step)
    followed by a ``Projection`` onto column ``k``, with a balanced
    ``OneDimensionalCover`` and a shared ``FirstSimpleGap`` clusterer.

    Results are cached as pickles under ``TEMP_DATA``. Unless ``self.remake``
    is truthy, an existing cache is loaded instead of recomputing. Both cache
    files must exist for the cache to be used; otherwise everything is
    rebuilt and the cache is rewritten.

    Sets ``self.mapper_pipes`` (list of ``("PCA<k+1>", pipeline)`` tuples) and
    ``self.graphs`` (one fitted graph per pipeline, in the same order).

    :return: None
    """
    log.debug("--->creating mappers...")

    graphs_path = TEMP_DATA + "%s_firstsimplegap_graphs" % self.label
    pipes_path = TEMP_DATA + "%s_mapper_pipes" % self.label

    cache_available = os.path.exists(graphs_path) and os.path.exists(pipes_path)
    if not self.remake and cache_available:
        # NOTE(review): pickle.load can execute arbitrary code; this is only
        # safe as long as TEMP_DATA contains files written by this program.
        with open(graphs_path, "rb") as graphs_file:
            self.graphs = pickle.load(graphs_file)
        with open(pipes_path, "rb") as pipes_file:
            self.mapper_pipes = pickle.load(pipes_file)
        return

    clusterer = FirstSimpleGap()
    # Loop-invariant: pipelines are verbose only when debug logging is on.
    verbose = log.getEffectiveLevel() == logging.DEBUG
    self.mapper_pipes = []

    log.debug("------> creating projection components...")
    for k in range(self.n_components):
        log.debug("---------> on component {}/{}...".format(k + 1, self.n_components))
        proj = Projection(columns=k)
        filter_func = Pipeline(steps=[('pca', self.rep), ('proj', proj)])
        # The cover is fitted on the filter values of the full data set
        # before being handed to the Mapper pipeline.
        filtered_data = filter_func.fit_transform(self.data)
        cover = OneDimensionalCover(n_intervals=self.n_intervals,
                                    overlap_frac=self.overlap_frac,
                                    kind='balanced')
        cover.fit(filtered_data)
        mapper_pipe = make_mapper_pipeline(scaler=None,
                                           filter_func=filter_func,
                                           cover=cover,
                                           clusterer=clusterer,
                                           verbose=verbose,
                                           n_jobs=1)
        mapper_pipe.set_params(filter_func__proj__columns=k)
        self.mapper_pipes.append(("PCA%d" % (k + 1), mapper_pipe))

    # The pipelines are fitted sequentially. A thread-based joblib
    # ``Parallel`` variant of this step was previously left here as
    # commented-out code and is not in use.
    log.debug("------> entering parallelization...")
    self.graphs = [pipe.fit_transform(self.data) for _, pipe in self.mapper_pipes]

    # Context managers guarantee the cache files are flushed and closed even
    # if pickling raises.
    with open(graphs_path, "wb") as graphs_file:
        pickle.dump(self.graphs, graphs_file)
    with open(pipes_path, "wb") as pipes_file:
        pickle.dump(self.mapper_pipes, pipes_file)
def test_contract_nodes():
    """Test that, on a pathological dataset, we generate a graph without edges
    when `contract_nodes` is set to True and with edges when it is set to
    False.

    The dataset is built from ``make_circles`` by removing, for every pair of
    consecutive cover sets, the samples whose filter value lies within ``gap``
    of the minimum or of the maximum filter value found in the intersection
    of the two sets.
    """
    X = make_circles(n_samples=2000)[0]

    filter_func = Projection()
    cover = OneDimensionalCover(n_intervals=5, overlap_frac=0.4)
    p = filter_func.fit_transform(X)
    m = cover.fit_transform(p)

    gap = 0.1
    # A set gives O(1) membership tests when filtering the rows below and
    # absorbs indices that are selected more than once.
    idx_to_remove = set()
    for i in range(m.shape[1] - 1):
        # Samples belonging to both cover set i and cover set i + 1.
        inters = np.logical_and(m[:, i], m[:, i + 1])
        inters_idx = np.flatnonzero(inters)
        p_inters = p[inters_idx]
        min_p, max_p = np.min(p_inters), np.max(p_inters)
        # Drop a band of width ``gap`` at each end of the overlap region.
        idx_to_remove.update(
            np.flatnonzero((min_p <= p) & (p <= min_p + gap)).tolist())
        idx_to_remove.update(
            np.flatnonzero((max_p - gap <= p) & (p <= max_p)).tolist())

    X_f = X[[x for x in range(len(X)) if x not in idx_to_remove]]

    clusterer = DBSCAN(eps=0.05)
    pipe = make_mapper_pipeline(filter_func=filter_func,
                                cover=cover,
                                clusterer=clusterer,
                                contract_nodes=True)
    graph = pipe.fit_transform(X_f)
    assert not len(graph.es)

    pipe.set_params(contract_nodes=False)
    graph = pipe.fit_transform(X_f)
    assert len(graph.es)
def test_projection_values_equal_slice(X):
    """Check that ``Projection`` returns the requested columns of ``X``.

    A random number (at least one) of random column indices is drawn, and the
    transformer output is compared with plain NumPy column indexing.
    """
    n_features = X.shape[1]
    # The size is drawn first, then the indices, so the random stream is
    # consumed in a fixed order.
    n_selected = 1 + np.random.randint(n_features)
    selected = np.random.choice(n_features, n_selected)

    projected = Projection(columns=selected).fit_transform(X)
    expected = X[:, selected]
    assert_almost_equal(projected, expected)
def main():
    """Run PCA on the flattened image arrays and write three figures.

    Steps:

    1. Load the patient groups (CN, MCI, AD) from the diagnosis JSON and the
       matching image arrays from the ``patch_92`` directory.
    2. Fit a 440-component PCA and save the explained-variance plot as
       ``elbow_plot.png``.
    3. Project onto 3 PCA components, append the one-hot diagnosis labels as
       extra columns, and write a static Mapper graph (filtered on the first
       two columns) as ``mapper_2_dimensional_reduction.html``.
    4. Write a 3D scatter plot of the three PCA columns, coloured by column 3
       (the first one-hot label column), as ``scatterplot_pca_3d.html``.

    Input and output locations are read from ``DOTENV_KEY2VAL``.
    """
    data_dir = DOTENV_KEY2VAL["DATA_DIR"]
    patch_dir = data_dir + "/patch_92/"
    diagnosis_file = "collected_diagnoses_complete.json"

    groups = utils.get_earliest_available_diagnosis(data_dir + diagnosis_file)
    cn_patients, mci_patients, ad_patients = groups
    raw_images = utils.get_arrays_from_dir(
        patch_dir, cn_patients + mci_patients + ad_patients
    )

    # Diagnosis codes: 1 = CN, 2 = MCI, 3 = AD.
    # Subtracting one due to unfound MRI for one CN patient.
    cn_codes = [1] * (len(cn_patients) - 1)
    mci_codes = [2] * len(mci_patients)
    ad_codes = [3] * len(ad_patients)
    diags = np.array(cn_codes + mci_codes + ad_codes).reshape(-1, 1)

    encoder = OneHotEncoder()
    encoded = encoder.fit_transform(diags)
    labels = encoded.toarray()

    # One flattened row per image.
    images_all = np.asarray([image.flatten() for image in raw_images])

    # Elbow plot of the explained variance ratio for a wide PCA.
    wide_pca = PCA(n_components=440)
    wide_pca.fit(images_all)

    _, ax0 = plt.subplots(nrows=1, sharex=True, figsize=(6, 6))
    component_numbers = np.arange(1, wide_pca.n_components_ + 1)
    ax0.plot(
        component_numbers,
        wide_pca.explained_variance_ratio_,
        "+",
        linewidth=2,
    )
    ax0.set_ylabel("PCA explained variance ratio")
    ax0.legend(prop=dict(size=12))
    figures_dir = DOTENV_KEY2VAL["GEN_FIGURES_DIR"]
    plt.savefig(figures_dir + "elbow_plot.png")

    # Low-dimensional projection; the label columns are appended so they are
    # available as extra data columns in the plots below.
    n_components = 3
    narrow_pca = PCA(n_components=n_components)
    projected = narrow_pca.fit_transform(images_all)
    projected = np.concatenate((projected, labels), axis=1)

    mapper_pipeline = make_mapper_pipeline(
        filter_func=Projection(columns=list(range(2))),
        cover=CubicalCover(n_intervals=10, overlap_frac=0.25),
        clusterer=DBSCAN(eps=0.5, min_samples=5),
        verbose=True,
        n_jobs=4,
    )

    plotly_params = {"node_trace": {"marker_colorscale": "Blues"}}
    mapper_fig = plot_static_mapper_graph(
        mapper_pipeline,
        projected,
        layout_dim=3,
        color_by_columns_dropdown=True,
        plotly_params=plotly_params,
    )
    mapper_fig.write_html(figures_dir + "mapper_2_dimensional_reduction.html")

    projected_df = pd.DataFrame(projected)
    scatter_fig = px.scatter_3d(
        projected_df,
        x=0,
        y=1,
        z=2,
        color=3,
        title="3D scatterplot of the PCA of the image data",
    )
    scatter_fig.write_html(figures_dir + "scatterplot_pca_3d.html")