Example #1
0
def test_list_feature_union_transform(X):
    """A ``ListFeatureUnion`` of single-column projections, once its outputs
    are concatenated along axis 1, must match a single ``Projection`` onto
    the same list of columns."""
    list_dim = [0, 1]

    # One named single-column projection per requested dimension.
    named_projections = []
    for dim in list_dim:
        named_projections.append(("proj" + str(dim), Projection(columns=dim)))
    union = ListFeatureUnion(named_projections)
    joint = Projection(columns=list_dim)

    joint.fit(X)
    union.fit(X)

    expected = joint.transform(X)
    stacked = np.concatenate(union.transform(X), axis=1)

    assert_almost_equal(expected, stacked)
Example #2
0
    def _runMapper(self):
        """
        Build one mapper graph per projected component of ``self.data``.

        When ``self.remake`` is false and both cache files for ``self.label``
        exist under ``TEMP_DATA``, ``self.graphs`` and ``self.mapper_pipes``
        are loaded from them and nothing is recomputed. Otherwise one mapper
        pipeline is built and fitted for each ``k`` in
        ``range(self.n_components)``, and both results are pickled back to
        the same two files.

        ``self.mapper_pipes`` is a list of ``("PCA<k+1>", pipeline)`` tuples;
        ``self.graphs`` holds the output of each pipeline's ``fit_transform``
        in the same order.

        :return: None
        """
        graphs_path = TEMP_DATA + "%s_firstsimplegap_graphs" % self.label
        pipes_path = TEMP_DATA + "%s_mapper_pipes" % self.label

        log.debug("--->creating mappers...")
        # Only trust the cache when BOTH files are present; a lone graphs
        # file would otherwise fail on the second open instead of rebuilding.
        cache_ready = os.path.exists(graphs_path) and os.path.exists(pipes_path)
        if not self.remake and cache_ready:
            # NOTE(security): unpickling executes arbitrary code. These files
            # must only ever be ones written by this method on a trusted host.
            with open(graphs_path, "rb") as fgin:
                self.graphs = pickle.load(fgin)
            with open(pipes_path, "rb") as fpin:
                self.mapper_pipes = pickle.load(fpin)
            return

        clusterer = FirstSimpleGap()
        # Pipelines are verbose only when the logger is exactly at DEBUG.
        verbose = log.getEffectiveLevel() == logging.DEBUG
        self.mapper_pipes = []

        log.debug("------> creating projection components...")

        for k in range(self.n_components):
            log.debug("---------> on component %s/%s...", k + 1, self.n_components)
            # Filter = the ``self.rep`` step followed by selection of column k.
            proj = Projection(columns=k)
            filter_func = Pipeline(steps=[('pca', self.rep), ('proj', proj)])
            filtered_data = filter_func.fit_transform(self.data)
            # The cover is fitted up front on the filtered values of this
            # component before being handed to the mapper pipeline.
            cover = OneDimensionalCover(n_intervals=self.n_intervals,
                                        overlap_frac=self.overlap_frac,
                                        kind='balanced')
            cover.fit(filtered_data)
            mapper_pipe = make_mapper_pipeline(scaler=None,
                                               filter_func=filter_func,
                                               cover=cover,
                                               clusterer=clusterer,
                                               verbose=verbose,
                                               n_jobs=1)
            mapper_pipe.set_params(filter_func__proj__columns=k)
            self.mapper_pipes.append(("PCA%d" % (k + 1), mapper_pipe))

        # Despite the log message below, the pipelines are fitted one after
        # another in this process; no parallel backend is used.
        log.debug("------> entering parallelization...")

        self.graphs = [pipe.fit_transform(self.data)
                       for _, pipe in self.mapper_pipes]

        # Context managers guarantee the handles are closed even if pickling
        # raises part-way through.
        with open(graphs_path, "wb") as fg:
            pickle.dump(self.graphs, fg)

        with open(pipes_path, "wb") as fp:
            pickle.dump(self.mapper_pipes, fp)
Example #3
0
def test_contract_nodes():
    """Test that, on a pathological dataset, we generate a graph without edges
    when `contract_nodes` is set to True and with edges when it is set to
    False."""
    X = make_circles(n_samples=2000)[0]

    filter_func = Projection()
    cover = OneDimensionalCover(n_intervals=5, overlap_frac=0.4)
    p = filter_func.fit_transform(X)
    m = cover.fit_transform(p)

    # For each pair of adjacent columns of the cover output ``m``, look at the
    # rows flagged in both columns and mark for removal every point whose
    # filter value lies within ``gap`` of the lowest or highest filter value
    # found among those shared rows.
    gap = 0.1
    # A set keeps the membership test below O(1) per index.
    idx_to_remove = set()
    for i in range(m.shape[1] - 1):
        inters = np.logical_and(m[:, i], m[:, i + 1])
        inters_idx = np.flatnonzero(inters)
        p_inters = p[inters_idx]
        min_p, max_p = np.min(p_inters), np.max(p_inters)
        idx_to_remove.update(
            np.flatnonzero((min_p <= p) & (p <= min_p + gap)).tolist())
        idx_to_remove.update(
            np.flatnonzero((max_p - gap <= p) & (p <= max_p)).tolist())

    X_f = X[[x for x in range(len(X)) if x not in idx_to_remove]]

    clusterer = DBSCAN(eps=0.05)
    pipe = make_mapper_pipeline(filter_func=filter_func,
                                cover=cover,
                                clusterer=clusterer,
                                contract_nodes=True)
    # With node contraction enabled, the resulting graph must have no edges.
    graph = pipe.fit_transform(X_f)
    assert not len(graph.es)

    # Same pipeline and data without contraction: edges must be present.
    pipe.set_params(contract_nodes=False)
    graph = pipe.fit_transform(X_f)
    assert len(graph.es)
Example #4
0
def test_projection_values_equal_slice(X):
    """``Projection`` onto randomly drawn columns must return the same
    values as slicing ``X`` on those columns."""
    n_features = X.shape[1]
    # Draw between 1 and n_features column indices.
    n_drawn = 1 + np.random.randint(n_features)
    columns = np.random.choice(n_features, n_drawn)

    projected = Projection(columns=columns).fit_transform(X)
    expected = X[:, columns]

    assert_almost_equal(projected, expected)
Example #5
0
def main():
    """Run PCA on the flattened patch images and write three figures.

    Everything is written under ``DOTENV_KEY2VAL["GEN_FIGURES_DIR"]``:

    * ``elbow_plot.png`` - explained variance ratio of each PCA component;
    * ``mapper_2_dimensional_reduction.html`` - static mapper graph built on
      the first two columns of the 3-component PCA projection;
    * ``scatterplot_pca_3d.html`` - 3D scatter plot of the three PCA
      components, coloured by column 3 of the projected table.

    Input images and the diagnosis JSON are read from
    ``DOTENV_KEY2VAL["DATA_DIR"]``.
    """
    directory = DOTENV_KEY2VAL["DATA_DIR"]
    image_dir = directory + "/patch_92/"
    diagnosis_json = "collected_diagnoses_complete.json"

    (
        cn_patients,
        mci_patients,
        ad_patients,
    ) = utils.get_earliest_available_diagnosis(directory + diagnosis_json)
    images_all = utils.get_arrays_from_dir(
        image_dir, cn_patients + mci_patients + ad_patients)

    # Integer diagnosis codes: 1 = CN, 2 = MCI, 3 = AD.
    # Subtracting one due to unfound MRI for one CN patient.
    cn_patient_list = [1] * (len(cn_patients) - 1)
    mci_patient_list = [2] * len(mci_patients)
    ad_patient_list = [3] * len(ad_patients)

    diags = np.array(cn_patient_list + mci_patient_list +
                     ad_patient_list).reshape(-1, 1)
    ohe = OneHotEncoder()
    labels = ohe.fit_transform(diags).toarray()

    # One flattened row per image.
    images_all = np.asarray([image.flatten() for image in images_all])

    # PCA rejects n_components larger than min(n_samples, n_features), so cap
    # the requested 440 components by the actual data dimensions.
    pca = PCA(n_components=min(440, *images_all.shape))
    pca.fit(images_all)

    fig, ax0 = plt.subplots(nrows=1, sharex=True, figsize=(6, 6))
    ax0.plot(
        np.arange(1, pca.n_components_ + 1),
        pca.explained_variance_ratio_,
        "+",
        linewidth=2,
        # A labelled artist is required for the legend below to show anything.
        label="explained variance ratio",
    )
    ax0.set_ylabel("PCA explained variance ratio")
    ax0.legend(prop=dict(size=12))
    plt.savefig(DOTENV_KEY2VAL["GEN_FIGURES_DIR"] + "elbow_plot.png")
    # Release the matplotlib figure once it has been written to disk.
    plt.close(fig)

    n_components = 3
    pca = PCA(n_components=n_components)
    images_all_projected = pca.fit_transform(images_all)

    # Columns 0-2 are the PCA components; the one-hot label columns follow.
    images_all_projected = np.append(images_all_projected, labels, axis=1)

    mapper_pipeline = make_mapper_pipeline(
        filter_func=Projection(columns=list(range(2))),
        cover=CubicalCover(n_intervals=10, overlap_frac=0.25),
        clusterer=DBSCAN(eps=0.5, min_samples=5),
        verbose=True,
        n_jobs=4,
    )
    plotly_params = {"node_trace": {"marker_colorscale": "Blues"}}
    fig = plot_static_mapper_graph(
        mapper_pipeline,
        images_all_projected,
        layout_dim=3,
        color_by_columns_dropdown=True,
        plotly_params=plotly_params,
    )

    fig.write_html(DOTENV_KEY2VAL["GEN_FIGURES_DIR"] +
                   "mapper_2_dimensional_reduction.html")

    images_all_projected = pd.DataFrame(images_all_projected)
    fig = px.scatter_3d(
        images_all_projected,
        x=0,
        y=1,
        z=2,
        # Column 3 is the first of the appended one-hot label columns.
        color=3,
        title="3D scatterplot of the PCA of the image data",
    )
    fig.write_html(DOTENV_KEY2VAL["GEN_FIGURES_DIR"] +
                   "scatterplot_pca_3d.html")