Exemple #1
0
def test_on_trivial_input(inp):
    """Check that both gap-based clusterers report the expected number of
    clusters on the trivial input.

    ``inp`` unpacks to ``(n_points_per_cluster, n_clusters, dim, pts)``; only
    the expected cluster count and the points themselves are used here.
    """
    _, expected_n_clusters, _, points = inp

    # Run the identical fit-then-compare check on each estimator in turn.
    for estimator_cls in (FirstSimpleGap, FirstHistogramGap):
        fitted = estimator_cls().fit(points)
        assert fitted.n_clusters_ == expected_n_clusters
Exemple #2
0
def test_max_fraction_clusters(inp, max_frac):
    """Check that ``FirstSimpleGap`` and ``FirstHistogramGap`` respect the
    ``max_fraction`` constraint, if it is set.

    After fitting, ``n_clusters_`` must not exceed
    ``max_frac * n_points_per_cluster * n_clusters`` rounded down.
    """
    n_points_per_cluster, n_clusters, _, pts = inp
    # Keep the left-to-right multiplication order so float rounding matches.
    upper_bound = np.floor(max_frac * n_points_per_cluster * n_clusters)

    for estimator_cls in (FirstSimpleGap, FirstHistogramGap):
        clusterer = estimator_cls(max_fraction=max_frac)
        clusterer.fit_predict(pts)
        assert clusterer.n_clusters_ <= upper_bound
Exemple #3
0
def test_firstsimplegap(inp):
    """Check ``FirstSimpleGap`` on a multimodal point cloud.

    With the parameters below, the predicted labels must contain exactly
    ``n_clusters`` distinct values, and every label must occur
    ``n_points_per_cluster`` times.
    """
    n_points_per_cluster, n_clusters, _, pts = inp

    clusterer_kwargs = dict(
        relative_gap_size=0.5,
        max_fraction=1.,
        affinity='euclidean',
        memory=None,
        linkage='single',
    )
    labels = FirstSimpleGap(**clusterer_kwargs).fit_predict(pts).astype(int)
    found_labels, label_counts = np.unique(labels, return_counts=True)

    # The number of distinct labels must match the number of synthetic
    # clusters.
    assert len(found_labels) == n_clusters
    # Each cluster must contain the expected number of points.
    assert_almost_equal(label_counts, n_points_per_cluster)
def test_pipeline_cloned(X, clone_pipeline, layout_dim,
                         color_by_columns_dropdown):
    """Verify that interacting with the widgets changes the original pipeline
    if and only if `clone_pipeline` is False (with `layout_dim` set to 2
    or 3).

    Each parameter starts at its "initial" value; the matching widgets are
    then set to the "new" value and the pipeline's mapper params are read
    back. The original pipeline must still hold the initial value when it was
    cloned, and the new value otherwise.
    """
    # TODO: Monitor development of the ipytest project to convert these into
    # true notebook tests integrated with pytest
    cover_params = {
        "initial": {"n_intervals": 10, "kind": "uniform",
                    "overlap_frac": 0.1},
        "new": {"n_intervals": 15, "kind": "balanced", "overlap_frac": 0.2},
    }
    clusterer_params = {
        "initial": {"affinity": "euclidean"},
        "new": {"affinity": "manhattan"},
    }
    params = {
        "cover": cover_params,
        "clusterer": clusterer_params,
        "contract_nodes": {"initial": True, "new": False},
        "min_intersection": {"initial": 4, "new": 1},
    }

    pipe = make_mapper_pipeline(
        cover=CubicalCover(**cover_params["initial"]),
        clusterer=FirstSimpleGap(**clusterer_params["initial"]),
        contract_nodes=params["contract_nodes"]["initial"],
        min_intersection=params["min_intersection"]["initial"],
    )
    fig = plot_interactive_mapper_graph(
        pipe, X, clone_pipeline=clone_pipeline, layout_dim=layout_dim,
        color_by_columns_dropdown=color_by_columns_dropdown,
    )

    def _set_and_check(description, mapper_key, initial_value, new_value):
        # Push the new value into every widget carrying this description,
        # then compare what the original pipeline now reports.
        for widget in _get_widgets_by_trait(fig, "description", description):
            widget.set_state({'value': new_value})
        actual = pipe.get_mapper_params()[mapper_key]
        expected = initial_value if clone_pipeline else new_value
        assert actual == expected

    for step, values in params.items():
        if step not in ("cover", "clusterer"):
            # Top-level pipeline parameter: widget description and mapper
            # param key are both the step name itself.
            _set_and_check(step, f"{step}", values["initial"], values["new"])
            continue

        # Nested estimator parameters are addressed as "<step>__<param>".
        for param_name, initial_value in values["initial"].items():
            _set_and_check(param_name, f"{step}__{param_name}",
                           initial_value, values["new"][param_name])
Exemple #5
0
    def _runMapper(self):
        """
        creates mapper graphs based on train data

        One mapper pipeline is built per component index in
        ``range(self.n_components)``; each pipeline is fitted on ``self.data``
        and the resulting graphs are stored in ``self.graphs``. The
        ``(name, pipeline)`` pairs are stored in ``self.mapper_pipes``.

        Both results are cached as pickle files under ``TEMP_DATA``. Unless
        ``self.remake`` is set, an existing cache (both files present) is
        loaded instead of recomputing.

        :return: None
        """
        log.debug("--->creating mappers...")
        graphs_path = TEMP_DATA + "%s_firstsimplegap_graphs" % self.label
        pipes_path = TEMP_DATA + "%s_mapper_pipes" % self.label

        # Require both cache files: with only the graphs file present, the
        # second load would fail instead of falling back to a rebuild.
        if (not self.remake and os.path.exists(graphs_path)
                and os.path.exists(pipes_path)):
            # NOTE(review): pickle.load can execute arbitrary code; this is
            # only safe as long as TEMP_DATA holds files written by this
            # program.
            with open(graphs_path, "rb") as fgin:
                self.graphs = pickle.load(fgin)
            with open(pipes_path, "rb") as fpin:
                self.mapper_pipes = pickle.load(fpin)
            return

        clusterer = FirstSimpleGap()
        self.mapper_pipes = []

        log.debug("------> creating projection components...")

        for k in range(self.n_components):
            log.debug("---------> on component {}/{}...".format(k + 1, self.n_components))
            # Filter function: apply self.rep, then keep only column k.
            proj = Projection(columns=k)
            filter_func = Pipeline(steps=[('pca', self.rep), ('proj', proj)])
            filtered_data = filter_func.fit_transform(self.data)
            cover = OneDimensionalCover(n_intervals=self.n_intervals, overlap_frac=self.overlap_frac, kind='balanced')
            cover.fit(filtered_data)
            mapper_pipe = make_mapper_pipeline(scaler=None,
                                               filter_func=filter_func,
                                               cover=cover,
                                               clusterer=clusterer,
                                               verbose=(log.getEffectiveLevel() == logging.DEBUG),
                                               n_jobs=1)
            mapper_pipe.set_params(filter_func__proj__columns=k)
            self.mapper_pipes.append(("PCA%d" % (k + 1), mapper_pipe))

        # The pipelines are fitted one after another; the thread-based
        # variant is kept below, disabled.
        log.debug("------> entering parallelization...")

        self.graphs = [mapper_pipe[1].fit_transform(self.data) for mapper_pipe in self.mapper_pipes]

        #
        # self.graphs = Parallel(n_jobs=5, prefer="threads")(
        #     delayed(mapper_pipe[1].fit_transform)(self.data) for mapper_pipe in self.mapper_pipes
        # )

        # Context managers close the cache files even if pickling fails.
        with open(graphs_path, "wb") as fg:
            pickle.dump(self.graphs, fg)

        with open(pipes_path, "wb") as fp:
            pickle.dump(self.mapper_pipes, fp)
    def test_cluster_sizes(self):
        """Verify that the sizes parsed from the static plot's node hovertext
        add up to the total number of elements held by the nodes of the
        fitted mapper graph."""
        pipe = make_mapper_pipeline(clusterer=FirstSimpleGap())
        fig = plot_static_mapper_graph(pipe, X_arr)

        # The node trace is the second entry of the figure's data.
        hovertexts = fig.data[1].hovertext
        displayed_total = sum([_get_size_from_hovertext(text)
                               for text in hovertexts])

        graph = pipe.fit_transform(X_arr)
        actual_total = sum([len(elements)
                            for elements in graph.vs['node_elements']])

        assert displayed_total == actual_total
Exemple #7
0
    def test_cluster_sizes(self):
        """Verify that the sizes parsed from the interactive plot's hovertext
        add up to the total number of elements held by the nodes of the
        fitted mapper graph."""
        pipe = make_mapper_pipeline(clusterer=FirstSimpleGap())
        warnings.simplefilter("ignore")
        fig = plot_interactive_mapper_graph(pipe, X)

        # The hovertext is read from the second entry of the '_data' state of
        # the widget found via the 'data' trait.
        scatter_state = self._get_widget_by_trait(fig, 'data').get_state()
        hovertexts = scatter_state['_data'][1]['hovertext']
        displayed_total = sum([self._get_size_from_hovertext(text)
                               for text in hovertexts])

        graph = pipe.fit_transform(X)
        node_elements = graph['node_metadata']['node_elements']
        actual_total = sum([len(elements) for elements in node_elements])

        assert displayed_total == actual_total