Exemple #1
0
 def test_regspace_nthreads(self):
     """Fitting with one job and with two jobs must give identical centers."""
     # Reference run: the fixture estimator, fitted with n_jobs=1.
     self.clustering.fit(self.src, n_jobs=1)

     # Comparison run: a fresh estimator configured with n_jobs=2.
     two_job_estimator = RegularSpace(dmin=self.dmin, n_jobs=2)
     two_job_model = two_job_estimator.fit(self.src).fetch_model()

     single_job_model = self.clustering.fetch_model()
     np.testing.assert_equal(single_job_model.cluster_centers,
                             two_job_model.cluster_centers)
Exemple #2
0
def test_one_cluster_edgecase(algo, dim):
    """Check that an estimator limited to one center handles two data points.

    Two random points of dimension ``dim`` are clustered with the estimator
    selected by ``algo`` ('kmeans', 'regspace' or 'minibatch-kmeans'). The
    resulting model must report a single cluster of dimension ``dim`` and
    assign both points to state 0. Any other ``algo`` value fails the test.
    """
    data = np.random.uniform(size=(2, dim))

    # Estimators are wrapped in callables so that only the requested one is
    # ever constructed.
    builders = (
        ('kmeans', lambda: KMeans(n_clusters=1)),
        ('regspace', lambda: RegularSpace(dmin=10, max_centers=1)),
        ('minibatch-kmeans', lambda: MiniBatchKMeans(n_clusters=1)),
    )
    build = next((make for name, make in builders if algo == name), None)

    clustering = None
    if build is None:
        pytest.fail()
    else:
        clustering = build().fit(data).fetch_model()

    assert clustering is not None
    assert_equal(clustering.n_clusters, 1)
    assert_equal(clustering.dim, dim)

    dtraj = clustering.transform(data)
    assert_equal(len(dtraj), 2)
    # With a single center every frame has to map to state 0.
    for frame in (0, 1):
        assert_equal(dtraj[frame], 0)
Exemple #3
0
    def test_properties(self):
        """Constructor arguments are readable as properties and validated on set."""
        params = {
            'dmin': 1e-8,
            'max_centers': 500,
            'metric': 'euclidean',
            'n_jobs': 5,
        }
        est = RegularSpace(**params)
        check = np.testing.assert_equal

        check(est.dmin, 1e-8)
        check(est.max_centers, 500)
        check(est.n_clusters, 500)

        # n_clusters and max_centers are aliases: writing one updates the other.
        est.n_clusters = 30
        check(est.max_centers, 30)
        check(est.metric, 'euclidean')
        check(est.n_jobs, 5)

        # Each of these assignments has to be rejected with a ValueError.
        rejected = (
            ('dmin', -.5),        # negative, invalid!
            ('metric', 'bogus'),
            ('max_centers', 0),   # must be positive
        )
        for attribute, bad_value in rejected:
            with np.testing.assert_raises(ValueError):
                setattr(est, attribute, bad_value)
Exemple #4
0
 def setUp(self):
     """Create the estimator and the random input used by the tests."""
     min_distance = 0.3
     self.dmin = min_distance
     self.clustering = RegularSpace(dmin=min_distance)
     # 1000 three-dimensional samples, uniform on [0, 1) per coordinate.
     self.src = np.random.uniform(size=(1000, 3))
Exemple #5
0
class TestRegSpaceClustering(unittest.TestCase):
    """Tests for the ``RegularSpace`` clustering estimator."""

    def setUp(self):
        """Create the estimator and the random input used by the tests."""
        min_distance = 0.3
        self.dmin = min_distance
        self.clustering = RegularSpace(dmin=min_distance)
        # 1000 three-dimensional samples, uniform on [0, 1) per coordinate.
        self.src = np.random.uniform(size=(1000, 3))

    def test_algorithm(self):
        """Every pair of distinct centers is at least ``dmin`` apart."""
        centers = self.clustering.fit(self.src).fetch_model().cluster_centers

        for first, second in itertools.combinations(centers, 2):
            # Numerically identical centers are not compared.
            if np.allclose(first, second):
                continue

            separation = np.linalg.norm(first - second, 2)
            failure_text = ("centroid pair\n%s\n%s\n has smaller"
                            " distance than dmin(%f): %f"
                            % (first, second, self.dmin, separation))
            self.assertGreaterEqual(separation, self.dmin, failure_text)

    def test_assignment(self):
        """Assigning the training data visits every cluster center."""
        model = self.clustering.fit(self.src).fetch_model()
        assert len(model.cluster_centers) > 1

        dtraj = model.transform(self.src)
        visited_states = len(np.unique(dtraj))
        available_states = len(model.cluster_centers)

        # The number of visited states has to match the number of centers.
        self.assertEqual(
            visited_states, available_states,
            "number of unique states in dtrajs should be equal.")

        # Assigning data that was not used for fitting must work as well.
        model.transform(np.random.random((1000, 3)))

    def test_spread_data(self):
        """Smoke test: fit widely spread data with a large ``dmin``."""
        wide_samples = np.random.uniform(low=-2, high=2, size=(1000, 3))
        self.clustering.dmin = 2
        self.clustering.fit(wide_samples)

    def test1d_data(self):
        """Smoke test: fitting a one-dimensional array must not raise."""
        samples = np.random.random(100)
        estimator = RegularSpace(dmin=0.3)
        estimator.fit(samples)

    def test_non_existent_metric(self):
        """Setting an unknown metric name raises ``ValueError``."""
        with self.assertRaises(ValueError):
            setattr(self.clustering, 'metric', "non_existent_metric")

    def test_too_small_dmin_should_warn(self):
        """Hitting ``max_centers`` emits exactly one warning and keeps the centers."""
        import warnings

        max_centers = 50
        self.clustering.dmin = 1e-8
        self.clustering.max_centers = max_centers

        with warnings.catch_warnings(record=True) as caught:
            # Make sure no warning is suppressed by an existing filter.
            warnings.simplefilter("always")
            # The fit is expected to trigger the warning.
            self.clustering.fit(self.src)

            assert caught
            assert len(caught) == 1
            found_centers = self.clustering.fetch_model().cluster_centers
            assert len(found_centers) == max_centers

    def test_regspace_nthreads(self):
        """Fitting with one job and with two jobs must give identical centers."""
        # Reference run: the fixture estimator, fitted with n_jobs=1.
        self.clustering.fit(self.src, n_jobs=1)

        # Comparison run: a fresh estimator configured with n_jobs=2.
        two_job_estimator = RegularSpace(dmin=self.dmin, n_jobs=2)
        two_job_model = two_job_estimator.fit(self.src).fetch_model()

        single_job_model = self.clustering.fetch_model()
        np.testing.assert_equal(single_job_model.cluster_centers,
                                two_job_model.cluster_centers)

    def test_properties(self):
        """Constructor arguments are readable as properties and validated on set."""
        params = {
            'dmin': 1e-8,
            'max_centers': 500,
            'metric': 'euclidean',
            'n_jobs': 5,
        }
        est = RegularSpace(**params)
        check = np.testing.assert_equal

        check(est.dmin, 1e-8)
        check(est.max_centers, 500)
        check(est.n_clusters, 500)

        # n_clusters and max_centers are aliases: writing one updates the other.
        est.n_clusters = 30
        check(est.max_centers, 30)
        check(est.metric, 'euclidean')
        check(est.n_jobs, 5)

        # Each of these assignments has to be rejected with a ValueError.
        rejected = (
            ('dmin', -.5),        # negative, invalid!
            ('metric', 'bogus'),
            ('max_centers', 0),   # must be positive
        )
        for attribute, bad_value in rejected:
            with np.testing.assert_raises(ValueError):
                setattr(est, attribute, bad_value)
Exemple #6
0
 def test1d_data(self):
     """Smoke test: fitting a one-dimensional array must not raise."""
     samples = np.random.random(100)
     estimator = RegularSpace(dmin=0.3)
     estimator.fit(samples)
Exemple #7
0
    def _estimate(self, iterable, **kwargs):
        """Determine regular-space cluster centers from chunked input data.

        A ``RegularSpace`` estimator is configured from this object's
        ``dmin``, ``max_centers``, ``metric`` and ``n_jobs`` and is fed the
        data chunk by chunk through ``partial_fit``. The centers found are
        stored on this object in the ``finally`` clause, i.e. also when an
        exception propagates out of this method.

        Parameters
        ----------
        iterable
            Data source; it has to provide ``iterator(...)`` and ``ndim``.
        **kwargs
            Accepted but not used inside this method.

        Returns
        -------
        self

        Raises
        ------
        NotConvergedWarning
            If an exception whose class name contains
            'MaxCentersReachedException' is raised while fitting. The message
            states the percentage of frames processed before that happened.
        """
        ########
        # Calculate clustercenters:
        # 1. choose first datapoint as centroid
        # 2. for all X: calc distances to all clustercenters
        # 3. add new centroid, if min(distance to all other clustercenters) >= dmin
        ########
        # number of frames contained in the chunks that were fitted completely
        used_frames = 0
        regspace = RegularSpace(dmin=self.dmin,
                                max_centers=self.max_centers,
                                metric=self.metric,
                                n_jobs=self.n_jobs)
        it = iterable.iterator(return_trajindex=False,
                               stride=self.stride,
                               chunk=self.chunksize,
                               skip=self.skip)
        try:
            with it:
                for X in it:
                    # Cast each chunk to C-ordered float32; copy=False avoids
                    # a copy when the chunk already has that layout and dtype.
                    regspace.partial_fit(X.astype(np.float32,
                                                  order='C',
                                                  copy=False),
                                         n_jobs=self.n_jobs)
                    used_frames += len(X)
            # All chunks were consumed without an exception.
            self._converged = True
        except Exception as e:
            # The exception is recognized by its class name only (see the todo
            # in the else branch below).
            if 'MaxCentersReachedException' in e.__class__.__name__:
                self._converged = False
                msg = 'Maximum number of cluster centers reached.' \
                      ' Consider increasing max_centers or choose' \
                      ' a larger minimum distance, dmin.'
                # Report through the logger and through the warnings module.
                self.logger.warning(msg)
                warnings.warn(msg)
                # pass amount of processed data
                used_data = used_frames / float(it.n_frames_total()) * 100.0
                raise NotConvergedWarning("Used data for centers: %.2f%%" %
                                          used_data)
            else:
                # todo ugly workaround until maxcentersreached is placed not within metric subpackage but globally
                #  somewhere
                raise
        finally:
            # even if not converged, we store the found centers.
            model = regspace.fetch_model()
            # Normalize the centers to a 2-D array with iterable.ndim columns.
            clustercenters = model.cluster_centers.squeeze().reshape(
                -1, iterable.ndim)
            self._inst = ClusterModel(clustercenters, metric=self.metric)
            from types import MethodType

            # Adapter bound to the model as ``assign``: the second positional
            # argument is ignored and the call is forwarded to ``transform``.
            def _assign(self, data, _, n_jobs):
                out = self.transform(data, n_jobs=n_jobs)
                return out

            self._inst.assign = MethodType(_assign, self._inst)
            self.update_model_params(clustercenters=clustercenters,
                                     n_clusters=len(clustercenters))

            if len(clustercenters) == 1:
                self.logger.warning('Have found only one center according to '
                                    'minimum distance requirement of %f' %
                                    self.dmin)

        return self
Exemple #8
0
        KMeans(
            n_clusters=100,  # place 100 cluster centers
            init_strategy='uniform',  # uniform initialization strategy
            fixed_seed=13,
            n_jobs=8)),
    (
        'k-Means with k-means++ initialization',
        KMeans(
            n_clusters=100,  # place 100 cluster centers
            init_strategy='kmeans++',  # k-means++ initialization strategy
            fixed_seed=13,
            n_jobs=8)),
    (
        'Regular space clustering',
        RegularSpace(
            dmin=3,  # minimum distance between cluster centers
            max_centers=300,  # maximum number of cluster centers
            n_jobs=8))
]

f, axes = plt.subplots(3, 2, figsize=(15, 15))

for i, (label, estimator) in enumerate(estimators):
    clustering = estimator.fit(samples).fetch_model()
    ax1 = axes[i][0]
    ax2 = axes[i][1]

    ax1.hexbin(*samples.T, bins='log')
    ax1.scatter(*clustering.cluster_centers.T, marker='o', c='m')
    ax1.axis('off')
    ax1.set_title(label + ': cluster centers')