Example #1
0
class TestRegSpaceClustering(unittest.TestCase):
    """Tests for ``RegularSpaceClustering`` fed by a ``RandomDataSource``."""

    @classmethod
    def setUpClass(cls):
        """Seed NumPy's global RNG once for the whole test class."""
        super(TestRegSpaceClustering, cls).setUpClass()
        np.random.seed(0)

    def setUp(self):
        """Create a fresh estimator with ``dmin=0.3`` and a random source."""
        self.dmin = 0.3
        self.clustering = RegularSpaceClustering(dmin=self.dmin)
        self.clustering.data_producer = RandomDataSource()

    def testAlgo(self):
        """Distinct cluster centers must be at least ``dmin`` apart."""
        self.clustering.parametrize()

        # correct type of dtrajs
        # (unittest assertion instead of a bare assert: not stripped by -O)
        self.assertTrue(types.is_int_array(self.clustering.dtrajs[0]))

        # assert distance for each centroid is at least dmin
        centers = self.clustering.clustercenters
        for first, second in itertools.combinations(centers, 2):
            if np.allclose(first, second):  # skip equal pairs
                continue

            dist = np.linalg.norm(first - second, 2)

            self.assertGreaterEqual(
                dist, self.dmin, "centroid pair\n%s\n%s\n has smaller"
                " distance than dmin(%f): %f"
                % (first, second, self.dmin, dist))

    def testAssignment(self):
        """Number of distinct dtraj states equals the number of centers."""
        self.clustering.parametrize()

        self.assertGreater(len(self.clustering.clustercenters), 1)

        # num states == num _clustercenters?
        self.assertEqual(
            len(np.unique(self.clustering.dtrajs)),
            len(self.clustering.clustercenters),
            "number of unique states in dtrajs"
            " should be equal.")

        # assigning fresh data to the estimated centers must not raise
        data_to_cluster = np.random.random((1000, 3))

        self.clustering.assign(data_to_cluster, stride=1)

    def testSpreadData(self):
        """Parametrization runs with ``RandomDataSource(a=-2, b=2)``, dmin=2."""
        self.clustering.data_producer = RandomDataSource(a=-2, b=2)
        self.clustering.dmin = 2
        self.clustering.parametrize()

    def test1d_data(self):
        """``cluster_regspace`` runs on a one-dimensional array."""
        data = np.random.random(100)
        cluster_regspace(data, dmin=0.3)
Example #2
0
class TestRegSpaceClustering(unittest.TestCase):
    """Tests for ``RegularSpaceClustering`` fed by a ``RandomDataSource``."""

    @classmethod
    def setUpClass(cls):
        """Seed NumPy's global RNG once for the whole test class."""
        super(TestRegSpaceClustering, cls).setUpClass()
        np.random.seed(0)

    def setUp(self):
        """Create a fresh estimator with ``dmin=0.3`` and a random source."""
        self.dmin = 0.3
        self.clustering = RegularSpaceClustering(dmin=self.dmin)
        self.clustering.data_producer = RandomDataSource()

    def testAlgo(self):
        """Distinct cluster centers must be at least ``dmin`` apart."""
        self.clustering.parametrize()

        # correct type of dtrajs
        # (unittest assertion instead of a bare assert: not stripped by -O)
        self.assertTrue(types.is_int_array(self.clustering.dtrajs[0]))

        # assert distance for each centroid is at least dmin
        centers = self.clustering.clustercenters
        for first, second in itertools.combinations(centers, 2):
            if np.allclose(first, second):  # skip equal pairs
                continue

            dist = np.linalg.norm(first - second, 2)

            self.assertGreaterEqual(dist, self.dmin,
                                    "centroid pair\n%s\n%s\n has smaller"
                                    " distance than dmin(%f): %f"
                                    % (first, second, self.dmin, dist))

    def testAssignment(self):
        """Number of distinct dtraj states equals the number of centers."""
        self.clustering.parametrize()

        self.assertGreater(len(self.clustering.clustercenters), 1)

        # num states == num _clustercenters?
        n_states = len(np.unique(self.clustering.dtrajs))
        n_centers = len(self.clustering.clustercenters)
        self.assertEqual(n_states, n_centers,
                         "number of unique states in dtrajs"
                         " should be equal.")

        # assigning fresh data to the estimated centers must not raise
        data_to_cluster = np.random.random((1000, 3))

        self.clustering.assign(data_to_cluster, stride=1)

    def testSpreadData(self):
        """Parametrization runs with ``RandomDataSource(a=-2, b=2)``, dmin=2."""
        self.clustering.data_producer = RandomDataSource(a=-2, b=2)
        self.clustering.dmin = 2
        self.clustering.parametrize()

    def test1d_data(self):
        """``cluster_regspace`` runs on a one-dimensional array."""
        data = np.random.random(100)
        cluster_regspace(data, dmin=0.3)
Example #3
0
    def fit(self, data):
        """ Cluster ``data`` and store the flattened labels in ``self.labels_``

        If ``self.radius`` is None, a ``KCenter`` estimator with
        ``self.n_clusters`` clusters is fitted to ``data`` first and the
        maximum of its ``distance`` attribute is stored as ``self.radius``.
        The clustering itself is delegated to PyEMMA's
        ``RegularSpaceClustering`` with ``dmin=self.radius``.

        Parameters
        ----------
        data: np.ndarray
                array of data points to cluster
        """
        if self.radius is None:
            # only n_clusters was given: estimate the radius from a KCenter fit
            from htmd.clustering.kcenters import KCenter
            radius_estimator = KCenter(n_clusters=self.n_clusters)
            radius_estimator.fit(data)
            self.radius = radius_estimator.distance.max()
            logger.info("Estimated radius = {}".format(self.radius))

        from pyemma.coordinates.clustering.regspace import RegularSpaceClustering
        regspace = RegularSpaceClustering(dmin=self.radius)
        self._reg = regspace  # kept so the fitted estimator stays reachable
        assignments = regspace.fit_transform(data)
        self.labels_ = assignments.flatten()
Example #4
0
class TestRegSpaceClustering(unittest.TestCase):
    """Tests for ``RegularSpaceClustering`` fed by a ``RandomDataSource``."""

    @classmethod
    def setUpClass(cls):
        """Seed NumPy's global RNG once for the whole test class."""
        super(TestRegSpaceClustering, cls).setUpClass()
        np.random.seed(0)

    def setUp(self):
        """Create a fresh estimator with ``dmin=0.3`` and a random source."""
        self.dmin = 0.3
        self.clustering = RegularSpaceClustering(dmin=self.dmin)
        self.clustering.data_producer = RandomDataSource()

    # NOTE: the former no-op tearDown (which only held commented-out profiling
    # code) was removed; the inherited TestCase.tearDown does nothing as well.

    def testAlgo(self):
        """Distinct cluster centers must be at least ``dmin`` apart."""
        self.clustering.parametrize()

        # dtrajs must be integer arrays
        # (unittest assertion instead of a bare assert: not stripped by -O)
        self.assertEqual(self.clustering.dtrajs[0].dtype, int)

        # assert distance for each centroid is at least dmin
        centers = self.clustering.clustercenters
        for first, second in itertools.combinations(centers, 2):
            if np.allclose(first, second):  # skip equal pairs
                continue

            dist = np.linalg.norm(first - second, 2)

            self.assertGreaterEqual(
                dist, self.dmin, "centroid pair\n%s\n%s\n has smaller"
                " distance than dmin(%f): %f"
                % (first, second, self.dmin, dist))

    def testAssignment(self):
        """Number of distinct dtraj states equals the number of centers."""
        self.clustering.parametrize()

        self.assertGreater(len(self.clustering.clustercenters), 1)

        # num states == num _clustercenters?
        self.assertEqual(
            len(np.unique(self.clustering.dtrajs)),
            len(self.clustering.clustercenters),
            "number of unique states in dtrajs"
            " should be equal.")

    def testSpreadData(self):
        """Parametrization runs with ``RandomDataSource(a=-2, b=2)``, dmin=2."""
        self.clustering.data_producer = RandomDataSource(a=-2, b=2)
        self.clustering.dmin = 2
        self.clustering.parametrize()
Example #5
0
    def fit(self, data):
        """ Cluster ``data`` and store the flattened labels in ``self.labels_``

        If ``self.radius`` is None, a ``KCenter`` estimator with
        ``self.n_clusters`` clusters is fitted to ``data`` first and the
        maximum of its ``distance`` attribute is stored as ``self.radius``.
        The clustering itself is delegated to PyEMMA's
        ``RegularSpaceClustering`` with ``dmin=self.radius``.

        Parameters
        ----------
        data: np.ndarray
                array of data points to cluster
        """
        if self.radius is None:
            # only n_clusters was given: estimate the radius from a KCenter fit
            radius_estimator = KCenter(n_clusters=self.n_clusters)
            radius_estimator.fit(data)
            self.radius = radius_estimator.distance.max()
            logger.info("Estimated radius = {}".format(self.radius))

        from pyemma.coordinates.clustering.regspace import RegularSpaceClustering
        regspace = RegularSpaceClustering(dmin=self.radius)
        self._reg = regspace  # kept so the fitted estimator stays reachable
        assignments = regspace.fit_transform(data)
        self.labels_ = assignments.flatten()
Example #6
0
class RegCluster(BaseEstimator, ClusterMixin, TransformerMixin):
    """ Regular-space clustering of a data set

    RegCluster takes either a radius or an approximate number of clusters.
    When only a number of clusters is given, ``fit`` runs KCenter clustering
    on the data and uses the maximum of its ``distance`` attribute as the
    radius. The clustering itself is delegated to PyEMMA's
    ``RegularSpaceClustering``, with the radius passed as ``dmin``.

    Parameters
    ----------
    radius: float
        radius of clusters
    n_clusters: int
        desired number of clusters; only used when ``radius`` is None

    Examples
    --------
    >>> cluster = RegCluster(radius=5.1)
    >>> cluster.fit(data)

    Attributes
    ----------
    cluster_centers_ :
        ``clustercenters`` of the fitted PyEMMA estimator (available after fit)
    labels_ :
        flattened output of the estimator's ``fit_transform``; an empty list
        before ``fit`` is called
    clusterSize :
        ``np.bincount`` of ``labels_``

    Raises
    ------
    RuntimeError
        if neither ``radius`` nor ``n_clusters`` is given
    """
    def __init__(self, radius=None, n_clusters=None):
        nothing_given = radius is None and n_clusters is None
        if nothing_given:
            raise RuntimeError("radius or n_clusters needs to be set")

        self.labels_ = []
        self.n_clusters = n_clusters
        self.radius = radius

    def fit(self, data):
        """ Cluster ``data`` and store the flattened labels in ``self.labels_``

        If ``self.radius`` is None it is estimated from a KCenter fit with
        ``self.n_clusters`` clusters and stored on ``self.radius``.

        Parameters
        ----------
        data: np.ndarray
                array of data points to cluster
        """
        if self.radius is None:
            # only n_clusters was given: estimate the radius from a KCenter fit
            from htmd.clustering.kcenters import KCenter
            radius_estimator = KCenter(n_clusters=self.n_clusters)
            radius_estimator.fit(data)
            self.radius = radius_estimator.distance.max()
            logger.info("Estimated radius = {}".format(self.radius))

        from pyemma.coordinates.clustering.regspace import RegularSpaceClustering
        regspace = RegularSpaceClustering(dmin=self.radius)
        self._reg = regspace  # kept for the cluster_centers_ property
        assignments = regspace.fit_transform(data)
        self.labels_ = assignments.flatten()

    @property
    def cluster_centers_(self):
        """ ``clustercenters`` of the fitted PyEMMA estimator """
        fitted = self._reg
        return fitted.clustercenters

    @property
    def clusterSize(self):
        """ ``np.bincount`` of ``self.labels_`` """
        counts = np.bincount(self.labels_)
        return counts
Example #7
0
class RegCluster(BaseEstimator, ClusterMixin, TransformerMixin):
    """ Regular-space clustering of a data set

    RegCluster takes either a radius or an approximate number of clusters.
    When only a number of clusters is given, ``fit`` runs KCenter clustering
    on the data and uses the maximum of its ``distance`` attribute as the
    radius. The clustering itself is delegated to PyEMMA's
    ``RegularSpaceClustering``, with the radius passed as ``dmin``.

    Parameters
    ----------
    radius: float
        radius of clusters
    n_clusters: int
        desired number of clusters; only used when ``radius`` is None

    Examples
    --------
    >>> cluster = RegCluster(radius=5.1)
    >>> cluster.fit(data)

    Attributes
    ----------
    cluster_centers_ :
        ``clustercenters`` of the fitted PyEMMA estimator (available after fit)
    labels_ :
        flattened output of the estimator's ``fit_transform``; an empty list
        before ``fit`` is called
    clusterSize :
        ``np.bincount`` of ``labels_``

    Raises
    ------
    RuntimeError
        if neither ``radius`` nor ``n_clusters`` is given
    """
    def __init__(self, radius=None, n_clusters=None):
        nothing_given = radius is None and n_clusters is None
        if nothing_given:
            raise RuntimeError("radius or n_clusters needs to be set")

        self.labels_ = []
        self.n_clusters = n_clusters
        self.radius = radius

    def fit(self, data):
        """ Cluster ``data`` and store the flattened labels in ``self.labels_``

        If ``self.radius`` is None it is estimated from a KCenter fit with
        ``self.n_clusters`` clusters and stored on ``self.radius``.

        Parameters
        ----------
        data: np.ndarray
                array of data points to cluster
        """
        if self.radius is None:
            # only n_clusters was given: estimate the radius from a KCenter fit
            from htmd.clustering.kcenters import KCenter
            radius_estimator = KCenter(n_clusters=self.n_clusters)
            radius_estimator.fit(data)
            self.radius = radius_estimator.distance.max()
            logger.info("Estimated radius = {}".format(self.radius))

        from pyemma.coordinates.clustering.regspace import RegularSpaceClustering
        regspace = RegularSpaceClustering(dmin=self.radius)
        self._reg = regspace  # kept for the cluster_centers_ property
        assignments = regspace.fit_transform(data)
        self.labels_ = assignments.flatten()

    @property
    def cluster_centers_(self):
        """ ``clustercenters`` of the fitted PyEMMA estimator """
        fitted = self._reg
        return fitted.clustercenters

    @property
    def clusterSize(self):
        """ ``np.bincount`` of ``self.labels_`` """
        counts = np.bincount(self.labels_)
        return counts
Example #8
0
 def setUp(self):
     """Build the fixture: ``dmin``, an estimator using it, and a random source."""
     min_distance = 0.3
     self.dmin = min_distance
     self.clustering = RegularSpaceClustering(dmin=min_distance)
     self.src = RandomDataSource()
Example #9
0
class TestRegSpaceClustering(unittest.TestCase):
    """Tests for ``RegularSpaceClustering`` estimated on a ``RandomDataSource``."""

    def setUp(self):
        """Create a fresh estimator with ``dmin=0.3`` and a random source."""
        self.dmin = 0.3
        self.clustering = RegularSpaceClustering(dmin=self.dmin)
        self.src = RandomDataSource()

    def test_algorithm(self):
        """Distinct cluster centers must be at least ``dmin`` apart."""
        self.clustering.estimate(self.src)

        # correct type of dtrajs
        # (unittest assertion instead of a bare assert: not stripped by -O)
        self.assertTrue(types.is_int_vector(self.clustering.dtrajs[0]))

        # assert distance for each centroid is at least dmin
        centers = self.clustering.clustercenters
        for first, second in itertools.combinations(centers, 2):
            if np.allclose(first, second):  # skip equal pairs
                continue

            dist = np.linalg.norm(first - second, 2)

            self.assertGreaterEqual(
                dist, self.dmin, "centroid pair\n%s\n%s\n has smaller"
                " distance than dmin(%f): %f"
                % (first, second, self.dmin, dist))

    def test_assignment(self):
        """Number of distinct dtraj states equals the number of centers."""
        self.clustering.estimate(self.src)

        self.assertGreater(len(self.clustering.clustercenters), 1)

        # num states == num _clustercenters?
        self.assertEqual(
            len(np.unique(self.clustering.dtrajs)),
            len(self.clustering.clustercenters),
            "number of unique states in dtrajs"
            " should be equal.")

        # assigning fresh data to the estimated centers must not raise
        data_to_cluster = np.random.random((1000, 3))

        self.clustering.assign(data_to_cluster, stride=1)

    def test_spread_data(self):
        """Estimation runs with ``RandomDataSource(a=-2, b=2)`` and dmin=2."""
        src = RandomDataSource(a=-2, b=2)
        self.clustering.dmin = 2
        self.clustering.estimate(src)

    def test1d_data(self):
        """``cluster_regspace`` runs on a one-dimensional array."""
        data = np.random.random(100)
        cluster_regspace(data, dmin=0.3)

    def test_non_existent_metric(self):
        """An unknown metric name makes ``estimate`` raise ``ValueError``."""
        src = RandomDataSource(a=-2, b=2)
        self.clustering.dmin = 2
        self.clustering.metric = "non_existent_metric"
        with self.assertRaises(ValueError):
            self.clustering.estimate(src)

    def test_minRMSD_metric(self):
        """Estimation and assignment run with the ``"minRMSD"`` metric."""
        src = RandomDataSource(a=-2, b=2)
        self.clustering.dmin = 2
        self.clustering.metric = "minRMSD"
        self.clustering.estimate(src)

        data_to_cluster = np.random.random((1000, 3))

        self.clustering.assign(data_to_cluster, stride=1)

    def test_too_small_dmin_should_warn(self):
        """A tiny ``dmin`` capped by ``max_centers`` emits exactly one warning."""
        import warnings

        max_centers = 50
        self.clustering.dmin = 1e-8
        self.clustering.max_centers = max_centers
        with warnings.catch_warnings(record=True) as caught:
            # Cause all warnings to always be triggered.
            warnings.simplefilter("always")
            # Trigger a warning.
            self.clustering.estimate(self.src)
            # exactly one warning (this also covers the "at least one" check)
            self.assertEqual(len(caught), 1)

            self.assertEqual(len(self.clustering.clustercenters), max_centers)

            # assign data
            out = self.clustering.get_output()
            self.assertEqual(
                len(out), self.clustering.number_of_trajectories())
            self.assertEqual(
                len(out[0]), self.clustering.trajectory_lengths()[0])
Example #10
0
 def setUp(self):
     """Build the fixture: ``dmin`` and an estimator fed by a random source."""
     min_distance = 0.3
     self.dmin = min_distance
     estimator = RegularSpaceClustering(dmin=min_distance)
     self.clustering = estimator
     estimator.data_producer = RandomDataSource()
Example #11
0
 def setUp(self):
     """Build the fixture: ``dmin`` and an estimator fed by a random source."""
     min_distance = 0.3
     self.dmin = min_distance
     estimator = RegularSpaceClustering(dmin=min_distance)
     self.clustering = estimator
     estimator.data_producer = RandomDataSource()