class TestRegSpaceClustering(unittest.TestCase):
    """Invariant and smoke tests for ``RegularSpaceClustering``.

    Every test starts from a fresh estimator built with ``dmin=0.3`` whose
    ``data_producer`` is a ``RandomDataSource``.
    """

    @classmethod
    def setUpClass(cls):
        """Seed NumPy's global RNG once for the whole test class."""
        super(TestRegSpaceClustering, cls).setUpClass()
        np.random.seed(0)

    def setUp(self):
        """Create the estimator under test and attach its data source."""
        self.dmin = 0.3
        self.clustering = RegularSpaceClustering(dmin=self.dmin)
        self.clustering.data_producer = RandomDataSource()

    def testAlgo(self):
        """Distinct cluster centers must be at least ``dmin`` apart."""
        self.clustering.parametrize()

        # correct type of dtrajs
        # (unittest assertion instead of a bare ``assert``: it is not
        # stripped under ``python -O`` and matches the rest of the class)
        self.assertTrue(types.is_int_array(self.clustering.dtrajs[0]))

        # assert distance for each centroid is at least dmin
        centers = self.clustering.clustercenters
        for first, second in itertools.combinations(centers, 2):
            if np.allclose(first, second):
                # skip equal pairs
                continue
            dist = np.linalg.norm(first - second, 2)
            self.assertGreaterEqual(
                dist, self.dmin,
                "centroid pair\n%s\n%s\n has smaller"
                " distance than dmin(%f): %f"
                % (first, second, self.dmin, dist))

    def testAssignment(self):
        """Every cluster center is used by the dtrajs; ``assign`` runs."""
        self.clustering.parametrize()

        self.assertGreater(len(self.clustering.clustercenters), 1)

        # num states == num _clustercenters?
        self.assertEqual(
            len(np.unique(self.clustering.dtrajs)),
            len(self.clustering.clustercenters),
            "number of unique states in dtrajs"
            " should be equal.")

        # Smoke test only: the assignment result is not inspected.
        data_to_cluster = np.random.random((1000, 3))
        self.clustering.assign(data_to_cluster, stride=1)

    def testSpreadData(self):
        """Parametrizing with ``dmin=2`` on a ``RandomDataSource(a=-2, b=2)``
        must not raise (no result is checked)."""
        self.clustering.data_producer = RandomDataSource(a=-2, b=2)
        self.clustering.dmin = 2
        self.clustering.parametrize()

    def test1d_data(self):
        """``cluster_regspace`` must accept a one-dimensional array."""
        data = np.random.random(100)
        cluster_regspace(data, dmin=0.3)
class TestRegSpaceClustering(unittest.TestCase):
    """Tests for ``RegularSpaceClustering`` driven by random input data.

    ``setUp`` builds a new estimator (``dmin=0.3``) with a
    ``RandomDataSource`` as its ``data_producer`` before each test.
    """

    @classmethod
    def setUpClass(cls):
        """Seed NumPy's global RNG a single time before any test runs."""
        super(TestRegSpaceClustering, cls).setUpClass()
        np.random.seed(0)

    def setUp(self):
        """Build the estimator under test and give it a data source."""
        self.dmin = 0.3
        self.clustering = RegularSpaceClustering(dmin=self.dmin)
        self.clustering.data_producer = RandomDataSource()

    def testAlgo(self):
        """No two distinct cluster centers may be closer than ``dmin``."""
        self.clustering.parametrize()

        # correct type of dtrajs
        # Use the TestCase assertion rather than a bare ``assert`` so the
        # check survives ``python -O`` and is consistent with the checks below.
        self.assertTrue(types.is_int_array(self.clustering.dtrajs[0]))

        # assert distance for each centroid is at least dmin
        center_pairs = itertools.combinations(
            self.clustering.clustercenters, 2)
        for center_a, center_b in center_pairs:
            if np.allclose(center_a, center_b):
                # skip equal pairs
                continue
            dist = np.linalg.norm(center_a - center_b, 2)
            self.assertGreaterEqual(
                dist, self.dmin,
                "centroid pair\n%s\n%s\n has smaller"
                " distance than dmin(%f): %f"
                % (center_a, center_b, self.dmin, dist))

    def testAssignment(self):
        """The dtrajs visit as many states as there are cluster centers."""
        self.clustering.parametrize()

        self.assertGreater(len(self.clustering.clustercenters), 1)

        # num states == num _clustercenters?
        self.assertEqual(
            len(np.unique(self.clustering.dtrajs)),
            len(self.clustering.clustercenters),
            "number of unique states in dtrajs"
            " should be equal.")

        # Only checks that assigning new data does not raise.
        data_to_cluster = np.random.random((1000, 3))
        self.clustering.assign(data_to_cluster, stride=1)

    def testSpreadData(self):
        """Smoke test: ``dmin=2`` with ``RandomDataSource(a=-2, b=2)``."""
        self.clustering.data_producer = RandomDataSource(a=-2, b=2)
        self.clustering.dmin = 2
        self.clustering.parametrize()

    def test1d_data(self):
        """Smoke test: ``cluster_regspace`` on a 1-D array of 100 values."""
        data = np.random.random(100)
        cluster_regspace(data, dmin=0.3)
def fit(self, data):
    """Cluster ``data`` and store the resulting labels on the instance.

    Parameters
    ----------
    data : np.ndarray
        array of data points to cluster

    Notes
    -----
    Sets ``self._reg`` (the fitted ``RegularSpaceClustering`` object) and
    ``self.labels_`` (its flattened ``fit_transform`` output). When
    ``self.radius`` is ``None`` it is overwritten with an estimated value.
    """
    # No radius given: fit KCenter with n_clusters and take the largest
    # entry of its ``distance`` attribute as the radius.
    if self.radius is None:
        from htmd.clustering.kcenters import KCenter

        radius_estimator = KCenter(n_clusters=self.n_clusters)
        radius_estimator.fit(data)
        self.radius = radius_estimator.distance.max()
        message = "Estimated radius = {}".format(self.radius)
        logger.info(message)

    from pyemma.coordinates.clustering.regspace import RegularSpaceClustering

    self._reg = RegularSpaceClustering(dmin=self.radius)
    assignments = self._reg.fit_transform(data)
    self.labels_ = assignments.flatten()
class TestRegSpaceClustering(unittest.TestCase):
    """Invariant tests for ``RegularSpaceClustering`` on random data.

    Each test gets a fresh estimator (``dmin=0.3``) whose ``data_producer``
    is a ``RandomDataSource``.
    """

    @classmethod
    def setUpClass(cls):
        """Seed NumPy's global RNG once for the whole test class."""
        super(TestRegSpaceClustering, cls).setUpClass()
        np.random.seed(0)

    def setUp(self):
        """Create the estimator under test and attach its data source."""
        self.dmin = 0.3
        self.clustering = RegularSpaceClustering(dmin=self.dmin)
        self.clustering.data_producer = RandomDataSource()

    def tearDown(self):
        """Nothing to clean up; kept as an explicit per-test hook."""
        pass

    def testAlgo(self):
        """Distinct cluster centers must be at least ``dmin`` apart."""
        self.clustering.parametrize()

        # dtrajs must hold integers. ``dtype == int`` only matches the
        # platform's default integer width, so accept any integer dtype.
        self.assertTrue(
            np.issubdtype(self.clustering.dtrajs[0].dtype, np.integer))

        # assert distance for each centroid is at least dmin
        centers = self.clustering.clustercenters
        for first, second in itertools.combinations(centers, 2):
            if np.allclose(first, second):
                # skip equal pairs
                continue
            dist = np.linalg.norm(first - second, 2)
            self.assertGreaterEqual(
                dist, self.dmin,
                "centroid pair\n%s\n%s\n has smaller"
                " distance than dmin(%f): %f"
                % (first, second, self.dmin, dist))

    def testAssignment(self):
        """The dtrajs visit as many states as there are cluster centers."""
        self.clustering.parametrize()

        self.assertGreater(len(self.clustering.clustercenters), 1)

        # num states == num _clustercenters?
        self.assertEqual(
            len(np.unique(self.clustering.dtrajs)),
            len(self.clustering.clustercenters),
            "number of unique states in dtrajs"
            " should be equal.")

    def testSpreadData(self):
        """Smoke test: ``dmin=2`` with ``RandomDataSource(a=-2, b=2)``."""
        self.clustering.data_producer = RandomDataSource(a=-2, b=2)
        self.clustering.dmin = 2
        self.clustering.parametrize()
def fit(self, data):
    """Run regular-space clustering on ``data``.

    Parameters
    ----------
    data : np.ndarray
        array of data points to cluster

    Notes
    -----
    Nothing is returned. The fitted ``RegularSpaceClustering`` object is
    kept in ``self._reg`` and its flattened ``fit_transform`` output in
    ``self.labels_``. If ``self.radius`` is ``None`` on entry it is
    replaced by an estimate derived from ``self.n_clusters``.
    """
    radius_missing = self.radius is None
    if radius_missing:
        # Estimate the radius from a KCenter fit: the maximum of its
        # ``distance`` attribute. ``KCenter`` must already be in scope here.
        kcenter = KCenter(n_clusters=self.n_clusters)
        kcenter.fit(data)
        self.radius = kcenter.distance.max()
        estimate_msg = "Estimated radius = {}".format(self.radius)
        logger.info(estimate_msg)

    from pyemma.coordinates.clustering.regspace import RegularSpaceClustering

    self._reg = RegularSpaceClustering(dmin=self.radius)
    transformed = self._reg.fit_transform(data)
    self.labels_ = transformed.flatten()
class RegCluster(BaseEstimator, ClusterMixin, TransformerMixin):
    """Regular-space clustering of a given data set.

    The clustering itself is delegated to PyEMMA's
    ``RegularSpaceClustering``, with ``radius`` passed as its ``dmin``.

    RegCluster can be passed a radius or an approximate number of clusters.
    If only a number of clusters is passed, a ``KCenter`` clustering with
    that many clusters is fitted first and the maximum of its ``distance``
    attribute is used as the radius.

    Parameters
    ----------
    radius : float, optional
        Radius of the clusters. When ``None`` it is estimated during
        ``fit`` and stored back on the instance.
    n_clusters : int, optional
        Desired number of clusters; only used when ``radius`` is ``None``.

    Raises
    ------
    RuntimeError
        If both ``radius`` and ``n_clusters`` are ``None``.

    Examples
    --------
    >>> cluster = RegCluster(radius=5.1)
    >>> labels = cluster.fit(data).labels_

    Attributes
    ----------
    labels_ : list or np.ndarray
        Empty list until ``fit`` is called; afterwards the flattened
        output of the underlying estimator's ``fit_transform``
        (presumably one cluster index per frame).
    cluster_centers_ : property
        ``clustercenters`` of the underlying estimator. Only available
        after ``fit``.
    clusterSize : property
        ``np.bincount`` of ``labels_``, i.e. how often each label occurs.
    """

    def __init__(self, radius=None, n_clusters=None):
        if radius is None and n_clusters is None:
            raise RuntimeError("radius or n_clusters needs to be set")

        self.radius = radius
        self.n_clusters = n_clusters
        self.labels_ = []

    def fit(self, data):
        """Cluster ``data`` and store the labels in ``labels_``.

        Parameters
        ----------
        data : np.ndarray
            array of data points to cluster

        Returns
        -------
        self : RegCluster
            The fitted estimator, following the scikit-learn convention so
            that calls can be chained.
        """
        # if n_clusters is given and no radius, estimate the radius
        if self.radius is None:
            from htmd.clustering.kcenters import KCenter

            estClust = KCenter(n_clusters=self.n_clusters)
            estClust.fit(data)
            self.radius = estClust.distance.max()
            logger.info("Estimated radius = {}".format(self.radius))

        from pyemma.coordinates.clustering.regspace import RegularSpaceClustering

        self._reg = RegularSpaceClustering(dmin=self.radius)
        self.labels_ = self._reg.fit_transform(data).flatten()
        return self

    @property
    def cluster_centers_(self):
        """Cluster centers of the fitted underlying estimator."""
        return self._reg.clustercenters

    @property
    def clusterSize(self):
        """Number of occurrences of each label in ``labels_``."""
        return np.bincount(self.labels_)
def setUp(self):
    """Build a fresh estimator and a random data source for each test."""
    min_distance = 0.3
    self.dmin = min_distance
    self.clustering = RegularSpaceClustering(dmin=min_distance)
    self.src = RandomDataSource()
class TestRegSpaceClustering(unittest.TestCase):
    """Tests for ``RegularSpaceClustering`` using the ``estimate`` API.

    ``setUp`` creates a fresh estimator (``dmin=0.3``) and a
    ``RandomDataSource`` stored as ``self.src``.
    """

    def setUp(self):
        """Create the estimator under test and a default data source."""
        self.dmin = 0.3
        self.clustering = RegularSpaceClustering(dmin=self.dmin)
        self.src = RandomDataSource()

    def test_algorithm(self):
        """Distinct cluster centers must be at least ``dmin`` apart."""
        self.clustering.estimate(self.src)

        # correct type of dtrajs
        # (unittest assertion instead of a bare ``assert``: it is not
        # stripped under ``python -O`` and matches the rest of the class)
        self.assertTrue(types.is_int_vector(self.clustering.dtrajs[0]))

        # assert distance for each centroid is at least dmin
        centers = self.clustering.clustercenters
        for first, second in itertools.combinations(centers, 2):
            if np.allclose(first, second):
                # skip equal pairs
                continue
            dist = np.linalg.norm(first - second, 2)
            self.assertGreaterEqual(
                dist, self.dmin,
                "centroid pair\n%s\n%s\n has smaller"
                " distance than dmin(%f): %f"
                % (first, second, self.dmin, dist))

    def test_assignment(self):
        """The dtrajs visit as many states as there are cluster centers."""
        self.clustering.estimate(self.src)

        self.assertGreater(len(self.clustering.clustercenters), 1)

        # num states == num _clustercenters?
        self.assertEqual(
            len(np.unique(self.clustering.dtrajs)),
            len(self.clustering.clustercenters),
            "number of unique states in dtrajs"
            " should be equal.")

        # Smoke test only: the assignment result is not inspected.
        data_to_cluster = np.random.random((1000, 3))
        self.clustering.assign(data_to_cluster, stride=1)

    def test_spread_data(self):
        """Smoke test: ``dmin=2`` with ``RandomDataSource(a=-2, b=2)``."""
        src = RandomDataSource(a=-2, b=2)
        self.clustering.dmin = 2
        self.clustering.estimate(src)

    def test1d_data(self):
        """``cluster_regspace`` must accept a one-dimensional array."""
        data = np.random.random(100)
        cluster_regspace(data, dmin=0.3)

    def test_non_existent_metric(self):
        """Estimating with an unknown metric name raises ``ValueError``."""
        src = RandomDataSource(a=-2, b=2)
        self.clustering.dmin = 2
        self.clustering.metric = "non_existent_metric"
        with self.assertRaises(ValueError):
            self.clustering.estimate(src)

    def test_minRMSD_metric(self):
        """Smoke test: estimate and assign with the ``minRMSD`` metric."""
        src = RandomDataSource(a=-2, b=2)
        self.clustering.dmin = 2
        self.clustering.metric = "minRMSD"
        self.clustering.estimate(src)

        data_to_cluster = np.random.random((1000, 3))
        self.clustering.assign(data_to_cluster, stride=1)

    def test_too_small_dmin_should_warn(self):
        """A tiny ``dmin`` capped by ``max_centers`` warns exactly once."""
        self.clustering.dmin = 1e-8
        max_centers = 50
        self.clustering.max_centers = max_centers
        import warnings
        with warnings.catch_warnings(record=True) as w:
            # Cause all warnings to always be triggered.
            warnings.simplefilter("always")
            # Trigger a warning.
            self.clustering.estimate(self.src)
            # Exactly one warning; this also covers "at least one".
            self.assertEqual(len(w), 1)

            self.assertEqual(
                len(self.clustering.clustercenters), max_centers)

            # assign data
            out = self.clustering.get_output()
            self.assertEqual(
                len(out), self.clustering.number_of_trajectories())
            self.assertEqual(
                len(out[0]), self.clustering.trajectory_lengths()[0])
def setUp(self):
    """Build a fresh estimator wired to a random data source."""
    min_distance = 0.3
    self.dmin = min_distance
    estimator = RegularSpaceClustering(dmin=min_distance)
    self.clustering = estimator
    estimator.data_producer = RandomDataSource()