@pytest.mark.parametrize('algo', ['kmeans', 'regspace', 'minibatch-kmeans'])
@pytest.mark.parametrize('dim', [1, 2, 3])  # dimensionalities to test (example values)
def test_one_cluster_edgecase(algo, dim):
    data = np.random.uniform(size=(2, dim))
    clustering = None
    if algo == 'kmeans':
        clustering = KMeans(n_clusters=1).fit(data).fetch_model()
    elif algo == 'regspace':
        clustering = RegularSpace(dmin=10, max_centers=1).fit(data).fetch_model()
    elif algo == 'minibatch-kmeans':
        clustering = MiniBatchKMeans(n_clusters=1).fit(data).fetch_model()
    else:
        pytest.fail("unknown algorithm %s" % algo)
    assert clustering is not None
    assert_equal(clustering.n_clusters, 1)
    assert_equal(clustering.dim, dim)
    dtraj = clustering.transform(data)
    assert_equal(len(dtraj), 2)
    assert_equal(dtraj[0], 0)
    assert_equal(dtraj[1], 0)
class TestRegSpaceClustering(unittest.TestCase):

    def setUp(self):
        self.dmin = 0.3
        self.clustering = RegularSpace(dmin=self.dmin)
        self.src = np.random.uniform(size=(1000, 3))

    def test_algorithm(self):
        model = self.clustering.fit(self.src).fetch_model()

        # assert that every pair of centroids is at least dmin apart
        for c in itertools.combinations(model.cluster_centers, 2):
            if np.allclose(c[0], c[1]):  # skip equal pairs
                continue
            dist = np.linalg.norm(c[0] - c[1], 2)
            self.assertGreaterEqual(dist, self.dmin,
                                    "centroid pair\n%s\n%s\n has smaller"
                                    " distance than dmin(%f): %f"
                                    % (c[0], c[1], self.dmin, dist))

    def test_assignment(self):
        model = self.clustering.fit(self.src).fetch_model()

        assert len(model.cluster_centers) > 1
        dtraj = model.transform(self.src)

        # number of unique states should equal the number of cluster centers
        self.assertEqual(len(np.unique(dtraj)), len(model.cluster_centers),
                         "number of unique states in dtrajs"
                         " should be equal.")

        data_to_cluster = np.random.random((1000, 3))
        model.transform(data_to_cluster)

    def test_spread_data(self):
        src = np.random.uniform(-2, 2, size=(1000, 3))
        self.clustering.dmin = 2
        self.clustering.fit(src)

    def test1d_data(self):
        data = np.random.random(100)
        RegularSpace(dmin=0.3).fit(data)

    def test_non_existent_metric(self):
        with self.assertRaises(ValueError):
            self.clustering.metric = "non_existent_metric"

    def test_too_small_dmin_should_warn(self):
        self.clustering.dmin = 1e-8
        max_centers = 50
        self.clustering.max_centers = max_centers
        import warnings
        with warnings.catch_warnings(record=True) as w:
            # Cause all warnings to always be triggered.
            warnings.simplefilter("always")
            # Trigger a warning.
            self.clustering.fit(self.src)
            assert w
            assert len(w) == 1
        model = self.clustering.fetch_model()
        assert len(model.cluster_centers) == max_centers

    def test_regspace_nthreads(self):
        self.clustering.fit(self.src, n_jobs=1)
        cl2 = RegularSpace(dmin=self.dmin, n_jobs=2).fit(self.src).fetch_model()
        centers1 = self.clustering.fetch_model().cluster_centers
        centers2 = cl2.cluster_centers
        np.testing.assert_equal(centers1, centers2)

    def test_properties(self):
        est = RegularSpace(dmin=1e-8, max_centers=500, metric='euclidean', n_jobs=5)
        np.testing.assert_equal(est.dmin, 1e-8)
        np.testing.assert_equal(est.max_centers, 500)
        np.testing.assert_equal(est.n_clusters, 500)
        est.n_clusters = 30
        np.testing.assert_equal(est.max_centers, 30)  # n_clusters and max_centers are aliases
        np.testing.assert_equal(est.metric, 'euclidean')
        np.testing.assert_equal(est.n_jobs, 5)
        with np.testing.assert_raises(ValueError):
            est.dmin = -.5  # negative, invalid!
        with np.testing.assert_raises(ValueError):
            est.metric = 'bogus'
        with np.testing.assert_raises(ValueError):
            est.max_centers = 0  # must be positive
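# Minimal usage sketch of the estimator API exercised by the tests above
# (fit / fetch_model / transform). This assumes a deeptime-style RegularSpace;
# adapt the import to whichever library is actually under test.
import numpy as np
from deeptime.clustering import RegularSpace

data = np.random.uniform(size=(1000, 3))
model = RegularSpace(dmin=0.3, max_centers=500).fit(data).fetch_model()
centers = model.cluster_centers   # array of shape (n_centers, 3)
dtraj = model.transform(data)     # discrete trajectory: nearest-center index per frame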
def _estimate(self, iterable, **kwargs):
    ########
    # Calculate cluster centers:
    # 1. choose the first data point as a centroid
    # 2. for all X: compute distances to all existing cluster centers
    # 3. add a new centroid if min(distance to all other cluster centers) >= dmin
    ########
    used_frames = 0
    regspace = RegularSpace(dmin=self.dmin, max_centers=self.max_centers,
                            metric=self.metric, n_jobs=self.n_jobs)
    it = iterable.iterator(return_trajindex=False, stride=self.stride,
                           chunk=self.chunksize, skip=self.skip)
    try:
        with it:
            for X in it:
                regspace.partial_fit(X.astype(np.float32, order='C', copy=False),
                                     n_jobs=self.n_jobs)
                used_frames += len(X)
        self._converged = True
    except Exception as e:
        if 'MaxCentersReachedException' in e.__class__.__name__:
            self._converged = False
            msg = 'Maximum number of cluster centers reached.' \
                  ' Consider increasing max_centers or choose' \
                  ' a larger minimum distance, dmin.'
            self.logger.warning(msg)
            warnings.warn(msg)
            # report the amount of processed data
            used_data = used_frames / float(it.n_frames_total()) * 100.0
            raise NotConvergedWarning("Used data for centers: %.2f%%" % used_data)
        else:
            # todo: ugly workaround until MaxCentersReachedException is placed
            # not within the metric subpackage but somewhere global
            raise
    finally:
        # even if not converged, store the centers that were found
        model = regspace.fetch_model()
        clustercenters = model.cluster_centers.squeeze().reshape(-1, iterable.ndim)
        self._inst = ClusterModel(clustercenters, metric=self.metric)

        from types import MethodType

        def _assign(self, data, _, n_jobs):
            out = self.transform(data, n_jobs=n_jobs)
            return out

        self._inst.assign = MethodType(_assign, self._inst)
        self.update_model_params(clustercenters=clustercenters,
                                 n_clusters=len(clustercenters))

    if len(clustercenters) == 1:
        self.logger.warning('Have found only one center according to '
                            'minimum distance requirement of %f' % self.dmin)

    return self
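# Pure-NumPy sketch of the regular-space criterion documented in _estimate above.
# This is an illustration only: regular_space_centers is a hypothetical helper,
# not part of the library, and the real estimator streams data in chunks and
# signals the max_centers case via MaxCentersReachedException.
import numpy as np


def regular_space_centers(data, dmin, max_centers=1000):
    """Greedily select centers so that every pair is at least `dmin` apart."""
    centers = [data[0]]  # 1. the first data point becomes the first center
    for x in data[1:]:
        # 2. distances from x to all centers selected so far
        dists = np.linalg.norm(np.asarray(centers) - x, axis=1)
        # 3. add x as a new center if it is far enough from all existing ones
        if np.min(dists) >= dmin:
            if len(centers) >= max_centers:
                raise RuntimeError('maximum number of cluster centers reached')
            centers.append(x)
    return np.asarray(centers)


# Example: centers of uniformly distributed 3D data end up pairwise >= 0.3 apart.
example_centers = regular_space_centers(np.random.uniform(size=(1000, 3)), dmin=0.3)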
estimators = [
    ('k-Means with uniform initialization',
     KMeans(
         n_clusters=100,           # place 100 cluster centers
         init_strategy='uniform',  # uniform initialization strategy
         fixed_seed=13,
         n_jobs=8)),
    ('k-Means with k-means++ initialization',
     KMeans(
         n_clusters=100,            # place 100 cluster centers
         init_strategy='kmeans++',  # k-means++ initialization strategy
         fixed_seed=13,
         n_jobs=8)),
    ('Regular space clustering',
     RegularSpace(
         dmin=3,           # minimum distance between cluster centers
         max_centers=300,  # maximum number of cluster centers
         n_jobs=8))
]

f, axes = plt.subplots(3, 2, figsize=(15, 15))
for i, (label, estimator) in enumerate(estimators):
    clustering = estimator.fit(samples).fetch_model()
    ax1 = axes[i][0]
    ax2 = axes[i][1]
    ax1.hexbin(*samples.T, bins='log')
    ax1.scatter(*clustering.cluster_centers.T, marker='o', c='m')
    ax1.axis('off')
    ax1.set_title(label + ': cluster centers')