def test_simple(self):
    # A Config constructed without arguments must carry these values.
    defaults = Config()

    # Entries read through item access.
    by_index = (
        ('method', 'kmeans'),
        ('parameter', {'k': 3, 'seed': 0}),
        ('compressor_method', 'simple'),
    )
    for key, expected in by_index:
        self.assertEqual(expected, defaults[key])

    # Entries read through get().
    by_get = (
        ('compressor_parameter', {'bucket_size': 100}),
        ('distance', 'euclidean'),
    )
    for key, expected in by_get:
        self.assertEqual(expected, defaults.get(key))
def _test_func_with_legal_and_illegal_config(self, func):
    # Helper: call ``func(clustering, dataset)`` twice, once with a stub
    # clustering configured with an illegal method ('dbscan'), where a
    # RuntimeError is required, and once with a legal method ('kmeans'),
    # where the call must complete without raising.
    #
    # func: callable taking (clustering, dataset).
    dataset = self._make_stub_dataset()

    # test illegal method
    config = Config(method='dbscan', compressor_parameter={"bucket_size": 5})
    clustering = self._make_stub_clustering(config, dataset)
    try:
        self.assertRaises(RuntimeError, func, clustering, dataset)
    finally:
        # Always stop the clustering created above, even when the
        # assertion fails.
        clustering.stop()

    # test legal method
    config = Config(method='kmeans', compressor_parameter={"bucket_size": 5})
    clustering = self._make_stub_clustering(config, dataset)
    try:
        func(clustering, dataset)
    finally:
        # Always stop the clustering, even when func raises unexpectedly.
        clustering.stop()
def test_get_core_members(self):
    # get_core_members() must be callable with light=False and light=True
    # on a stub 'kmeans' clustering; return values are not inspected.
    dataset = self._make_stub_dataset()
    config = Config(method='kmeans', compressor_parameter={"bucket_size": 5})
    clustering = self._make_stub_clustering(config, dataset)
    try:
        clustering.get_core_members(light=False)
        clustering.get_core_members(light=True)
    finally:
        # Always stop the clustering, even when one of the calls raises.
        clustering.stop()
def test_method_params(self):
    # For a Config built with only ``method`` given, the 'parameter' entry
    # must contain each listed key.
    expected_keys = (
        ('kmeans', 'k'),
        ('kmeans', 'seed'),
        ('gmm', 'k'),
        ('gmm', 'seed'),
        ('dbscan', 'eps'),
        ('dbscan', 'min_core_point'),
    )
    for method_name, key in expected_keys:
        # A fresh Config is built for every check.
        params = Config(method=method_name)['parameter']
        self.assertTrue(key in params)
def test_compressor_params(self):
    # For a Config built with only ``compressor_method`` given, the
    # 'compressor_parameter' entry must contain each listed key.
    expected_keys = (
        ('simple', 'bucket_size'),
        ('compressive', 'bucket_size'),
        ('compressive', 'bucket_length'),
        ('compressive', 'compressed_bucket_size'),
        ('compressive', 'bicriteria_base_size'),
        ('compressive', 'forgetting_factor'),
        ('compressive', 'seed'),
    )
    for compressor_name, key in expected_keys:
        # A fresh Config is built for every check.
        params = Config(compressor_method=compressor_name)['compressor_parameter']
        self.assertTrue(key in params)

    # An explicitly supplied compressor_parameter value must be kept.
    overridden = Config(
        compressor_method='simple',
        compressor_parameter={'bucket_size': 10},
    )
    bucket_size = overridden['compressor_parameter']['bucket_size']
    self.assertEqual(10, bucket_size)
def test_default(self):
    # Config.default() must report these method, compressor and distance
    # values.
    default_config = Config.default()
    expectations = (
        ('method', 'kmeans'),
        ('compressor_method', 'simple'),
        ('distance', 'euclidean'),
    )
    for key, expected in expectations:
        self.assertEqual(expected, default_config[key])
def test_compressor_methods(self):
    # compressor_methods() must return a list.
    available = Config().compressor_methods()
    self.assertTrue(isinstance(available, list))
def test_distances(self):
    # distances() must return a list.
    available = Config().distances()
    self.assertTrue(isinstance(available, list))
def test_methods(self):
    # methods() must return a list.
    available = Config().methods()
    self.assertTrue(isinstance(available, list))
def test_get_revision(self):
    # A clustering started with a default Config must report revision 0
    # before anything is pushed to it.
    clustering = Clustering.run(Config())
    try:
        self.assertEqual(0, clustering.get_revision())
    finally:
        # Always stop the clustering, even when the assertion fails.
        clustering.stop()
def test_push(self):
    # Every item yielded by push() for the stub dataset must carry a result
    # equal to True.
    clustering = Clustering.run(Config())
    try:
        dataset = self._make_stub_dataset()
        # Only the result component is checked; the index and row id are
        # unused here.
        for (_idx, _row_id, result) in clustering.push(dataset):
            self.assertEqual(result, True)
    finally:
        # Always stop the clustering, even when an assertion fails or
        # push() raises.
        clustering.stop()
def test_embedded(self):
    # Running with embedded=True and then stopping must not raise.
    config = Config()
    service = Clustering.run(config, embedded=True)
    service.stop()
from jubakit.clustering import Clustering, Schema, Dataset, Config from jubakit.loader.csv import CSVLoader # Load a CSV file. loader = CSVLoader('blobs.csv') # Define a Schema that defines types for each column of the CSV file. schema = Schema({ 'cluster': Schema.ID, }, Schema.NUMBER) # Create a Dataset. dataset = Dataset(loader, schema) # Create a Clustering Service. cfg = Config(method='kmeans') clustering = Clustering.run(cfg) # Update the Clustering model. for (idx, row_id, result) in clustering.push(dataset): pass # Get clusters clusters = clustering.get_core_members(light=False) # Get centers of each cluster centers = clustering.get_k_center() # Calculate SSE: sum of squared errors sse = 0.0 for cluster, center in zip(clusters, centers): # Center of clusters