def setUp(self):
    """Create the fixture: two one-document clusters and two spare documents."""
    # Reset the class-level id counter before any cluster is constructed.
    Cluster.clusterIdCounter = 0

    # Seed documents; each one is handed to its own Cluster constructor.
    self.docx, self.docy = Document(1, {1: 2, 2: 4}), Document(2, {2: 4})
    self.cluster1, self.cluster2 = Cluster(self.docx), Cluster(self.docy)

    # Spare documents (built from Vector instances) for tests to add later.
    self.doc1, self.doc2 = (
        Document(3, Vector({3: 4})),
        Document(4, Vector({2: 4})),
    )
def setUp(self):
    """Prepare clusters and documents shared by the test methods."""
    # The id counter lives on the class, so it is reset before building anything.
    Cluster.clusterIdCounter = 0

    # Two seed documents, each wrapped in its own cluster.
    seed_a = Document(1, {1: 2, 2: 4})
    seed_b = Document(2, {2: 4})
    self.docx = seed_a
    self.docy = seed_b
    self.cluster1 = Cluster(seed_a)
    self.cluster2 = Cluster(seed_b)

    # Two further documents, built from Vector instances, left unassigned.
    self.doc1 = Document(3, Vector({3: 4}))
    self.doc2 = Document(4, Vector({2: 4}))
def getClusterAndUpdateExistingClusters(self, document):
    """Add ``document`` to its predicted cluster, or start a new cluster for it.

    ``self.getClusterForDocument(document)`` is asked for a key into
    ``self.clusters``. When it returns anything other than ``None`` the
    document is added to that existing cluster. Otherwise a new ``Cluster``
    is built from the document, given a signature, registered with every
    entry of ``self.signaturePermutations`` and stored in ``self.clusters``
    under its ``clusterId``.

    Returns ``None``; the method works purely through side effects.
    """
    predictedCluster = self.getClusterForDocument(document)

    # Identity comparison: only the None singleton means "no cluster found".
    # Falsy keys such as 0 or '' are still valid predictions.
    if predictedCluster is not None:
        self.clusters[predictedCluster].addDocument(document)
        return

    newCluster = Cluster(document)
    newCluster.setSignatureUsingVectorPermutations(
        self.unitVector,
        self.vectorPermutations,
        self.phraseTextAndDimensionMap,
    )
    for permutation in self.signaturePermutations:
        permutation.addDocument(newCluster)
    self.clusters[newCluster.clusterId] = newCluster
def json_process(json_out):
    """Plot one frame of points through a Cluster object, then clear the figure.

    ``json_out["frame"]`` is iterated twice and every item is indexed with
    the keys 'x' and 'y'.
    """
    frame_points = json_out["frame"]
    x_values = [point['x'] for point in frame_points]
    y_values = [point['y'] for point in frame_points]

    # One row per point: column 0 holds x, column 1 holds y.
    stacked = np.column_stack((x_values, y_values))
    Cluster(stacked, eps=0.35, min_samples=3).plot(fig=plt)

    # Fixed axis window, a minimal pause, then wipe the figure.
    plt.xlim(-10, 10)
    plt.ylim(-0.9, 18)
    plt.pause(1e-08)
    plt.clf()
def process_data(data):
    """Plot the given points through a Cluster object, then clear the figure.

    ``data`` is iterated twice and every item is indexed with the keys
    'x' and 'y'.
    """
    x_values = [entry['x'] for entry in data]
    y_values = [entry['y'] for entry in data]

    # Pair the coordinates up as an (n, 2) array for the Cluster constructor.
    coordinates = np.column_stack((x_values, y_values))
    clustered = Cluster(coordinates, eps=0.35, min_samples=3)
    clustered.plot(fig=plt)

    # Fixed axis limits, then a very short pause before clearing the figure.
    plt.xlim(-10, 10)
    plt.ylim(-0.9, 18)
    plt.pause(1e-08)
    plt.clf()
def getClusterAndUpdateExistingClusters(self, document):
    """Route ``document`` into an existing cluster or create a fresh one.

    The key returned by ``self.getClusterForDocument(document)`` selects the
    cluster in ``self.clusters`` that receives the document. A ``None``
    result means no cluster was predicted: a new ``Cluster`` is created from
    the document, its signature is set from the unit vector, the vector
    permutations and the phrase/dimension map, it is added to each signature
    permutation, and it is finally stored under its ``clusterId``.

    Nothing is returned.
    """
    predictedCluster = self.getClusterForDocument(document)
    # ``is not None`` rather than ``!= None``: the check must not depend on
    # how the key type implements comparison.
    if predictedCluster is not None:
        self.clusters[predictedCluster].addDocument(document)
    else:
        newCluster = Cluster(document)
        newCluster.setSignatureUsingVectorPermutations(
            self.unitVector,
            self.vectorPermutations,
            self.phraseTextAndDimensionMap,
        )
        for permutation in self.signaturePermutations:
            permutation.addDocument(newCluster)
        self.clusters[newCluster.clusterId] = newCluster
def json_process(json_out):
    """Draw the points of ``json_out["frame"]`` through a Cluster object.

    Every frame item is indexed with 'x' and 'y'. The figure is cleared
    again before the function returns.
    """
    points = json_out["frame"]
    xs = [p['x'] for p in points]
    ys = [p['y'] for p in points]

    # Two-column array (x, y) handed to the Cluster constructor.
    xy = np.column_stack((xs, ys))
    point_cluster = Cluster(xy, eps=0.35, min_samples=3)
    point_cluster.plot(fig=plt)

    # Fixed axis window.
    plt.xlim(-10, 10)
    plt.ylim(-0.9, 18)

    # Minimal pause, then clear the figure.
    plt.pause(1e-08)
    plt.clf()
def inspect_cluster_size(clustered_path: str, sense_inventory_path: str, fig_path: str):
    """Print the largest cluster and plot the size distribution of merged clusters.

    :param clustered_path: path of a pickle file holding a mapping from lemma
        to a list of Cluster objects
    :param sense_inventory_path: not used by the active code; kept so existing
        callers keep working
    :param fig_path: destination of the saved scatter plot
    """
    from classes import Cluster

    # NOTE(review): unpickling can execute arbitrary code; only load files
    # from a trusted source.
    # The context manager closes the handle, which the previous bare
    # open() call never did.
    with open(clustered_path, "rb") as clustered_file:
        cluster_dict_raw: Dict[str, List[Cluster]] = pickle.load(clustered_file)

    # Start from an empty placeholder so the strict ">" comparison works for
    # the first real cluster.
    largest_cluster: Cluster = Cluster([])
    for lemma in tqdm(cluster_dict_raw):
        for cluster in cluster_dict_raw[lemma]:
            if len(cluster.cluster_items) > len(largest_cluster.cluster_items):
                largest_cluster = cluster
    print(
        f"Max cluster size for {largest_cluster.get_dominant_lemma()}: {len(largest_cluster.cluster_items)}"
    )

    # Only clusters holding more than one item are plotted, largest first.
    merged_clusters: List[Cluster] = [
        y for x in cluster_dict_raw.values() for y in x
        if len(y.cluster_items) > 1
    ]
    merged_clusters.sort(key=lambda x: len(x.cluster_items), reverse=True)

    # The first (largest) entry is left out of the plot, so both axes cover
    # len(merged_clusters) - 1 points.
    x_data: List[int] = list(range(len(merged_clusters) - 1))
    y_data: List[int] = [len(x.cluster_items) for x in merged_clusters[1:]]

    pyplot.scatter(x_data, y_data)
    pyplot.xlabel("Cluster ID")
    pyplot.ylabel("Number of contained usage contexts")
    pyplot.title("Distribution of cluster size")
    pyplot.savefig(fig_path, dpi=600)
    pyplot.show()
def generate_cluster_objects(scooter_data: pd.DataFrame, cluster_labels: list) -> [Cluster]:
    """
    Build Scooter and Cluster objects from labelled scooter data.
    Each distinct label yields one Cluster holding the scooters carrying it.
    NOTE(review): the original docstring says the Cluster class generates the
    cluster center; that is not visible here, so confirm in Cluster.
    :param scooter_data: geospatial data for scooters; the columns "lat",
        "lon" and "battery" are read
    :param cluster_labels: list of labels for scooter data
    :return: list of clusters, ordered by their ``id`` attribute
    """
    # Work on a copy so the caller's frame does not gain the label column.
    labelled = scooter_data.copy()
    labelled["cluster_labels"] = cluster_labels

    def scooters_for(label):
        # Rows carrying this label become Scooter objects; the frame index is the ID.
        members = labelled[labelled["cluster_labels"] == label]
        return [
            Scooter(row["lat"], row["lon"], row["battery"], index)
            for index, row in members.iterrows()
        ]

    # One Cluster per distinct label.
    clusters = [
        Cluster(label, scooters_for(label))
        for label in np.unique(cluster_labels)
    ]
    clusters.sort(key=lambda cluster: cluster.id)
    return clusters
def test_iterateByAttribute(self):
    # Each cluster is expected to be paired with its 'clusterId' value.
    self.cluster1.addDocument(self.doc1)
    self.cluster2.addDocument(self.doc2)

    expected_pairs = [
        (self.cluster1, 'cluster_0'),
        (self.cluster2, 'cluster_1'),
    ]
    observed_pairs = list(
        Cluster.iterateByAttribute([self.cluster1, self.cluster2], 'clusterId')
    )
    self.assertEqual(expected_pairs, observed_pairs)
def test_filterClustersByAttribute(self):
    # Filter both clusters on 'vectorWeights' with different thresholds.
    self.cluster1.addDocument(self.doc1)
    self.cluster2.addDocument(self.doc2)

    def filtered(*threshold_args):
        # A fresh input list is built for every call, as in the inline form.
        return list(
            Cluster.getClustersByAttributeAndThreshold(
                [self.cluster1, self.cluster2], 'vectorWeights', *threshold_args
            )
        )

    # Threshold 1 with the default direction keeps both clusters.
    self.assertEqual([self.cluster1, self.cluster2], filtered(1))
    # Threshold 3 with the default direction keeps none.
    self.assertEqual([], filtered(3))
    # Threshold 3 with BELOW_THRESHOLD keeps both again.
    self.assertEqual(
        [self.cluster1, self.cluster2],
        filtered(3, Cluster.BELOW_THRESHOLD),
    )
def create_clustering_obj(hit_id, items_map, clustering_list):
    """Build a Clustering object from index-based clusters.

    Args:
        hit_id (int): Hit number
        items_map (dict of tuple to Item): Maps (hit_id, item_id) pairs to
            the corresponding Item object
        clustering_list (list(int)): A clustering of items, each cluster
            given as the indices of its items within the hit

    Returns:
        Clustering: Object holding one Cluster per entry of clustering_list
    """
    def populated_cluster(item_ids):
        # Resolve every index through items_map, keyed by (hit_id, item_id).
        built = Cluster()
        for item_id in item_ids:
            built.add_item(items_map[hit_id, item_id])
        return built

    clustering_obj = Clustering()
    for item_ids in clustering_list:
        clustering_obj.add_cluster(populated_cluster(item_ids))
    return clustering_obj
class ClusterTests(unittest.TestCase):
    """Tests for Cluster: id assignment, document membership and attribute helpers."""

    def setUp(self):
        # Reset the class-level id counter before building the fixture.
        Cluster.clusterIdCounter = 0
        self.docx, self.docy = Document(1, {1: 2, 2: 4}), Document(2, {2: 4})
        self.cluster1, self.cluster2 = Cluster(self.docx), Cluster(self.docy)
        self.doc1, self.doc2 = (
            Document(3, Vector({3: 4})),
            Document(4, Vector({2: 4})),
        )

    def test_initialization(self):
        # Ids are handed out in construction order.
        for expected_id, cluster in (('cluster_0', self.cluster1),
                                     ('cluster_1', self.cluster2)):
            self.assertEqual(expected_id, cluster.clusterId)
        self.assertEqual(2, Cluster.clusterIdCounter)
        # Each cluster starts with exactly its seed document.
        for seed, cluster in ((self.docx, self.cluster1),
                              (self.docy, self.cluster2)):
            self.assertEqual([seed], list(cluster.iterateDocumentsInCluster()))

    def test_addDocument(self):
        cluster, first, second = self.cluster1, self.doc1, self.doc2

        cluster.addDocument(first)
        # The document takes over the cluster's id.
        self.assertEqual(cluster.clusterId, first.clusterId)
        # The cluster itself compares equal to the mean vector.
        mean_after_first = {1: 2 / 2., 2: 2., 3: 2.}
        self.assertEqual(mean_after_first, cluster)
        # The aggregate holds the plain sums.
        sums_after_first = {1: 2, 2: 4, 3: 4}
        self.assertEqual(sums_after_first, cluster.aggregateVector)
        # The document is registered under its docId.
        self.assertEqual(first, cluster.documentsInCluster[first.docId])

        cluster.addDocument(second)
        self.assertEqual(3, cluster.vectorWeights)
        mean_after_second = {1: 2 / 3., 2: 8 / 3., 3: 4 / 3.}
        self.assertEqual(mean_after_second, cluster)
        sums_after_second = {1: 2, 2: 8, 3: 4}
        self.assertEqual(sums_after_second, cluster.aggregateVector)

    def test_iterateDocumentsInCluster(self):
        source, destination = self.cluster1, self.cluster2

        # Plain iteration yields the documents in insertion order.
        source.addDocument(self.doc1)
        source.addDocument(self.doc2)
        self.assertEqual([self.docx, self.doc1, self.doc2],
                         list(source.iterateDocumentsInCluster()))
        self.assertEqual(3, source.length)

        # Adding doc2 to another cluster removes it from the first one.
        destination.addDocument(self.doc2)
        self.assertEqual([self.docx, self.doc1],
                         list(source.iterateDocumentsInCluster()))
        self.assertEqual(2, source.length)
        self.assertEqual(2, len(source.documentsInCluster))
        self.assertEqual([self.docy, self.doc2],
                         list(destination.iterateDocumentsInCluster()))
        self.assertEqual(2, destination.length)

    def test_iterateByAttribute(self):
        self.cluster1.addDocument(self.doc1)
        self.cluster2.addDocument(self.doc2)
        expected_pairs = [(self.cluster1, 'cluster_0'),
                          (self.cluster2, 'cluster_1')]
        observed_pairs = list(
            Cluster.iterateByAttribute([self.cluster1, self.cluster2], 'clusterId')
        )
        self.assertEqual(expected_pairs, observed_pairs)

    def test_filterClustersByAttribute(self):
        self.cluster1.addDocument(self.doc1)
        self.cluster2.addDocument(self.doc2)

        def filtered(*threshold_args):
            # A fresh input list is built for every call.
            return list(
                Cluster.getClustersByAttributeAndThreshold(
                    [self.cluster1, self.cluster2], 'vectorWeights', *threshold_args
                )
            )

        self.assertEqual([self.cluster1, self.cluster2], filtered(1))
        self.assertEqual([], filtered(3))
        self.assertEqual([self.cluster1, self.cluster2],
                         filtered(3, Cluster.BELOW_THRESHOLD))
# $Id$ # # pylint: disable-msg=E1101,W0612,W0142 # """superclass for all content-objects """ __version__ = "$Id$" # phython imports from classes import Cluster, Node, Ressource import MenuSystem as menusystem import hb_mini glbmanager = hb_mini.miniManager() cluster1 = Cluster("172.16.10.172", "hacluster", "ddd", "172.16.10.172", glbmanager) if(glbmanager.login(cluster1.ip, cluster1.user, cluster1.passwd) == True): print "Connected succesfully" else: print "Connection failure" exit() def printNodeNames(data): print "\n" for node in cluster1.getNodes(): print "Node: %s" % node.name def printActiveNodes(data): print "\n" for node in cluster1.getActiveNodes():
class ClusterTests(unittest.TestCase):
    """Exercises Cluster construction, document bookkeeping and class-level helpers."""

    def setUp(self):
        # The id counter is stored on the class, so it is reset first.
        Cluster.clusterIdCounter = 0

        seed_a = Document(1, {1: 2, 2: 4})
        seed_b = Document(2, {2: 4})
        self.docx = seed_a
        self.docy = seed_b
        self.cluster1 = Cluster(seed_a)
        self.cluster2 = Cluster(seed_b)

        self.doc1 = Document(3, Vector({3: 4}))
        self.doc2 = Document(4, Vector({2: 4}))

    def test_initialization(self):
        # Cluster ids and the counter after two constructions.
        self.assertEqual('cluster_0', self.cluster1.clusterId)
        self.assertEqual('cluster_1', self.cluster2.clusterId)
        self.assertEqual(2, Cluster.clusterIdCounter)

        # Every cluster contains only the document it was built from.
        docs_in_first = list(self.cluster1.iterateDocumentsInCluster())
        self.assertEqual([self.docx], docs_in_first)
        docs_in_second = list(self.cluster2.iterateDocumentsInCluster())
        self.assertEqual([self.docy], docs_in_second)

    def test_addDocument(self):
        target = self.cluster1

        target.addDocument(self.doc1)
        # Cluster id is copied onto the document.
        self.assertEqual(target.clusterId, self.doc1.clusterId)
        # Cluster mean is updated.
        self.assertEqual({1: 2 / 2., 2: 2., 3: 2.}, target)
        # Cluster aggregate is updated.
        self.assertEqual({1: 2, 2: 4, 3: 4}, target.aggregateVector)
        # Document is stored among the cluster's documents.
        stored = target.documentsInCluster[self.doc1.docId]
        self.assertEqual(self.doc1, stored)

        target.addDocument(self.doc2)
        self.assertEqual(3, target.vectorWeights)
        self.assertEqual({1: 2 / 3., 2: 8 / 3., 3: 4 / 3.}, target)
        self.assertEqual({1: 2, 2: 8, 3: 4}, target.aggregateVector)

    def test_iterateDocumentsInCluster(self):
        # Normal iteration.
        for extra in (self.doc1, self.doc2):
            self.cluster1.addDocument(extra)
        self.assertEqual([self.docx, self.doc1, self.doc2],
                         list(self.cluster1.iterateDocumentsInCluster()))
        self.assertEqual(3, self.cluster1.length)

        # A document added to a different cluster is removed from the old one.
        self.cluster2.addDocument(self.doc2)
        self.assertEqual([self.docx, self.doc1],
                         list(self.cluster1.iterateDocumentsInCluster()))
        self.assertEqual(2, self.cluster1.length)
        self.assertEqual(2, len(self.cluster1.documentsInCluster))
        self.assertEqual([self.docy, self.doc2],
                         list(self.cluster2.iterateDocumentsInCluster()))
        self.assertEqual(2, self.cluster2.length)

    def test_iterateByAttribute(self):
        self.cluster1.addDocument(self.doc1)
        self.cluster2.addDocument(self.doc2)
        wanted = [(self.cluster1, 'cluster_0'), (self.cluster2, 'cluster_1')]
        pairs = Cluster.iterateByAttribute([self.cluster1, self.cluster2], 'clusterId')
        self.assertEqual(wanted, list(pairs))

    def test_filterClustersByAttribute(self):
        self.cluster1.addDocument(self.doc1)
        self.cluster2.addDocument(self.doc2)

        # Threshold 1, default direction.
        kept = Cluster.getClustersByAttributeAndThreshold(
            [self.cluster1, self.cluster2], 'vectorWeights', 1)
        self.assertEqual([self.cluster1, self.cluster2], list(kept))

        # Threshold 3, default direction.
        kept = Cluster.getClustersByAttributeAndThreshold(
            [self.cluster1, self.cluster2], 'vectorWeights', 3)
        self.assertEqual([], list(kept))

        # Threshold 3, BELOW_THRESHOLD direction.
        kept = Cluster.getClustersByAttributeAndThreshold(
            [self.cluster1, self.cluster2], 'vectorWeights', 3,
            Cluster.BELOW_THRESHOLD)
        self.assertEqual([self.cluster1, self.cluster2], list(kept))
def test_iterateByAttribute(self):
    # Iterating by 'clusterId' should yield (cluster, id) tuples in input order.
    self.cluster1.addDocument(self.doc1)
    self.cluster2.addDocument(self.doc2)

    wanted = [(self.cluster1, 'cluster_0'), (self.cluster2, 'cluster_1')]
    pairs = Cluster.iterateByAttribute([self.cluster1, self.cluster2], 'clusterId')
    self.assertEqual(wanted, list(pairs))
def test_filterClustersByAttribute(self):
    # Check threshold filtering on 'vectorWeights' in both directions.
    self.cluster1.addDocument(self.doc1)
    self.cluster2.addDocument(self.doc2)

    # Threshold 1, default direction: both clusters are returned.
    kept = Cluster.getClustersByAttributeAndThreshold(
        [self.cluster1, self.cluster2], 'vectorWeights', 1)
    self.assertEqual([self.cluster1, self.cluster2], list(kept))

    # Threshold 3, default direction: nothing is returned.
    kept = Cluster.getClustersByAttributeAndThreshold(
        [self.cluster1, self.cluster2], 'vectorWeights', 3)
    self.assertEqual([], list(kept))

    # Threshold 3, BELOW_THRESHOLD direction: both clusters are returned.
    kept = Cluster.getClustersByAttributeAndThreshold(
        [self.cluster1, self.cluster2], 'vectorWeights', 3,
        Cluster.BELOW_THRESHOLD)
    self.assertEqual([self.cluster1, self.cluster2], list(kept))