class EdgeContainer:
    """ Container class for Edge objects. """

    def __init__(self):
        self.elements = []
        self.clusters = Clusters()

    def add_edge(self, source_node, sink_nodes):
        """ Create and store a new Edge object for the given source and sink nodes.

        @type source_node: str
        @param source_node: A string representing the source node of a directed edge.
        @type sink_nodes: list
        @param sink_nodes: A list containing strings of each sink node.
        """
        self.clusters.add(source_node, sink_nodes)
        edge = Edge(source_node, sink_nodes)
        self.elements.append(edge)

    def write_to_file(self, output):
        """ Write the output string of each edge to the given text file.

        @type output: file
        @param output: An open text file to write edge strings to.
        """
        for edge in self.elements:
            edge_string = edge.output_string()
            output.write(edge_string)
            output.write("\n")
        self.clusters.write_statistics_to_file(output)
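# Minimal usage sketch for EdgeContainer above, assuming Edge and Clusters are
# importable from the same module; node names and the file path are made up.
container = EdgeContainer()
container.add_edge("a", ["b", "c"])
container.add_edge("b", ["c"])
with open("edges.txt", "w") as output:
    container.write_to_file(output)  # one line per edge, then cluster statistics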
def test_merge_df(self):
    cluster = Clusters(
        "../tests/data/test_1k_cluster_data.csv",
        "dec0dedfeed1111111111111",
        "addedfeed000000000000000",
        "testCluster",
    )
    cell_metadata_df = Annotations(
        self.CELL_METADATA_PATH,
        ["text/csv", "text/plain", "text/tab-separated-values"],
    )
    cell_metadata_df.preprocess()
    cell_names_cell_metadata_df = np.asarray(cell_metadata_df.file["NAME"])
    cell_names_cluster_df = np.asarray(cluster.file["NAME"])
    # Cell names found in both cluster and metadata files
    common_cell_names = cell_names_cluster_df[
        np.isin(cell_names_cluster_df, cell_names_cell_metadata_df)
    ]
    print(f"common cell names: {common_cell_names}")
    # Perform merge
    print(cluster.file[["NAME", "x", "y", "z"]])
    cluster.merge_df(cluster.file[["NAME", "x", "y", "z"]], cell_metadata_df.file)
    # Ensure ONLY common cell names found in cell metadata file and cluster file
    # are in the newly merged df
    result = all(
        cell[0] in common_cell_names for cell in cluster.file["NAME"].values
    )
    self.assertTrue(
        result,
        "Merge was not performed correctly. Merge should be performed on 'NAME'",
    )
def main():
    parallel = True
    num_processes = 16
    functions = Functions()
    connect = functions.get_config()
    c = Clusters(connect, num_processes, parallel)
    TOTAL_TASKS.observe(2)
    c.process_clusters()
def test_validate_header_for_coordinate_values_false(self):
    """Ensures validate_header_for_coordinate_values returns false
    when a coordinate is missing in the header
    """
    cluster = Clusters(
        "../tests/data/cluster_bad_missing_coordinate.txt",
        "5d276a50421aa9117c982845",
        "5dd5ae25421aa910a723a337",
        "testCluster",
    )
    self.assertFalse(cluster.validate_header_for_coordinate_values())
def test_validate_header_for_coordinate_values_true(self):
    """Ensures validate_header_for_coordinate_values returns true
    when a coordinate value is in the cluster file
    """
    cluster = Clusters(
        "../tests/data/cluster_example.txt",
        "5d276a50421aa9117c982845",
        "5dd5ae25421aa910a723a337",
        "testCluster",
    )
    self.assertTrue(cluster.validate_header_for_coordinate_values())
def fit(self):
    i = self.start_lvl - self.curr_lvl
    X = np.asarray(list(zip(self.cur_cA, self.cD[i])))
    if self.centers is not None:
        # Reuse centers carried over from the previous level as initialization
        if self.union_level:
            self.centers = centers_update(self.centers, self.__split_mask)
        else:
            self.centers = centers_update(self.centers)
        self.k = len(self.centers)
        kmeans = KMeans(n_clusters=self.k, init=self.centers)
    else:
        kmeans = KMeans(n_clusters=self.k)
    kmeans.fit(X)
    self.labels = kmeans.predict(X)
    self.centers = kmeans.cluster_centers_
    self.clusters = Clusters(self.data, self.labels, self.reconstruct)
    return self.clusters
def diveg(f="data/weather.csv", loud=False):
    the = about.defaults()
    t = Tab(file=f)
    for x in sorted(t.rows):
        print(x.y())
    c = Clusters(t, the, cols=t.cols.y, loud=loud)
    print([col.txt for col in t.cols.y])
    leaves = sorted(c.all)
    for t1 in c.all:
        print(rs(t1.y()))
    for span in leaves[0].bins(leaves[-1], the):
        print(span)
    print(the)
def __pyp_means(self, freq, lamb, theta):
    ## INITIALIZATION
    # Derive the value and count arrays from the frequency table
    values = np.asarray(list(freq.keys()))
    counts = np.asarray(list(freq.values()))
    N = len(values)
    m1 = np.dot(values, counts) / float(N)  # weighted mean serves as the first center
    clusters = Clusters()
    clusters.assignments[m1] = []
    clusters.c = 1
    clusters.map = []
    D_r = []  # points left unclustered in the current pass
    notConverged = True
    epsilon = .001
    ## ITERATE UNTIL CONVERGENCE
    while notConverged:
        # Snapshot, not an alias, so convergence can be measured against it
        old_clusters = copy.deepcopy(clusters)
        # main for loop
        for x in freq.keys():
            d = dict()
            for cluster_mean in clusters.assignments.keys():
                d[cluster_mean] = np.power(x - cluster_mean, 2)  # squared difference between x and cluster mean
            min_d = min(d.values())
            new_cluster = min(d, key=d.get)  # cluster mean with minimum distance from x
            if min_d - theta > lamb - np.log(clusters.c) * theta:
                D_r.append(x)
            else:
                clusters = assign_cluster(x, new_cluster, clusters)
        clusters = re_cluster(D_r, clusters, freq, lamb, theta)  # re-cluster the unclustered data set
        clusters = center_agglomeration(clusters, lamb, theta)  # center agglomeration
        clusters = update(clusters, freq)  # update c, re-compute cluster means
        notConverged = check_convergence(old_clusters, clusters, epsilon)
    return clusters
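# Hedged sketch of the new-cluster rule used in __pyp_means above: a point is
# deferred for re-clustering when its squared distance to the nearest center
# exceeds the cluster-count-dependent penalty. Names here are illustrative.
import numpy as np

def should_defer(x, centers, lamb, theta, c):
    """Return True when x is too far from every existing center."""
    min_d = min((x - mu) ** 2 for mu in centers)
    return min_d - theta > lamb - np.log(c) * theta

# e.g. should_defer(9.0, [1.0, 2.5], lamb=5.0, theta=0.5, c=2) -> True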
def cluster_events(self, clustering_type, features=None, **clustering_parameters):
    """Create a NILM.Clusters object as Meter attribute.

    Parameters
    ----------
    clustering_type: string
        Name of a clustering function. This function will be used to
        cluster the events. Needs to be one of the keys of the
        dictionary 'clustering_types' of the NILM.Clusters object.

    clustering_parameters: dict (optional)
        Arguments to be passed to the function which will be used to
        cluster the events. Arguments not informed will take the
        default value defined in the dictionary 'clustering_types' of
        the NILM.Clusters object.
    """
    clusters = Clusters(clustering_type, **clustering_parameters)
    clusters.clustering(self, features)
    self.clusters_ = clusters
    print("Meter: events clustered!")
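# Minimal usage sketch for cluster_events above, assuming 'meter' is a Meter
# whose events have already been detected. The "DBSCAN" key and its eps and
# min_samples parameters are assumptions about what 'clustering_types' might
# accept, not a documented key set.
# meter.cluster_events("DBSCAN", eps=35, min_samples=1)
# meter.clusters_  # the fitted NILM.Clusters object stored by the call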
def test_non_coordinate_case_integrity(self):
    # if headers of this file change, the reference for the assert needs to be updated
    cluster = Clusters(
        "../tests/data/test_1k_cluster_data.csv",
        "dec0dedfeed1111111111111",
        "addedfeed000000000000000",
        "testCluster",
    )
    assert cluster.headers == [
        "NAME",
        "x",
        "y",
        "z",
        "CLUSTER",
        "SUBCLUSTER",
        "1",
    ], "cluster instantiation should only downcase coordinate header columns"
class KMeans:
    def __init__(self, coordinates, number_of_clusters):
        self.clusters = Clusters(number_of_clusters, coordinates.shape[1])
        self.data = Data(coordinates, self.clusters)
        self.clusters.set_data(self.data)

    def start(self, termination_condition_threshold):
        self.data.update_memberships()
        while self.clusters.update_centers() >= termination_condition_threshold:
            self.data.update_memberships()
        return self.clusters.total_error()

    def plot_data(self):
        self.data.plot()
        self.clusters.plot()
        plt.show()
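# Minimal usage sketch for the KMeans wrapper above, assuming Data, Clusters,
# and matplotlib's pyplot (plt) are available in this module; iteration stops
# once the center movement drops below the threshold.
import numpy as np

points = np.random.rand(200, 2)  # 200 two-dimensional points
model = KMeans(points, number_of_clusters=3)
error = model.start(termination_condition_threshold=1e-4)
model.plot_data()  # scatter of the points plus the fitted centers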
def test_cluster_type_inference(self):
    """Confirm consistency of type inference behavior in instantiated data frame

    Note: metadata has a similar set of tests
    """
    cluster = Clusters(
        "../tests/data/cluster_NA.txt",
        "addedfeed000000000000000",
        "dec0dedfeed1111111111111",
        "testCluster",
    )
    # integers, empty cell and string as inputs for group annotation
    assert isinstance(
        cluster.file["NA_i_n_s__grp"]["group"][3], str
    ), "empty cell -> NaN, expect coercion to string"
    # integers and empty cell as inputs for group annotation
    assert isinstance(
        cluster.file["NA_i_n_grp"]["group"][3], str
    ), "empty cell -> NaN, expect coercion to string"
    # floats, empty cell and string as inputs for group annotation
    assert isinstance(
        cluster.file["NA_f_n_s__grp"]["group"][3], str
    ), "empty cell -> NaN, expect coercion to string"
    # floats and empty cell as inputs for group annotation
    assert isinstance(
        cluster.file["NA_f_n_grp"]["group"][3], str
    ), "empty cell -> NaN, expect coercion to string"
    # integers, empty cell and string as inputs for numeric annotation
    assert isinstance(
        cluster.file["NA_i_n_s__num"]["numeric"][3], float
    ), "empty cell -> NaN that remains float (not coerced)"
    # floats, empty cell and string as inputs for numeric annotation
    assert isinstance(
        cluster.file["NA_f_n_s__num"]["numeric"][3], float
    ), "empty cell -> NaN that remains float (not coerced)"
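# Hedged illustration of the coercion the test above checks: pandas reads an
# empty cell as NaN (a float); a group/categorical column is then cast to str,
# while a numeric column keeps the float NaN. The data here is made up.
import io
import pandas as pd

df = pd.read_csv(io.StringIO("group,numeric\na,1.5\n,\nb,2.0"))
print(type(df["numeric"][1]))             # float NaN stays a float
print(type(df["group"].astype(str)[1]))   # NaN coerced to the string 'nan'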
import time

def main():
    debug = Debugger()
    chrono = Chrono()
    universe = Universe(debug)
    source = Source(debug).get_source()
    bucket_chain = BucketChain(debug, chrono, universe, source)
    clusters = Clusters(debug, chrono, universe)
    algorithm = OnlineClustering(debug, universe, bucket_chain, clusters)
    while True:
        operation_time = time.time()
        if bucket_chain.is_updated():
            universe.compute_log_n_df()
            bucket_chain.compute_universal_counts()
            bucket_chain.compute_universal_tfidf()
            clusters.update_centroid_counts()
            clusters.update_centroid_tfidf()
            algorithm.pre_clustering_work()
            algorithm.online_clustering()
            clusters.remove_old_clusters()
            universe.prune_terms(clusters)
            debug.log("BUCKET FINISHED IN: " + str(time.time() - operation_time))
            clusters.debug_active_clusters()
            clusters.save_active_clusters()
def loadClusters():
    return Clusters.Clusters()
def __init__(self, cluster_file="/home/nlg-02/ar_009/paths"):
    self.clusters = Clusters.read_clusters(cluster_file)
def set_data_array(self, args, kwargs):
    return Clusters.set_data_array(*args, **kwargs)