def setUp(self):
    self.mocked_consul = mock.MagicMock()
    self.cluster_patch = mock.patch(
        'cluster.cluster.Cluster.consul',
        new_callable=mock.PropertyMock(return_value=self.mocked_consul))
    self.cluster_patch.start()
    self.cluster = Cluster('http://fake.host')

def get(self, dataset, method):
    if method not in ['kmeans', 'meanshift']:
        # invalid command, abort with 400
        abort(400)
    if dataset not in filename:
        # invalid dataset
        return {'status': 1, 'data': 'Invalid dataset.'}
    try:
        df = pd.read_csv('results/' + filename[dataset])
        cluster = Cluster(df)
        result = None
        # dispatch to corresponding cluster method
        if method == 'meanshift':
            result = cluster.meanShift()
        elif method == 'kmeans':
            result = cluster.kMeans()
        return {'status': 0, 'data': result}
    except EnvironmentError:
        # parent of IOError, OSError *and* WindowsError where available
        return {
            'status': 1,
            'data': 'Dataset not ready or does not exist.'
        }

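# A minimal sketch of how a handler like the one above is typically wired up with
# Flask-RESTful. The resource class name (ClusterResource) and the URL pattern are
# assumptions for illustration; the original routing is not shown in this snippet.
from flask import Flask
from flask_restful import Api, Resource, abort

app = Flask(__name__)
api = Api(app)


class ClusterResource(Resource):
    def get(self, dataset, method):
        ...  # body as defined in the snippet above


# URL parameters are passed to get() as keyword arguments by Flask-RESTful
api.add_resource(ClusterResource, '/cluster/<string:dataset>/<string:method>')

if __name__ == '__main__':
    app.run(debug=True)
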
def load(directory: Path) -> "ClusterReport":
    cluster_file = directory / CLUSTER_FILENAME
    with open(cluster_file) as f:
        cluster = Cluster.deserialize(f)
    monitoring = load_monitoring_data(directory, cluster)
    profiling_data = load_profiling_data(cluster)
    return ClusterReport(
        cluster=cluster,
        monitoring=monitoring,
        profiling_data=profiling_data,
        directory=directory,
    )

def cluster_cloud_function(request):
    import os, sys, json
    from models.user import User
    from cluster.cluster import Cluster

    request_json = request.get_json()
    cluster: Cluster = Cluster(number_of_clusters=1)
    cluster.cluster_users()
    return "Clusters have been created."

def compute_average_user_performance(self):
    set_up()
    normalization: Normalization = Normalization()
    user_keys: str = Database().get_user_keys()
    for user_key in user_keys:
        normalization.calculate_average_performance(user_key)
    Cluster(1).cluster_users()
    clean_up()

def load_monitoring_data(directory: Path, cluster: Cluster) -> MonitoringData:
    data = {}
    for (node, process) in cluster.processes():
        if "monitor" == process.key:
            trace_file = node_monitoring_trace(directory, node.hostname)
            if trace_file.exists():
                with open(trace_file) as f:
                    data[node] = MonitoringRecord.deserialize_records(f)
            else:
                logging.warning(
                    f"Monitoring trace for {node.hostname} not found at {trace_file}"
                )
    return data

def setUp(self):
    self.mocked_consul = mock.MagicMock()
    self.cluster_patch = mock.patch(
        'cluster.cluster.Cluster.consul',
        new_callable=mock.PropertyMock(return_value=self.mocked_consul)
    )
    self.mocked_consul.configure_mock(**{
        'catalog.nodes.return_value': [
            {'Node': 'node-1'},
            {'Node': 'node-2'},
            {'Node': 'node-3'},
            {'Node': 'node-4'},
        ]
    })
    self.cluster_patch.start()
    self.cluster = Cluster('http://fake.host')

def load_profiling_data(cluster: Cluster) -> ProfilingData:
    data = {}
    for (_, process) in cluster.processes():
        if PROFILER_METADATA_KEY in process.metadata:
            records = process.metadata[PROFILER_METADATA_KEY]
            process_records = {}
            for (tag, file) in records.items():
                file = Path(file)
                if file.is_file():
                    process_records[tag] = file
                else:
                    logging.warning(
                        f"Profiler record `{tag}` for `{process.key}` not found at {file}"
                    )
            data[process.key] = process_records
    return data

def cluster(self, matrix=None, level=None, sequence=None):
    """
    Perform hierarchical clustering.

    :param matrix: The 2D list that is currently under processing. The
        matrix contains the distances of each item with each other.
    :param level: The current level of clustering.
    :param sequence: The sequence number of the clustering.
    """
    logger.info("Performing cluster()")
    if matrix is None:
        # create level 0, first iteration (sequence)
        level = 0
        sequence = 0
        matrix = []

    # if the matrix only has two rows left, we are done
    linkage = partial(self.linkage, distance_function=self.distance)
    initial_element_count = len(self._data)
    while len(matrix) > 2 or matrix == []:
        item_item_matrix = Matrix(self._data, linkage, True, 0)
        item_item_matrix.genmatrix(self.num_processes)
        matrix = item_item_matrix.matrix

        smallestpair = None
        mindistance = None
        rowindex = 0  # keep track of where we are in the matrix
        # find the minimum distance
        for row in matrix:
            cellindex = 0  # keep track of where we are in the matrix
            for cell in row:
                # if we are not on the diagonal (which is always 0)
                # and if this cell represents a new minimum...
                cell_lt_mdist = cell < mindistance if mindistance else False
                if ((rowindex != cellindex) and
                        (cell_lt_mdist or smallestpair is None)):
                    smallestpair = (rowindex, cellindex)
                    mindistance = cell
                cellindex += 1
            rowindex += 1

        sequence += 1
        level = matrix[smallestpair[1]][smallestpair[0]]
        cluster = Cluster(level, self._data[smallestpair[0]],
                          self._data[smallestpair[1]])

        # maintain the data by combining the two most similar items in the
        # list. We use the min and max functions to ensure the integrity of
        # the data. Imagine: if we first remove the item with the smaller
        # index, all the rest of the items shift down by one, so the next
        # index will be wrong. We could simply adjust the value of the
        # second "remove" call, but we don't know the order in which they
        # come. The max and min approach clarifies that.
        self._data.remove(self._data[max(
            smallestpair[0], smallestpair[1])])  # remove item 1
        self._data.remove(self._data[min(
            smallestpair[0], smallestpair[1])])  # remove item 2
        self._data.append(cluster)  # append item 1 and 2 combined
        self.publish_progress(initial_element_count, len(self._data))

    # all the data is in one single cluster. We return that and stop
    self.__cluster_created = True
    logger.info("Call to cluster() is complete")
    return

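# A small standalone illustration (not part of the class above) of the max/min
# removal order described in the comments: deleting the higher index first keeps
# the lower index valid, so no index adjustment is needed afterwards.
items = ['a', 'b', 'c', 'd']
i, j = 1, 3
del items[max(i, j)]  # removes 'd'; index 1 still points at 'b'
del items[min(i, j)]  # removes 'b'
print(items)  # ['a', 'c'] -- both selected items are gone, the rest untouched
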
def test_nodes(self):
    self.cluster = Cluster()
    self.assertEqual(self.cluster.nodes,
                     ['node-1', 'node-2', 'node-3', 'node-4'])

import pandas as pd
import numpy as np
from math import sqrt

# Cluster is the project's own clustering helper; its import is not shown in
# the original snippet.


def DTWDistance(s1, s2, w: float = np.inf):
    """Dynamic time warping distance between two sequences, with an optional window w."""
    DTW = {}
    w = max(w, abs(len(s1) - len(s2)))
    for i in range(-1, len(s1)):
        for j in range(-1, len(s2)):
            DTW[(i, j)] = float("inf")
    DTW[(-1, -1)] = 0

    for i in range(len(s1)):
        for j in range(max(0, i - w), min(len(s2), i + w)):
            dist = (s1[i] - s2[j]) ** 2
            DTW[(i, j)] = dist + min(
                DTW[(i - 1, j)], DTW[(i, j - 1)], DTW[(i - 1, j - 1)]
            )

    return sqrt(DTW[len(s1) - 1, len(s2) - 1])


if __name__ == "__main__":
    data = np.array(pd.read_csv("data/data1.csv").head(10))[(0, 1, 3, 5, 6, 8), 1:]
    print(data)
    print(data.shape)
    clust = Cluster(data, metric=DTWDistance)
    clust.print(2)
    clust.dendogram()

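# A quick sanity check of DTWDistance on two toy sequences (values are arbitrary,
# chosen only for illustration). The second sequence is a stretched copy of the
# first, so the warped distance should stay small.
a = [1.0, 2.0, 3.0, 4.0]
b = [1.0, 2.0, 2.0, 3.0, 4.0]
print(DTWDistance(a, b))  # unconstrained warping window (w = inf)
print(DTWDistance(a, a))  # identical sequences -> 0.0
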
def get_processes_by_hostname(cluster: Cluster,
                              hostname: str) -> Iterator[ProcessInfo]:
    for (_, process) in cluster.get_processes(hostname=hostname):
        if is_valid_process(process):
            yield process

def __init__(self, cluster_info: ClusterInfo, workdir: Path):
    self.cluster_info = cluster_info
    self.workdir = workdir
    self.workdir.mkdir(exist_ok=True, parents=True)
    self.cluster = Cluster(str(self.workdir))

class ClusterHelper:
    def __init__(self, cluster_info: ClusterInfo, workdir: Path):
        self.cluster_info = cluster_info
        self.workdir = workdir
        self.workdir.mkdir(exist_ok=True, parents=True)
        self.cluster = Cluster(str(self.workdir))

    @property
    def active_nodes(self) -> List[str]:
        return list(self.cluster.nodes.keys())

    @property
    def processes(self) -> List[Process]:
        processes = []
        for node in self.cluster.nodes.values():
            processes += node.processes
        return processes

    def commit(self):
        with open(self.workdir / CLUSTER_FILENAME, "w") as f:
            self.cluster.serialize(f)

    def stop(self, use_sigint=False):
        start = time.time()
        fn = functools.partial(kill_fn, use_sigint)
        self.cluster.kill(fn)
        logging.debug(f"Cluster killed in {time.time() - start} seconds")

    def start_processes(self, processes: List[StartProcessArgs]):
        def prepare_workdir(workdir: Path) -> Path:
            workdir = workdir if workdir else self.workdir
            workdir.mkdir(parents=True, exist_ok=True)
            return workdir.absolute()

        pool_args = [
            dataclasses.replace(args, workdir=prepare_workdir(args.workdir))
            for args in processes
        ]
        logging.debug(f"Starting cluster processes: {pool_args}")
        for process in pool_args:
            logging.debug(f"Command: {' '.join(process.args)}")

        spawned = []
        if len(pool_args) == 1:
            spawned.append(start_process_pool(pool_args[0]))
        else:
            with Pool() as pool:
                for res in pool.map(start_process_pool, pool_args):
                    spawned.append(res)
        for (process, args) in zip(spawned, pool_args):
            self.cluster.add(process=process, key=args.name, **args.metadata)

    def start_monitoring(self, nodes: List[str], observe_processes=False):
        if not self.cluster_info.monitor_nodes:
            return

        init_cmd = []
        pyenv = get_pyenv_from_env()
        if pyenv:
            init_cmd += [f"source {pyenv}/bin/activate"]
        else:
            logging.warning(
                "No Python virtualenv detected. Monitoring will probably not work."
            )

        nodes = sorted(set(nodes))
        workdir = self.workdir / "monitoring"
        processes = []
        for node in nodes:
            args = [
                "python",
                str(MONITOR_SCRIPT_PATH),
                str(node_monitoring_trace(self.workdir, node)),
            ]
            if observe_processes:
                node_processes = self.cluster.get_processes(hostname=node)
                pids = [str(process.pid) for (_, process) in node_processes]
                args += ["--observe-pids", ",".join(pids)]
            process = StartProcessArgs(
                args=args,
                hostname=node,
                name="monitor",
                workdir=workdir,
                init_cmd=init_cmd,
            )
            processes.append(process)
        self.start_processes(processes)

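# A hypothetical usage sketch of ClusterHelper showing only the lifecycle order:
# start processes -> start monitoring -> commit the cluster file -> stop.
# Assumptions (not shown in the snippet above): ClusterInfo accepts a
# monitor_nodes flag, and the remaining StartProcessArgs fields (init_cmd,
# metadata) have defaults.
from pathlib import Path

info = ClusterInfo(monitor_nodes=True)          # assumed constructor
helper = ClusterHelper(info, workdir=Path("/tmp/cluster-run"))

worker = StartProcessArgs(                      # fields mirror start_monitoring() above
    args=["python", "worker.py"],
    hostname="node-1",
    name="worker",
    workdir=helper.workdir,
)
helper.start_processes([worker])
helper.start_monitoring(helper.active_nodes, observe_processes=True)
helper.commit()   # writes CLUSTER_FILENAME, the file that ClusterReport.load() reads back
helper.stop()
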
class TestChecks(TestCase):
    def setUp(self):
        self.mocked_consul = mock.MagicMock()
        self.cluster_patch = mock.patch(
            'cluster.cluster.Cluster.consul',
            new_callable=mock.PropertyMock(return_value=self.mocked_consul))
        self.cluster_patch.start()
        self.cluster = Cluster('http://fake.host')

    def tearDown(self):
        self.cluster_patch.stop()

    def test_checks_empty_result(self):
        self.assertFalse(self.cluster.checks())

    def fill_data(self):
        def consul_health_state(state):
            if state == 'passing':
                return [{
                    'Node': 'node-1',
                    'ServiceID': 'service-1',
                    'ServiceName': 'Service 1',
                    'Status': state,
                    'Output': "check output",
                    'Name': "Check Service 1",
                }, {
                    'Node': 'node-2',
                    'ServiceID': 'service-2',
                    'ServiceName': 'Service 2',
                    'Status': state,
                    'Output': "check output",
                    'Name': "Check Service 2",
                }, {
                    'Node': 'node-2',
                    'ServiceID': 'service-2',
                    'ServiceName': 'Service 2',
                    'Status': state,
                    'Output': "check output 2",
                    'Name': "Check Service 2.2",
                }]
            elif state == 'critical':
                return [{
                    'Node': 'node-2',
                    'ServiceID': 'service-3',
                    'ServiceName': 'Service 3',
                    'Status': state,
                    'Output': "check output error",
                    'Name': "Check Service 3",
                }]
            else:
                return []

        self.mocked_consul.configure_mock(
            **{
                'health.state.side_effect': consul_health_state,
            })

    def test_checks_all(self):
        self.fill_data()
        self.maxDiff = None
        self.assertEqual(
            self.cluster.checks(all=True), {
                'node-1': {
                    'service-1': {
                        'name': "Service 1",
                        'checks': [
                            ('Check Service 1', 'passing', 'check output'),
                        ],
                    },
                },
                'node-2': {
                    'service-2': {
                        'name': "Service 2",
                        'checks': [
                            ('Check Service 2', 'passing', 'check output'),
                            ('Check Service 2.2', 'passing', 'check output 2'),
                        ],
                    },
                    'service-3': {
                        'name': "Service 3",
                        'checks': [
                            ('Check Service 3', 'critical', 'check output error'),
                        ],
                    },
                },
            })

    def test_checks_warn(self):
        self.fill_data()
        self.assertEqual(
            self.cluster.checks(all=False), {
                'node-2': {
                    'service-3': {
                        'name': "Service 3",
                        'checks': [
                            ('Check Service 3', 'critical', 'check output error'),
                        ],
                    },
                },
            })

    def test_check_command_lines(self):
        self.fill_data()
        with OutputCapture() as output:
            with mock.patch('sys.argv', ['cluster', 'checks']):
                main()
        output.compare("\n".join([
            "Node node-2:",
            " - Service Service 3:",
            " - Cehck (critical): Check Service 3",
        ]))

import numpy as np
import matplotlib.pyplot as plt
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute

# Cluster is the project's own clustering helper; its import is not shown in
# the original snippet.


def basic_features_extract(data):
    return extract_features(data, column_id="id", column_sort="time")


def extract_features_from_TS(Data, y):
    extracted_features = basic_features_extract(Data)
    impute(extracted_features)
    # features_filtered = select_features(extracted_features, y)
    features_filtered_direct = extract_relevant_features(
        Data, y, column_id="id", column_sort="time"
    )
    return extracted_features, features_filtered_direct


if __name__ == "__main__":
    n_series = 10
    n_clust = 4
    features = np.concatenate(
        [np.loadtxt(f"data/f{i}.csv") for i in range(1, 4)], axis=0
    )[:n_series]
    features = features[(0, 1, 3, 5, 6, 8), :]
    print(f"Data received: {features.shape}")
    clust = Cluster(features)
    print("Cluster initialized :)")
    lengths = list(map(len, clust.get(n_series)))
    plt.plot(list(range(len(lengths))), lengths)
    plt.show()
    clust.print(n_clust)
    clust.dendogram()

def init(args):
    return Cluster(args.consul)

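# A hypothetical argparse setup that would produce the `args.consul` attribute
# consumed by init() above; the actual CLI definition (flag name, default) is
# not shown in this snippet, so both are illustrative assumptions.
import argparse

parser = argparse.ArgumentParser(description="Cluster status tool")
parser.add_argument("--consul", default="http://localhost:8500",
                    help="Base URL of the Consul agent")
args = parser.parse_args()
cluster = init(args)
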
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Cluster, rf_distance and standarded_split_tree come from elsewhere in the
# project and are not shown in this snippet.


def cluster_all(data, imputer=None, name='', show=True, save=True, form='png',
                figsize=(10, 8)):
    method = [
        'single', 'complete', 'average', 'weighted', 'centroid', 'median', 'ward'
    ]
    metric = [
        'hamming', 'hamming', 'hamming', 'hamming', 'euclidean', 'euclidean',
        'euclidean'
    ]
    method_names = ["Standarded"] + [f"{method[i]}({metric[i]})" for i in range(7)]
    method_rate_mat = np.zeros((8, 8), dtype=float)
    method_rate_mat[0, 0] = rf_distance(
        standarded_split_tree, standarded_split_tree,
        (len(standarded_split_tree[0]) - 1) // 2)
    for i in range(1, 8):
        c1 = Cluster(data, method=method[i - 1], metric=metric[i - 1],
                     imputer=imputer)
        method_rate_mat[0, i] = c1.rf_distance(standarded_split_tree)
        for j in range(1, 8):
            if i <= j:
                c2 = Cluster(data, method=method[j - 1], metric=metric[j - 1],
                             imputer=imputer)
                method_rate_mat[i, j] = c1.rf_distance(c2)
    rate_mate = pd.DataFrame(method_rate_mat + method_rate_mat.T,
                             index=method_names, columns=method_names)
    fig, ax = plt.subplots(figsize=figsize)
    sns.heatmap(
        rate_mate,
        ax=ax,
        annot=True,
        fmt='.2f',
        center=0,
        cmap="Spectral",
    )
    # ax.tick_params(axis='x', rotation=30, ha="right")
    plt.setp(
        ax.get_xticklabels(),
        rotation=30,
        ha="right",
        rotation_mode="anchor",
        fontsize=15,
    )
    plt.setp(ax.get_yticklabels(), fontsize=15)
    if save is True:
        plt.savefig(f"{name}_cluster_mat.{form}", dpi=120)
    if show is True:
        plt.show()
    plt.clf()
    return rate_mate