Example #1
    def setUp(self):
        self.mocked_consul = mock.MagicMock()
        self.cluster_patch = mock.patch(
            'cluster.cluster.Cluster.consul',
            new_callable=mock.PropertyMock(return_value=self.mocked_consul))
        self.cluster_patch.start()
        self.cluster = Cluster('http://fake.host')
Example #2
    def get(self, dataset, method):
        if method not in ['kmeans', 'meanshift']:
            # invalid command, abort with 400
            abort(400)

        if dataset not in filename:
            # unknown dataset (`filename` maps dataset names to CSV files elsewhere in the module)
            return {'status': 1, 'data': 'Invalid dataset.'}

        try:
            df = pd.read_csv('results/' + filename[dataset])
            cluster = Cluster(df)
            result = None
            # dispatch to corresponding cluster method
            if method == 'meanshift':
                result = cluster.meanShift()
            elif method == 'kmeans':
                result = cluster.kMeans()
            return {'status': 0, 'data': result}

        except EnvironmentError:  # alias of OSError in Python 3; covers IOError and WindowsError where available
            return {
                'status': 1,
                'data': 'Dataset not ready or does not exist.'
            }
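A minimal sketch of how a handler like this could be wired up, assuming Flask-RESTful; the ClusterApi class name and the filename mapping are hypothetical stand-ins for the snippet's surrounding module:

import pandas as pd
from flask import Flask
from flask_restful import Api, Resource, abort

filename = {'iris': 'iris.csv'}  # hypothetical dataset-name -> CSV file map

class ClusterApi(Resource):
    def get(self, dataset, method):
        ...  # body as in the example above

app = Flask(__name__)
api = Api(app)
# both URL segments are passed as arguments to get()
api.add_resource(ClusterApi, '/cluster/<dataset>/<method>')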
Example #3
def load(directory: Path) -> "ClusterReport":
    cluster_file = directory / CLUSTER_FILENAME
    with open(cluster_file) as f:
        cluster = Cluster.deserialize(f)
    monitoring = load_monitoring_data(directory, cluster)
    profiling_data = load_profiling_data(cluster)
    return ClusterReport(
        cluster=cluster,
        monitoring=monitoring,
        profiling_data=profiling_data,
        directory=directory,
    )
Example #4
def cluster_cloud_function(request):
    # Google Cloud Function entry point: cluster all users into one group.
    from cluster.cluster import Cluster

    request_json = request.get_json()  # request payload is currently unused

    cluster: Cluster = Cluster(number_of_clusters=1)
    cluster.cluster_users()

    return "Clusters have been created."
Example #5
    def compute_average_user_performance(self):
        set_up()

        normalization: Normalization = Normalization()
        user_keys: list = Database().get_user_keys()

        for user_key in user_keys:
            normalization.calculate_average_performance(user_key)

        Cluster(1).cluster_users()
        clean_up()
Example #6
def load_monitoring_data(directory: Path, cluster: Cluster) -> MonitoringData:
    data = {}

    for (node, process) in cluster.processes():
        if "monitor" == process.key:
            trace_file = node_monitoring_trace(directory, node.hostname)
            if trace_file.exists():
                with open(trace_file) as f:
                    data[node] = MonitoringRecord.deserialize_records(f)
            else:
                logging.warning(
                    f"Monitoring trace for {node.hostname} not found at {trace_file}"
                )
    return data
Example #7
    def setUp(self):
        self.mocked_consul = mock.MagicMock()
        self.cluster_patch = mock.patch(
            'cluster.cluster.Cluster.consul',
            new_callable=mock.PropertyMock(return_value=self.mocked_consul)
        )

        self.mocked_consul.configure_mock(**{
            'catalog.nodes.return_value': [
                {'Node': 'node-1', },
                {'Node': 'node-2', },
                {'Node': 'node-3', },
                {'Node': 'node-4', },
            ]
        })
        self.cluster_patch.start()
        self.cluster = Cluster('http://fake.host')
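The **-kwargs form of configure_mock used above attaches return values to nested attributes in a single call; a minimal standalone sketch:

from unittest import mock

m = mock.MagicMock()
m.configure_mock(**{'catalog.nodes.return_value': [{'Node': 'node-1'}]})
assert m.catalog.nodes() == [{'Node': 'node-1'}]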
Example #8
def load_profiling_data(cluster: Cluster) -> ProfilingData:
    data = {}

    for (_, process) in cluster.processes():
        if PROFILER_METADATA_KEY in process.metadata:
            records = process.metadata[PROFILER_METADATA_KEY]
            process_records = {}

            for (tag, file) in records.items():
                file = Path(file)
                if file.is_file():
                    process_records[tag] = file
                else:
                    logging.warning(
                        f"Profiler record `{tag}` for `{process.key}` not found at {file}"
                    )
            data[process.key] = process_records
    return data
Example #9
    def cluster(self, matrix=None, level=None, sequence=None):
        """
        Perform hierarchical clustering.

        :param matrix: The 2D list that is currently under processing. The
            matrix contains the distances of each item with each other
        :param level: The current level of clustering
        :param sequence: The sequence number of the clustering
        """
        logger.info("Performing cluster()")

        if matrix is None:
            # create level 0, first iteration (sequence)
            level = 0
            sequence = 0
            matrix = []

        linkage = partial(self.linkage, distance_function=self.distance)
        initial_element_count = len(self._data)
        # keep merging until the matrix has only two rows left
        while len(matrix) > 2 or matrix == []:

            item_item_matrix = Matrix(self._data, linkage, True, 0)
            item_item_matrix.genmatrix(self.num_processes)
            matrix = item_item_matrix.matrix

            smallestpair = None
            mindistance = None
            rowindex = 0  # keep track of where we are in the matrix
            # find the minimum distance
            for row in matrix:
                cellindex = 0  # keep track of where we are in the matrix
                for cell in row:
                    # if we are not on the diagonal (which is always 0)
                    # and if this cell represents a new minimum...
                    cell_lt_mdist = cell < mindistance if mindistance is not None else False
                    if ((rowindex != cellindex)
                            and (cell_lt_mdist or smallestpair is None)):
                        smallestpair = (rowindex, cellindex)
                        mindistance = cell
                    cellindex += 1
                rowindex += 1

            sequence += 1
            level = matrix[smallestpair[1]][smallestpair[0]]
            cluster = Cluster(level, self._data[smallestpair[0]],
                              self._data[smallestpair[1]])

            # maintain the data by combining the two most similar items in
            # the list. We delete by index, largest first: if we removed the
            # item with the smaller index first, everything after it would
            # shift down by one and the second index would be stale. Deleting
            # the larger index first avoids that problem.
            del self._data[max(smallestpair[0], smallestpair[1])]
            del self._data[min(smallestpair[0], smallestpair[1])]
            self._data.append(cluster)  # append the merged cluster

            self.publish_progress(initial_element_count, len(self._data))

        # all the data has been merged into a single cluster hierarchy; stop
        self.__cluster_created = True
        logger.info("Call to cluster() is complete")
        return
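A standalone sketch of the delete-by-index trick described in the comment above: removing the larger index first keeps the smaller index valid.

data = ['a', 'b', 'c', 'd']
i, j = 3, 1  # the two indices to merge, in whatever order they were found
del data[max(i, j)]  # drops 'd'; positions before it are unaffected
del data[min(i, j)]  # drops 'b'
print(data)          # ['a', 'c']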
Example #10
    def test_nodes(self):
        self.cluster = Cluster()
        self.assertEqual(self.cluster.nodes,
                         ['node-1', 'node-2', 'node-3', 'node-4'])
Example #11
from math import sqrt

import numpy as np
import pandas as pd


def DTWDistance(s1, s2, w: float = np.inf):
    # dynamic time warping distance; pass an integer for a finite warping window
    DTW = {}

    w = max(w, abs(len(s1) - len(s2)))

    for i in range(-1, len(s1)):
        for j in range(-1, len(s2)):
            DTW[(i, j)] = float("inf")
    DTW[(-1, -1)] = 0

    for i in range(len(s1)):
        # Sakoe-Chiba band: only cells with j in [i - w, i + w) are filled
        for j in range(max(0, i - w), min(len(s2), i + w)):
            dist = (s1[i] - s2[j]) ** 2
            DTW[(i, j)] = dist + min(
                DTW[(i - 1, j)], DTW[(i, j - 1)], DTW[(i - 1, j - 1)]
            )

    return sqrt(DTW[len(s1) - 1, len(s2) - 1])


if __name__ == "__main__":
    data = np.array(pd.read_csv("data/data1.csv").head(10))[(0, 1, 3, 5, 6, 8), 1:]
    print(data)
    print(data.shape)
    clust = Cluster(data, metric=DTWDistance)
    clust.print(2)
    clust.dendogram()
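A quick hand-checkable call of DTWDistance on two shifted series: the optimal warping path aligns the matching 2s and 3s, leaving unit squared errors only at the two ends.

s1 = [1, 2, 3]
s2 = [2, 3, 4]
print(DTWDistance(s1, s2))       # sqrt(1 + 0 + 0 + 1) ~= 1.414
print(DTWDistance(s1, s2, w=1))  # same path fits inside a window of 1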
Example #12
def get_processes_by_hostname(cluster: Cluster, hostname: str) -> Iterator[ProcessInfo]:
    for (_, process) in cluster.get_processes(hostname=hostname):
        if is_valid_process(process):
            yield process
Example #13
    def __init__(self, cluster_info: ClusterInfo, workdir: Path):
        self.cluster_info = cluster_info
        self.workdir = workdir
        self.workdir.mkdir(exist_ok=True, parents=True)

        self.cluster = Cluster(str(self.workdir))
Example #14
class ClusterHelper:
    def __init__(self, cluster_info: ClusterInfo, workdir: Path):
        self.cluster_info = cluster_info
        self.workdir = workdir
        self.workdir.mkdir(exist_ok=True, parents=True)

        self.cluster = Cluster(str(self.workdir))

    @property
    def active_nodes(self) -> List[str]:
        return list(self.cluster.nodes.keys())

    @property
    def processes(self) -> List[Process]:
        processes = []
        for node in self.cluster.nodes.values():
            processes += node.processes
        return processes

    def commit(self):
        with open(self.workdir / CLUSTER_FILENAME, "w") as f:
            self.cluster.serialize(f)

    def stop(self, use_sigint=False):
        start = time.time()

        fn = functools.partial(kill_fn, use_sigint)
        self.cluster.kill(fn)
        logging.debug(f"Cluster killed in {time.time() - start} seconds")

    def start_processes(self, processes: List[StartProcessArgs]):
        def prepare_workdir(workdir: Path) -> Path:
            workdir = workdir if workdir else self.workdir
            workdir.mkdir(parents=True, exist_ok=True)
            return workdir.absolute()

        pool_args = [
            dataclasses.replace(args, workdir=prepare_workdir(args.workdir))
            for args in processes
        ]

        logging.debug(f"Starting cluster processes: {pool_args}")

        for process in pool_args:
            logging.debug(f"Command: {' '.join(process.args)}")
        spawned = []
        if len(pool_args) == 1:
            spawned.append(start_process_pool(pool_args[0]))
        else:
            with Pool() as pool:
                for res in pool.map(start_process_pool, pool_args):
                    spawned.append(res)

        for (process, args) in zip(spawned, pool_args):
            self.cluster.add(process=process, key=args.name, **args.metadata)

    def start_monitoring(self, nodes: List[str], observe_processes=False):
        if not self.cluster_info.monitor_nodes:
            return

        init_cmd = []
        pyenv = get_pyenv_from_env()
        if pyenv:
            init_cmd += [f"source {pyenv}/bin/activate"]
        else:
            logging.warning(
                "No Python virtualenv detected. Monitoring will probably not work."
            )

        nodes = sorted(set(nodes))
        workdir = self.workdir / "monitoring"
        processes = []
        for node in nodes:
            args = [
                "python",
                str(MONITOR_SCRIPT_PATH),
                str(node_monitoring_trace(self.workdir, node)),
            ]
            if observe_processes:
                node_processes = self.cluster.get_processes(hostname=node)
                pids = [str(process.pid) for (_, process) in node_processes]
                args += ["--observe-pids", ",".join(pids)]
            process = StartProcessArgs(
                args=args,
                hostname=node,
                name="monitor",
                workdir=workdir,
                init_cmd=init_cmd,
            )
            processes.append(process)
        self.start_processes(processes)
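A hypothetical end-to-end usage of ClusterHelper, assuming a prepared ClusterInfo (cluster_info) and a list of StartProcessArgs (worker_args), neither of which is shown in this snippet:

helper = ClusterHelper(cluster_info, workdir=Path("/tmp/experiment"))
helper.start_processes(worker_args)            # spawn the workers
helper.start_monitoring(helper.active_nodes)   # one monitor process per node
helper.commit()                                # serialize cluster state to disk
helper.stop()                                  # kill everything at the end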
Example #15
class TestChecks(TestCase):
    def setUp(self):
        self.mocked_consul = mock.MagicMock()
        self.cluster_patch = mock.patch(
            'cluster.cluster.Cluster.consul',
            new_callable=mock.PropertyMock(return_value=self.mocked_consul))
        self.cluster_patch.start()
        self.cluster = Cluster('http://fake.host')

    def tearDown(self):
        self.cluster_patch.stop()

    def test_checks_empty_result(self):
        self.assertFalse(self.cluster.checks())

    def fill_data(self):
        def consul_health_state(state):
            if state == 'passing':
                return [{
                    'Node': 'node-1',
                    'ServiceID': 'service-1',
                    'ServiceName': 'Service 1',
                    'Status': state,
                    'Output': "check output",
                    'Name': "Check Service 1",
                }, {
                    'Node': 'node-2',
                    'ServiceID': 'service-2',
                    'ServiceName': 'Service 2',
                    'Status': state,
                    'Output': "check output",
                    'Name': "Check Service 2",
                }, {
                    'Node': 'node-2',
                    'ServiceID': 'service-2',
                    'ServiceName': 'Service 2',
                    'Status': state,
                    'Output': "check output 2",
                    'Name': "Check Service 2.2",
                }]
            elif state == 'critical':
                return [{
                    'Node': 'node-2',
                    'ServiceID': 'service-3',
                    'ServiceName': 'Service 3',
                    'Status': state,
                    'Output': "check output error",
                    'Name': "Check Service 3",
                }]
            else:
                return []

        self.mocked_consul.configure_mock(
            **{
                'health.state.side_effect': consul_health_state,
            })

    def test_checks_all(self):
        self.fill_data()
        self.maxDiff = None
        self.assertEqual(
            self.cluster.checks(all=True), {
                'node-1': {
                    'service-1': {
                        'name': "Service 1",
                        'checks': [
                            (
                                'Check Service 1',
                                'passing',
                                'check output',
                            ),
                        ],
                    },
                },
                'node-2': {
                    'service-2': {
                        'name': "Service 2",
                        'checks': [
                            (
                                'Check Service 2',
                                'passing',
                                'check output',
                            ),
                            (
                                'Check Service 2.2',
                                'passing',
                                'check output 2',
                            ),
                        ],
                    },
                    'service-3': {
                        'name': "Service 3",
                        'checks': [
                            (
                                'Check Service 3',
                                'critical',
                                'check output error',
                            ),
                        ],
                    },
                },
            })

    def test_checks_warn(self):
        self.fill_data()
        self.assertEqual(
            self.cluster.checks(all=False), {
                'node-2': {
                    'service-3': {
                        'name': "Service 3",
                        'checks': [
                            (
                                'Check Service 3',
                                'critical',
                                'check output error',
                            ),
                        ],
                    },
                },
            })

    def test_check_command_lines(self):
        self.fill_data()
        with OutputCapture() as output:
            with mock.patch('sys.argv', [
                    'cluster',
                    'checks',
            ]):
                main()
                output.compare("\n".join([
                    "Node node-2:",
                    " - Service Service 3:",
                    "    - Cehck (critical): Check Service 3",
                ]))
Example #16
import matplotlib.pyplot as plt
import numpy as np
from tsfresh import extract_features, extract_relevant_features
from tsfresh.utilities.dataframe_functions import impute


def basic_features_extract(data):
    return extract_features(data, column_id="id", column_sort="time")


def extract_features_from_TS(Data, y):
    extracted_features = basic_features_extract(Data)
    impute(extracted_features)
    # features_filtered = select_features(extracted_features, y)
    features_filtered_direct = extract_relevant_features(
        Data, y, column_id="id", column_sort="time"
    )
    return extracted_features, features_filtered_direct


if __name__ == "__main__":
    n_series = 10
    n_clust = 4
    features = np.concatenate(
        [np.loadtxt(f"data/f{i}.csv") for i in range(1, 4)], axis=0
    )[:n_series]
    features = features[(0, 1, 3, 5, 6, 8), :]
    print(f"Data recive : {features.shape}")
    clust = Cluster(features)
    print("Cluster initialized :)")
    lengths = list(map(len, clust.get(n_series)))
    plt.plot(list(range(len(lengths))), lengths)
    plt.show()
    clust.print(n_clust)
    clust.dendogram()
Example #17
def init(args):
    return Cluster(args.consul)
Example #18
def cluster_all(data,
                imputer=None,
                name='',
                show=True,
                save=True,
                form='png',
                figsize=(10, 8)):
    method = [
        'single', 'complete', 'average', 'weighted', 'centroid', 'median',
        'ward'
    ]
    metric = [
        'hamming', 'hamming', 'hamming', 'hamming', 'euclidean', 'euclidean',
        'euclidean'
    ]

    method_names = ["Standarded"
                    ] + [f"{method[i]}({metric[i]})" for i in range(7)]

    method_rate_mat = np.zeros((8, 8), dtype=float)

    method_rate_mat[0, 0] = rf_distance(standarded_split_tree,
                                        standarded_split_tree,
                                        (len(standarded_split_tree[0]) - 1) // 2)

    for i in range(1, 8):
        c1 = Cluster(data,
                     method=method[i - 1],
                     metric=metric[i - 1],
                     imputer=imputer)
        method_rate_mat[0, i] = c1.rf_distance(standarded_split_tree)
        for j in range(1, 8):
            if i <= j:
                c2 = Cluster(data,
                             method=method[j - 1],
                             metric=metric[j - 1],
                             imputer=imputer)
                method_rate_mat[i, j] = c1.rf_distance(c2)

    rate_mate = pd.DataFrame(method_rate_mat + method_rate_mat.T,
                             index=method_names,
                             columns=method_names)

    fig, ax = plt.subplots(figsize=figsize)

    sns.heatmap(
        rate_mate,
        ax=ax,
        annot=True,
        fmt='.2f',
        center=0,
        cmap="Spectral",
    )

    # ax.tick_params(axis='x', rotation=30, ha="right")
    plt.setp(
        ax.get_xticklabels(),
        rotation=30,
        ha="right",
        rotation_mode="anchor",
        fontsize=15,
    )
    plt.setp(ax.get_yticklabels(), fontsize=15)

    if save:
        plt.savefig(f"{name}_cluster_mat.{form}", dpi=120)
    if show:
        plt.show()
    plt.clf()

    return rate_mate
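A short usage note: cluster_all returns the symmetric pairwise rf_distance matrix as a DataFrame, so individual method comparisons can be read off after the heatmap is drawn (df here is a hypothetical feature table):

rates = cluster_all(df, name="demo", show=False, save=False)
print(rates.loc["ward(euclidean)", "single(hamming)"])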