Example #1
    def get(self, dataset, method):
        if method not in ['kmeans', 'meanshift']:
            # invalid command, abort with 400
            abort(400)

        if dataset not in filename:
            # unknown dataset key (`filename` maps dataset names to CSV files)
            return {'status': 1, 'data': 'Invalid dataset.'}

        try:
            df = pd.read_csv('results/' + filename[dataset])
            cluster = Cluster(df)
            result = None
            # dispatch to corresponding cluster method
            if method == 'meanshift':
                result = cluster.meanShift()
            elif method == 'kmeans':
                result = cluster.kMeans()
            return {'status': 0, 'data': result}

        except EnvironmentError:  # parent of IOError, OSError *and* WindowsError where available
            return {
                'status': 1,
                'data': 'Dataset not ready or does not exist.'
            }
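This handler reads like a Flask-RESTful Resource.get; a minimal sketch of how such a resource might be registered (class name, route, and the filename mapping are assumptions, not from the source):

from flask import Flask
from flask_restful import Api, Resource

# hypothetical registry mapping dataset names to result files
filename = {'sales': 'sales_results.csv'}

app = Flask(__name__)
api = Api(app)


class ClusterResource(Resource):
    pass  # the get() method shown above would live here


api.add_resource(ClusterResource, '/cluster/<string:dataset>/<string:method>')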
Example #2
    def setUp(self):
        self.mocked_consul = mock.MagicMock()
        self.cluster_patch = mock.patch(
            'cluster.cluster.Cluster.consul',
            new_callable=mock.PropertyMock(return_value=self.mocked_consul))
        self.cluster_patch.start()
        self.cluster = Cluster('http://fake.host')
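A patch started in setUp should also be stopped; a minimal companion tearDown (not shown in the source) would be:

    def tearDown(self):
        self.cluster_patch.stop()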
Example #3
def cluster_cloud_function(request):

    import os, sys, json  # unused in this snippet; kept from the original handler
    from models.user import User  # likewise unused here
    from cluster.cluster import Cluster

    # the JSON body is parsed but not otherwise used by this handler
    request_json = request.get_json()

    cluster: Cluster = Cluster(number_of_clusters=1)
    cluster.cluster_users()

    return "Clusters have been created."
Example #4
    def compute_average_user_performance(self):

        set_up()

        normalization: Normalization = Normalization()
        # get_user_keys() returns an iterable of keys, so the str annotation was a mislabel
        user_keys = Database().get_user_keys()

        for user_key in user_keys:
            normalization.calculate_average_performance(user_key)

        Cluster(1).cluster_users()
        clean_up()
Example #5
    def setUp(self):
        self.mocked_consul = mock.MagicMock()
        self.cluster_patch = mock.patch(
            'cluster.cluster.Cluster.consul',
            new_callable=mock.PropertyMock(return_value=self.mocked_consul)
        )

        self.mocked_consul.configure_mock(**{
            'catalog.nodes.return_value': [
                {'Node': 'node-1', },
                {'Node': 'node-2', },
                {'Node': 'node-3', },
                {'Node': 'node-4', },
            ]
        })
        self.cluster_patch.start()
        self.cluster = Cluster('http://fake.host')
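The dotted keys handed to configure_mock set attributes on nested child mocks; a standalone illustration of the same pattern:

from unittest import mock

m = mock.MagicMock()
m.configure_mock(**{'catalog.nodes.return_value': [{'Node': 'node-1'}]})
assert m.catalog.nodes() == [{'Node': 'node-1'}]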
Example #6
    def cluster(self, matrix=None, level=None, sequence=None):
        """
        Perform hierarchical clustering.

        :param matrix: The 2D list that is currently under processing. The
            matrix contains the distances of each item with each other
        :param level: The current level of clustering
        :param sequence: The sequence number of the clustering
        """
        logger.info("Performing cluster()")

        if matrix is None:
            # create level 0, first iteration (sequence)
            level = 0
            sequence = 0
            matrix = []

        linkage = partial(self.linkage, distance_function=self.distance)
        initial_element_count = len(self._data)
        # keep merging until only two rows remain in the distance matrix
        # (the empty-matrix case only triggers the first iteration)
        while len(matrix) > 2 or matrix == []:

            item_item_matrix = Matrix(self._data, linkage, True, 0)
            item_item_matrix.genmatrix(self.num_processes)
            matrix = item_item_matrix.matrix

            smallestpair = None
            mindistance = None
            rowindex = 0  # keep track of where we are in the matrix
            # find the minimum distance
            for row in matrix:
                cellindex = 0  # keep track of where we are in the matrix
                for cell in row:
                    # if we are not on the diagonal (which is always 0)
                    # and if this cell represents a new minimum...
                    # `is not None` matters: a legitimate 0.0 distance is falsy
                    cell_lt_mdist = (cell < mindistance
                                     if mindistance is not None else False)
                    if ((rowindex != cellindex)
                            and (cell_lt_mdist or smallestpair is None)):
                        smallestpair = (rowindex, cellindex)
                        mindistance = cell
                    cellindex += 1
                rowindex += 1

            sequence += 1
            level = matrix[smallestpair[1]][smallestpair[0]]
            cluster = Cluster(level, self._data[smallestpair[0]],
                              self._data[smallestpair[1]])

            # maintain the data by combining the two most similar items in
            # the list. Delete by index, largest first: if we removed the
            # smaller index first, every later item would shift down by one
            # and the second index would point at the wrong element. Using
            # del also avoids list.remove(), which drops the first *equal*
            # element and can hit the wrong one when duplicates exist.
            del self._data[max(smallestpair[0], smallestpair[1])]
            del self._data[min(smallestpair[0], smallestpair[1])]
            self._data.append(cluster)  # append item 1 and 2 combined

            self.publish_progress(initial_element_count, len(self._data))

        # all the data has been merged into a single cluster; flag it and stop
        self.__cluster_created = True
        logger.info("Call to cluster() is complete")
        return
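To see the loop's core in isolation: each pass finds the closest off-diagonal pair, fuses it, and shrinks the working list by one. A toy sketch of a single merge step, with plain tuples standing in for the module's Cluster objects:

data = [1, 2, 10]
dist = lambda a, b: abs(a - b)  # assumed toy distance function

# enumerate all off-diagonal distances, like the nested scan above
pairs = [(dist(a, b), i, j)
         for i, a in enumerate(data)
         for j, b in enumerate(data) if i != j]
level, i, j = min(pairs)       # the closest pair wins

merged = (data[i], data[j])    # stands in for Cluster(level, ...)
del data[max(i, j)]            # larger index first, so the smaller stays valid
del data[min(i, j)]
data.append(merged)
print(level, data)             # -> 1 [10, (1, 2)]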
Example #7
    def test_nodes(self):
        self.cluster = Cluster()
        self.assertEqual(self.cluster.nodes,
                         ['node-1', 'node-2', 'node-3', 'node-4'])
Example #8
from math import sqrt

import numpy as np
import pandas as pd

# Cluster is assumed to be defined elsewhere in the same module


def DTWDistance(s1, s2, w: float = np.inf):
    # DTW cost table keyed by (i, j); the -1 indices form the boundary
    DTW = {}

    w = max(w, abs(len(s1) - len(s2)))

    for i in range(-1, len(s1)):
        for j in range(-1, len(s2)):
            DTW[(i, j)] = float("inf")
    DTW[(-1, -1)] = 0

    for i in range(len(s1)):
        # int() keeps range() happy when w is a float (including np.inf)
        for j in range(int(max(0, i - w)), int(min(len(s2), i + w))):
            dist = (s1[i] - s2[j]) ** 2
            DTW[(i, j)] = dist + min(
                DTW[(i - 1, j)], DTW[(i, j - 1)], DTW[(i - 1, j - 1)]
            )

    return sqrt(DTW[len(s1) - 1, len(s2) - 1])


if __name__ == "__main__":
    data = np.array(pd.read_csv("data/data1.csv").head(10))[(0, 1, 3, 5, 6, 8), 1:]
    print(data)
    print(data.shape)
    clust = Cluster(data, metric=DTWDistance)
    clust.print(2)
    clust.dendogram()
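A quick sanity check of DTWDistance on two short series, no CSV required; the duplicated 1.0 in b warps onto a at zero cost:

a = np.array([0.0, 1.0, 2.0])
b = np.array([0.0, 1.0, 1.0, 2.0])
print(DTWDistance(a, b))  # 0.0 under warping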
Example #9
    def __init__(self, cluster_info: ClusterInfo, workdir: Path):
        self.cluster_info = cluster_info
        self.workdir = workdir
        self.workdir.mkdir(exist_ok=True, parents=True)

        self.cluster = Cluster(str(self.workdir))
Example #10
import matplotlib.pyplot as plt
import numpy as np

from tsfresh import extract_features, extract_relevant_features
from tsfresh.utilities.dataframe_functions import impute

# Cluster is assumed to be defined elsewhere in the same module


def basic_features_extract(data):
    return extract_features(data, column_id="id", column_sort="time")


def extract_features_from_TS(Data, y):
    extracted_features = basic_features_extract(Data)
    impute(extracted_features)
    # features_filtered = select_features(extracted_features, y)
    features_filtered_direct = extract_relevant_features(
        Data, y, column_id="id", column_sort="time"
    )
    return extracted_features, features_filtered_direct


if __name__ == "__main__":
    n_series = 10
    n_clust = 4
    features = np.concatenate(
        [np.loadtxt(f"data/f{i}.csv") for i in range(1, 4)], axis=0
    )[:n_series]
    features = features[(0, 1, 3, 5, 6, 8), :]
    print(f"Data recive : {features.shape}")
    clust = Cluster(features)
    print("Cluster initialized :)")
    lengths = list(map(len, clust.get(n_series)))
    plt.plot(list(range(len(lengths))), lengths)
    plt.show()
    clust.print(n_clust)
    clust.dendogram()
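extract_features and extract_relevant_features expect long-format input with explicit id and sort columns; a minimal illustration of the shape basic_features_extract assumes (values are made up):

import pandas as pd

long_df = pd.DataFrame({
    "id":    [1, 1, 1, 2, 2, 2],   # one series per id
    "time":  [0, 1, 2, 0, 1, 2],   # sort column within each series
    "value": [0.1, 0.4, 0.2, 0.9, 0.8, 0.7],
})
# basic_features_extract(long_df) then yields one feature row per id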
Example #11
def init(args):
    return Cluster(args.consul)
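init only needs a namespace with a consul attribute; a hypothetical CLI wiring (flag name and default are assumptions):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--consul', default='http://localhost:8500')
cluster = init(parser.parse_args(['--consul', 'http://fake.host']))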
Example #12
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Cluster, rf_distance and standarded_split_tree are assumed to be defined
# in the surrounding module


def cluster_all(data,
                imputer=None,
                name='',
                show=True,
                save=True,
                form='png',
                figsize=(10, 8)):
    method = [
        'single', 'complete', 'average', 'weighted', 'centroid', 'median',
        'ward'
    ]
    metric = [
        'hamming', 'hamming', 'hamming', 'hamming', 'euclidean', 'euclidean',
        'euclidean'
    ]
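    # the pairing above presumably mirrors SciPy's rule that the 'centroid',
    # 'median' and 'ward' linkages are defined only for Euclidean distance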

    method_names = ["Standarded"
                    ] + [f"{method[i]}({metric[i]})" for i in range(7)]

    method_rate_mat = np.zeros((8, 8), dtype=float)

    # RF distance of the reference tree to itself
    method_rate_mat[0, 0] = rf_distance(
        standarded_split_tree, standarded_split_tree,
        (len(standarded_split_tree[0]) - 1) // 2)

    for i in range(1, 8):
        c1 = Cluster(data,
                     method=method[i - 1],
                     metric=metric[i - 1],
                     imputer=imputer)
        method_rate_mat[0, i] = c1.rf_distance(standarded_split_tree)
        for j in range(1, 8):
            if i <= j:
                c2 = Cluster(data,
                             method=method[j - 1],
                             metric=metric[j - 1],
                             imputer=imputer)
                method_rate_mat[i, j] = c1.rf_distance(c2)

    rate_mate = pd.DataFrame(method_rate_mat + method_rate_mat.T,
                             index=method_names,
                             columns=method_names)

    fig, ax = plt.subplots(figsize=figsize)

    sns.heatmap(
        rate_mate,
        ax=ax,
        annot=True,
        fmt='.2f',
        center=0,
        cmap="Spectral",
    )

    # ax.tick_params(axis='x', rotation=30, ha="right")
    plt.setp(
        ax.get_xticklabels(),
        rotation=30,
        ha="right",
        rotation_mode="anchor",
        fontsize=15,
    )
    plt.setp(ax.get_yticklabels(), fontsize=15)

    if save:
        plt.savefig(f"{name}_cluster_mat.{form}", dpi=120)
    if show:
        plt.show()
    plt.clf()

    return rate_mate