def tune_parameters(algorithm, similarity, label_encoding, chart_id):
    """Returns the tuning distances from running algorithm on chart chart_id.

    Args:
        algorithm: The algorithm used for clustering. Must be "k-means" or
            "dbscan".
        similarity: The similarity measure used for scaling the data before
            clustering. Must be "proximity" or "correlation".
        label_encoding: The method used for encoding the labels. Must be
            "none" or "one-hot".
        chart_id: The id of the file containing the data that clustering is
            run on.

    Returns:
        A string representation of the list of tuning distances, or the
        loaded data unchanged when it contains no "timeSeries" key.
    """
    data = load_data(chart_id)
    if "timeSeries" not in data:
        # Nothing to cluster; propagate the load result (e.g. an error dict).
        return data
    time_series_data, _, ts_to_labels, _, _ = clustering.time_series_array(
        data, None)
    time_series_data = clustering.preprocess(
        time_series_data, label_encoding, similarity, ts_to_labels, algorithm)
    # Each algorithm has its own tuning routine: k selection for k-means,
    # eps selection for dbscan.
    if algorithm == "k-means":
        distances = clustering.tuning_k(time_series_data)
    else:
        distances = clustering.tuning_eps(time_series_data)
    return str(distances)
def test_preprocess_none_proximity(self):
    """Preprocessing with no encoding and proximity must leave data as-is."""
    raw = [[1.883, 2.9374874, 3.927837, -1],
           [5.282929, -1, 4.28983738, 3.98198],
           [8.982978738, 5.9289227, 0, 3.938383],
           [-1, 3.7988, 4.929278, 4.9389]]
    labels_per_instance = [[0, 1], [0, 1], [0, 0], [1, 0]]
    processed = clustering.preprocess(
        np.array(raw), "none", "proximity", labels_per_instance, "k-means")
    self.assertEqual(processed.tolist(), raw)
def test_preprocess_one_hot_correlation(self):
    """Data should be shifted to 0 with the one-hot encoded labels appended."""
    with open('./data/chart-101.json', "r") as json_file:
        chart_data = json.load(json_file)
    np_data, _, instance_labels, _, _ = clustering.time_series_array(
        chart_data, None)
    processed = clustering.preprocess(
        np_data, "one-hot", "correlation", instance_labels, "k-means")
    expected = [[1, 0, 0], [0, 10, 1], [0, 10, 1], [0, 10, 1]]
    self.assertEqual(processed.tolist(), expected)
def cluster(algorithm, similarity, encoding, outlier, rep, chart_id, key=None):
    """Returns the cluster each time series was placed in.

    Args:
        algorithm: The algorithm used for clustering. Must be "k-means",
            "k-means-constrained", "k-medians", "zone" or "dbscan".
        similarity: The similarity measure used for scaling the data before
            clustering. Must be "proximity" or "correlation".
        encoding: The method used for encoding the labels. Must be "none"
            or "one-hot".
        outlier: Whether outliers are identified, must be "on" or "off".
        rep: Whether the data is represented as "lines" or "bands".
        chart_id: The id of the file containing the data that clustering is
            run on.
        key: The key for the time series labels that are saved. If None,
            then all label values may be kept, otherwise only label values
            with that key are kept.

    Returns:
        A json with a list containing the label of the cluster each time
        series was grouped in, the min_max of each cluster and the
        corresponding dates for each value if rep == "bands", otherwise
        min_max and dates are empty lists.
    """
    data = load_data(chart_id)
    if "timeSeries" not in data:
        # Nothing to cluster; propagate the load result (e.g. an error dict).
        return data
    (time_series_data, label_dict, ts_to_labels, dates,
     old_range) = clustering.time_series_array(data, key)
    ts_data_updated = clustering.preprocess(
        time_series_data, encoding, similarity, ts_to_labels, algorithm)
    # Dispatch to the requested clustering algorithm; dbscan is the fallback.
    if algorithm == "k-means":
        labels = clustering.kmeans(ts_data_updated, outlier).tolist()
    elif algorithm in ("k-means-constrained", "k-medians"):
        labels = clustering.kmeans_kmedians(
            ts_data_updated, label_dict, ts_to_labels, algorithm,
            outlier).tolist()
    elif algorithm == "zone":
        labels = clustering.cluster_zone(label_dict, ts_to_labels)
    else:
        labels = clustering.dbscan(
            ts_data_updated, similarity, encoding, outlier).tolist()
    # Band representation additionally needs per-cluster min/max envelopes.
    min_max, ordered_dates, outlier_indexes = [], [], []
    if rep == "bands":
        min_max, ordered_dates, outlier_indexes = clustering.clusters_min_max(
            time_series_data, labels, dates, old_range, outlier)
    return jsonify({
        "cluster_labels": labels,
        "min_max": min_max,
        "dates": ordered_dates,
        "outlier_indexes": outlier_indexes
    })
def test_preprocess_none_correlation(self):
    """Correlation scaling with no encoding should shift each row to zero."""
    raw = [[1.883, 2.9374874, 3.927837, -1],
           [5.282929, -1, 4.28983738, 3.98198],
           [8.982978738, 5.9289227, 0, 3.938383],
           [1, 3.7988, 4.929278, 4.93081]]
    labels_per_instance = [[0], [0], [0], [0]]
    processed = clustering.preprocess(
        np.array(raw), "none", "correlation", labels_per_instance, "k-means")
    expected = [[2.883, 3.9374874, 4.927837, 0],
                [6.282929, 0, 5.28983738, 4.98198],
                [8.982978738, 5.9289227, 0, 3.938383],
                [0, 2.7988, 3.929278, 3.93081]]
    self.assertEqual(processed.tolist(), expected)
def test_preprocess_one_hot_proximity(self):
    """One-hot encoding with proximity should only append the encoded labels."""
    raw = [[1.883, 2.9374874, 3.927837, -1],
           [5.282929, -1, 4.28983738, 3.98198],
           [8.982978738, 5.9289227, 0, 3.938383],
           [-1, 3.9998, 4.929278, 4.9389]]
    labels_per_instance = [[0, 1], [0, 1], [0, 0], [1, 0]]
    processed = clustering.preprocess(
        np.array(raw), "one-hot", "proximity",
        np.array(labels_per_instance), "k-means")
    expected = [[1.883, 2.9374874, 3.927837, -1, 0, 1],
                [5.282929, -1, 4.28983738, 3.98198, 0, 1],
                [8.982978738, 5.9289227, 0, 3.938383, 0, 0],
                [-1, 3.9998, 4.929278, 4.9389, 1, 0]]
    self.assertEqual(processed.tolist(), expected)
def frequency(similarity, algorithm, label_encoding, chart_id):
    """Runs a clustering algorithm and gets the frequencies of labels per
    time series and labels per cluster.

    Args:
        similarity: The similarity measure used for scaling the data before
            clustering. Must be "proximity" or "correlation".
        algorithm: The algorithm used for clustering. Must be "k-means" or
            "k-means-constrained".
        label_encoding: The method used for encoding the labels. Must be
            "none" or "one-hot".
        chart_id: The id of the file containing the data that clustering is
            run on.

    Returns:
        A json with a list of cluster labels generated by the clustering
        run, an array of labels per time series and an array of labels per
        cluster.

    Raises:
        ValueError: If algorithm is not "k-means" or "k-means-constrained".
    """
    data = load_data(chart_id)
    if "timeSeries" not in data:
        # Nothing to cluster; propagate the load result (e.g. an error dict).
        return data
    (time_series_data, label_dict, ts_to_labels,
     _, _) = clustering.time_series_array(data, None)
    time_series_data = clustering.preprocess(
        time_series_data, label_encoding, similarity, ts_to_labels, "k-means")
    if algorithm == "k-means":
        labels = clustering.kmeans(time_series_data, "off")
    elif algorithm == "k-means-constrained":
        labels = clustering.kmeans_kmedians(
            time_series_data, label_dict, ts_to_labels, algorithm, "off")
    else:
        # Previously an unknown algorithm fell through to an
        # UnboundLocalError on `labels`; fail fast with a clear message.
        raise ValueError("Unsupported algorithm: {}".format(algorithm))
    cluster_labels = clustering.cluster_to_labels(labels, ts_to_labels)
    ordered_labels, ordered_clusters, ordered_ts = clustering.sort_labels(
        label_dict, cluster_labels, ts_to_labels)
    return jsonify({
        "labels": ordered_labels,
        "ts_labels": ordered_ts.tolist(),
        "cluster_labels": ordered_clusters.tolist()
    })