def tune_parameters(algorithm, similarity, label_encoding, chart_id):
    """Returns the tuning distances from running algorithm on chart chart_id.

    Args:
        algorithm: The algorithm used for clustering. Must be "k-means" or
            "dbscan".
        similarity: The similarity measure used for scaling the data before
            clustering. Must be "proximity" or "correlation".
        label_encoding: The method used for encoding the labels. Must be
            "none" or "one-hot".
        chart_id: The id of the file containing the data that clustering is
            run on.

    Returns:
        A string representation of the list of tuning distances, or the
        loaded data unchanged when it contains no "timeSeries" key.
    """
    data = load_data(chart_id)
    if "timeSeries" not in data:
        # Nothing to cluster; propagate the load result (e.g. an error dict).
        return data
    time_series_data, _, ts_to_labels, _, _ = clustering.time_series_array(
        data, None)
    time_series_data = clustering.preprocess(
        time_series_data, label_encoding, similarity, ts_to_labels, algorithm)
    # Each algorithm has its own tuning routine: k selection for k-means,
    # eps selection for dbscan.
    if algorithm == "k-means":
        distances = clustering.tuning_k(time_series_data)
    else:
        distances = clustering.tuning_eps(time_series_data)
    return str(distances)
def test_preprocess_none_proximity(self):
    """Preprocessing with no encoding and proximity must leave data as-is."""
    raw = [[1.883, 2.9374874, 3.927837, -1],
           [5.282929, -1, 4.28983738, 3.98198],
           [8.982978738, 5.9289227, 0, 3.938383],
           [-1, 3.7988, 4.929278, 4.9389]]
    labels_per_instance = [[0, 1], [0, 1], [0, 0], [1, 0]]
    processed = clustering.preprocess(
        np.array(raw), "none", "proximity", labels_per_instance, "k-means")
    self.assertEqual(processed.tolist(), raw)
def test_preprocess_one_hot_correlation(self):
    """Data should be shifted to 0 with the one-hot encoded labels appended."""
    with open('./data/chart-101.json', "r") as json_file:
        chart_data = json.load(json_file)
    np_data, _, instance_labels, _, _ = clustering.time_series_array(
        chart_data, None)
    processed = clustering.preprocess(
        np_data, "one-hot", "correlation", instance_labels, "k-means")
    expected = [[1, 0, 0], [0, 10, 1], [0, 10, 1], [0, 10, 1]]
    self.assertEqual(processed.tolist(), expected)
def cluster(algorithm, similarity, encoding, outlier, rep, chart_id, key=None):
    """Returns the cluster each time series was placed in.

    Args:
        algorithm: The algorithm used for clustering. Must be "k-means",
            "k-means-constrained", "k-medians", "zone" or "dbscan".
        similarity: The similarity measure used for scaling the data before
            clustering. Must be "proximity" or "correlation".
        encoding: The method used for encoding the labels. Must be "none"
            or "one-hot".
        outlier: Whether outliers are identified, must be "on" or "off".
        rep: Whether the data is represented as "lines" or "bands".
        chart_id: The id of the file containing the data that clustering is
            run on.
        key: The key for the time series labels that are saved. If None,
            then all label values may be kept, otherwise only label values
            with that key are kept.

    Returns:
        A json with a list containing the label of the cluster each time
        series was grouped in, the min_max of each cluster and the
        corresponding dates for each value if rep == "bands", otherwise
        min_max and dates are empty lists.
    """
    data = load_data(chart_id)
    if "timeSeries" not in data:
        # Nothing to cluster; propagate the load result (e.g. an error dict).
        return data
    (time_series_data, label_dict, ts_to_labels, dates,
     old_range) = clustering.time_series_array(data, key)
    ts_data_updated = clustering.preprocess(
        time_series_data, encoding, similarity, ts_to_labels, algorithm)
    # Dispatch to the requested clustering algorithm; dbscan is the fallback.
    if algorithm == "k-means":
        labels = clustering.kmeans(ts_data_updated, outlier).tolist()
    elif algorithm in ("k-means-constrained", "k-medians"):
        labels = clustering.kmeans_kmedians(
            ts_data_updated, label_dict, ts_to_labels, algorithm,
            outlier).tolist()
    elif algorithm == "zone":
        labels = clustering.cluster_zone(label_dict, ts_to_labels)
    else:
        labels = clustering.dbscan(
            ts_data_updated, similarity, encoding, outlier).tolist()
    # Band representation additionally needs per-cluster min/max envelopes.
    min_max, ordered_dates, outlier_indexes = [], [], []
    if rep == "bands":
        min_max, ordered_dates, outlier_indexes = clustering.clusters_min_max(
            time_series_data, labels, dates, old_range, outlier)
    return jsonify({
        "cluster_labels": labels,
        "min_max": min_max,
        "dates": ordered_dates,
        "outlier_indexes": outlier_indexes
    })
def test_preprocess_none_correlation(self):
    """Correlation scaling with no encoding should shift each row to zero."""
    raw = [[1.883, 2.9374874, 3.927837, -1],
           [5.282929, -1, 4.28983738, 3.98198],
           [8.982978738, 5.9289227, 0, 3.938383],
           [1, 3.7988, 4.929278, 4.93081]]
    labels_per_instance = [[0], [0], [0], [0]]
    processed = clustering.preprocess(
        np.array(raw), "none", "correlation", labels_per_instance, "k-means")
    expected = [[2.883, 3.9374874, 4.927837, 0],
                [6.282929, 0, 5.28983738, 4.98198],
                [8.982978738, 5.9289227, 0, 3.938383],
                [0, 2.7988, 3.929278, 3.93081]]
    self.assertEqual(processed.tolist(), expected)
def test_preprocess_one_hot_proximity(self):
    """One-hot encoding with proximity should only append the encoded labels."""
    raw = [[1.883, 2.9374874, 3.927837, -1],
           [5.282929, -1, 4.28983738, 3.98198],
           [8.982978738, 5.9289227, 0, 3.938383],
           [-1, 3.9998, 4.929278, 4.9389]]
    labels_per_instance = [[0, 1], [0, 1], [0, 0], [1, 0]]
    processed = clustering.preprocess(
        np.array(raw), "one-hot", "proximity",
        np.array(labels_per_instance), "k-means")
    expected = [[1.883, 2.9374874, 3.927837, -1, 0, 1],
                [5.282929, -1, 4.28983738, 3.98198, 0, 1],
                [8.982978738, 5.9289227, 0, 3.938383, 0, 0],
                [-1, 3.9998, 4.929278, 4.9389, 1, 0]]
    self.assertEqual(processed.tolist(), expected)
def frequency(similarity, algorithm, label_encoding, chart_id):
    """Runs a clustering algorithm and gets the frequencies of labels per
    time series and labels per cluster.

    Args:
        similarity: The similarity measure used for scaling the data before
            clustering. Must be "proximity" or "correlation".
        algorithm: The algorithm used for clustering. Must be "k-means" or
            "k-means-constrained".
        label_encoding: The method used for encoding the labels. Must be
            "none" or "one-hot".
        chart_id: The id of the file containing the data that clustering is
            run on.

    Returns:
        A json with a list of cluster labels generated by the clustering
        run, an array of labels per time series and an array of labels per
        cluster.

    Raises:
        ValueError: If algorithm is not "k-means" or "k-means-constrained".
    """
    data = load_data(chart_id)
    if "timeSeries" not in data:
        # Nothing to cluster; propagate the load result (e.g. an error dict).
        return data
    (time_series_data, label_dict, ts_to_labels,
     _, _) = clustering.time_series_array(data, None)
    time_series_data = clustering.preprocess(
        time_series_data, label_encoding, similarity, ts_to_labels, "k-means")
    if algorithm == "k-means":
        labels = clustering.kmeans(time_series_data, "off")
    elif algorithm == "k-means-constrained":
        labels = clustering.kmeans_kmedians(
            time_series_data, label_dict, ts_to_labels, algorithm, "off")
    else:
        # Previously an unknown algorithm fell through to an
        # UnboundLocalError on `labels`; fail fast with a clear message.
        raise ValueError("Unsupported algorithm: {}".format(algorithm))
    cluster_labels = clustering.cluster_to_labels(labels, ts_to_labels)
    ordered_labels, ordered_clusters, ordered_ts = clustering.sort_labels(
        label_dict, cluster_labels, ts_to_labels)
    return jsonify({
        "labels": ordered_labels,
        "ts_labels": ordered_ts.tolist(),
        "cluster_labels": ordered_clusters.tolist()
    })