def get_sorted_distances_from_cluster(cluster_number):
    '''
    Get the other k-means clusters ordered from least to greatest distance
    from a reference cluster.

    :param cluster_number: cluster_number of the reference cluster
    :return: list of cluster numbers (reference cluster excluded) ordered by
        ascending distance; an empty list when no cluster carries
        ``cluster_number``; None when the clusters could not be fetched
        (the error is printed)
    '''
    kmeans_clusters_request = get_kmeans_clusters()
    if kmeans_clusters_request["error"]:
        print(kmeans_clusters_request["error"])
        return None
    kmeans_clusters = kmeans_clusters_request["data"]

    # Scan the whole collection: if several clusters share the number,
    # the last one is used as the reference.
    cluster = None
    for c in kmeans_clusters:
        if c.cluster_number == cluster_number:
            cluster = c
    if cluster is None:
        # Empty *list* so the return type matches the documented contract.
        return []

    # The reference coordinates do not change inside the loop.
    reference_coordinates = cluster.get_coordinates()
    distances = {}
    for other_cluster in kmeans_clusters:
        if other_cluster.cluster_number != cluster.cluster_number:
            distances[other_cluster.cluster_number] = distance_between_coordinates(
                reference_coordinates, other_cluster.get_coordinates())

    # Sorting the keys by their distance yields the cluster numbers
    # nearest-first; ties keep insertion order because sorted() is stable.
    return sorted(distances, key=distances.get)
def find_closest_cluster(cluster_number):
    '''
    Find the closest cluster to a specific cluster.

    :param cluster_number: number of the reference cluster
    :return: cluster_number of the nearest other cluster. Falls back to
        ``cluster_number`` itself when either lookup reports an error (the
        error is printed), when the reference cluster is missing, or when
        no other cluster is closer than infinity (e.g. it is the only one)
    '''
    cluster_request = get_kmeans_cluster(cluster_number)
    clusters_request = get_kmeans_clusters()
    closest_cluster = cluster_number

    if cluster_request["error"]:
        print(cluster_request["error"])
        return closest_cluster
    if clusters_request["error"]:
        print(clusters_request["error"])
        return closest_cluster

    cluster = cluster_request["data"]
    # Guard BEFORE touching the cluster: calling get_coordinates() on a
    # missing cluster would raise instead of returning the fallback.
    if not cluster:
        return closest_cluster

    clusters = clusters_request["data"]
    cluster_coordinates = cluster.get_coordinates()

    # float("inf") works on Python 2 and 3 (sys.maxint is Python 2 only)
    # and is guaranteed to be larger than any finite distance.
    min_distance = float("inf")
    for c in clusters:
        if cluster.cluster_number != c.cluster_number:
            distance = distance_between_coordinates(cluster_coordinates,
                                                    c.get_coordinates())
            if distance < min_distance:
                min_distance = distance
                closest_cluster = c.cluster_number
    return closest_cluster
def cluster_size_upper_threshold():
    '''
    Compute the size threshold used for compatible-cluster calculations.

    The threshold is the "mean" entry of the stats that find_stats produces
    for the ingredient counts of all k-means clusters.

    :return: the "mean" stat of the cluster sizes, or 0 when the clusters
        could not be fetched (the error is printed)
    '''
    response = get_kmeans_clusters()
    failure = response["error"]
    if failure:
        print(failure)
        return 0

    # One entry per cluster: how many ingredients it holds.
    sizes = []
    for group in response["data"]:
        sizes.append(len(group.ingredients))

    return find_stats(sizes)["mean"]
def largest_cluster_size():
    '''
    Get the largest cluster size.

    :return: number of ingredients in the biggest k-means cluster; 0 when
        there are no clusters or they could not be fetched (the error is
        printed)
    '''
    clusters_request = get_kmeans_clusters()
    max_size = 0
    if clusters_request["error"]:
        print(clusters_request["error"])
        return max_size

    clusters = clusters_request["data"]
    for cluster in clusters:
        size = len(cluster.ingredients)
        # Must be ">": with "<" no size could ever replace the initial 0,
        # so the function would always report 0.
        if size > max_size:
            max_size = size
    return max_size
def smallest_cluster_size():
    '''
    Find the size of the smallest cluster.

    :return: number of ingredients in the smallest k-means cluster;
        ``sys.maxsize`` when there are no clusters or they could not be
        fetched (the error is printed)
    '''
    clusters_request = get_kmeans_clusters()
    # sys.maxsize exists on both Python 2.6+ and Python 3, whereas
    # sys.maxint was removed in Python 3 and raises AttributeError there.
    min_size = sys.maxsize
    if clusters_request["error"]:
        print(clusters_request["error"])
        return min_size

    clusters = clusters_request["data"]
    for cluster in clusters:
        size = len(cluster.ingredients)
        if size < min_size:
            min_size = size
    return min_size
def create_mean_shift_clusters():
    '''
    Run mean-shift clustering over the k-means clusters and print the label
    assigned to each of them.

    Every k-means cluster contributes its coordinate list as one sample.
    The bandwidth is estimated from those samples with quantile=0.1, and one
    line "<cluster_number>: <label>" is printed per cluster.

    :return: None; returns early without printing anything when the
        clusters could not be fetched
    '''
    response = get_kmeans_clusters()
    if response["error"]:
        return None

    # Collect the sample and its cluster number side by side so that
    # position i in both lists refers to the same cluster.
    samples = []
    numbers = []
    for group in response["data"]:
        samples.append(group.get_coordinates_list())
        numbers.append(group.cluster_number)

    points = np.array(samples)
    model = MeanShift(bandwidth=estimate_bandwidth(points, quantile=0.1))
    fitted = model.fit(points)

    for position, number in enumerate(numbers):
        print(str(number) + ": " + str(fitted.labels_[position]))
def kmeans_test():
    '''
    Write a plain-text report about the k-means clusters to
    ``<current working directory>/app/test/kmeans_clusters.txt``.

    The report has three parts: size statistics (mean, median, st dev and
    two quartiles as returned by find_stats), one section per cluster with
    its size and comma-joined ingredient strings, and one section per entry
    of get_all_kmeans_cluster_distances_dictionary() listing its items
    sorted by ascending value.

    :return: None
    :raises: whatever get_kmeans_clusters reports in its "error" field
    '''
    clusters_request = get_kmeans_clusters()
    error = clusters_request["error"]
    # Identity check: "!= None" can be hijacked by a custom __ne__.
    if error is not None:
        # NOTE(review): this only works if "error" is an exception
        # instance/class -- confirm against get_kmeans_clusters.
        raise error

    clusters = clusters_request["data"]
    # NOTE(review): helper is not defined in this part of the file; the
    # loop below treats its result as a sequence of dicts -- confirm.
    cluster_distances = get_all_kmeans_cluster_distances_dictionary()
    cluster_lengths = [len(cluster.ingredients) for cluster in clusters]
    kmeans_stats = find_stats(cluster_lengths)

    separator = "-------------------------\n"
    # (label shown in the report, key in the stats mapping), in print order
    stat_rows = (
        ("Mean", "mean"),
        ("Median", "median"),
        ("St Dev", "stdev"),
        ("Quartile 1", "qt1"),
        ("Quartile 2", "qt2"),
    )
    # os.path.join instead of hand-concatenating path separators.
    report_path = os.path.join(os.getcwd(), 'app', 'test',
                               'kmeans_clusters.txt')

    with open(report_path, 'w') as textfile:
        textfile.write("Size stats\n")
        textfile.write(separator)
        for label, stat_key in stat_rows:
            textfile.write(label + ": " + str(kmeans_stats[stat_key]) + "\n")
        textfile.write(separator)
        textfile.write("\n")

        for x, cluster in enumerate(clusters):
            textfile.write("Cluster " + str(x) + "\n")
            textfile.write(separator)
            textfile.write("Cluster size: " + str(len(cluster.ingredients))
                           + "\n")
            textfile.write(",".join(cluster.get_ingredient_strings()) + "\n")
            textfile.write(separator)
            textfile.write("\n")
        textfile.write("\n")

        for i, entry in enumerate(cluster_distances):
            textfile.write("Distances from Cluster " + str(i) + "\n")
            textfile.write(separator)
            for key, value in sorted(entry.items(),
                                     key=lambda item: item[1]):
                textfile.write("%s: %s \n" % (key, value))
            textfile.write(separator)
            textfile.write("\n")
def get_all_kmeans_cluster_distances():
    '''
    Build the full table of distances between every pair of k-means
    clusters (a cluster's distance to itself is included).

    Cluster numbers are used directly as list indices for both the outer
    and the inner lists, so they must fall within range(number of clusters).

    :return: list of lists where ``result[a][b]`` is the distance from
        cluster number ``a`` to cluster number ``b``; None when the clusters
        could not be fetched (the error is printed)
    '''
    response = get_kmeans_clusters()
    if response["error"]:
        print(response["error"])
        return None

    groups = response["data"]
    table = [None] * len(groups)
    for source in groups:
        # One row per source cluster, slot per target cluster number.
        row = [None] * len(groups)
        for target in groups:
            row[target.cluster_number] = distance_between_coordinates(
                source.get_coordinates(), target.get_coordinates())
        table[source.cluster_number] = row
    return table