def plot_cop(data_items, centroids, clusters):
    """Plot the outcome of the constraints-based clustering run.

    Args:
        data_items: RDD of data items; it is collected to the driver and
            handed to ``plot_clusters`` unchanged.
        centroids: list of centroid entries. Each entry is read as
            ``entry[1][0]``, so it is presumably a ``(key, array)`` pair whose
            array has shape ``(1, dimension)`` -- confirm with the caller.
            The list is not modified.
        clusters: RDD whose elements carry the cluster key as their first
            component's first item (``element[0]`` is used as the grouping
            key after ``zipWithIndex``).

    Relies on the module-level names ``no_clusters``, ``dimension``, ``np``,
    ``pprint`` and ``plot_clusters``.
    """
    collected_items = data_items.collect()

    # Pad a copy: the caller's centroid list must not grow as a side effect
    # of plotting.
    padded_centroids = list(centroids)

    # Clusters that ended up empty have no centroid. Add a zero placeholder
    # for each so there are `no_clusters` entries. Placeholders have to be
    # (key, array) pairs like the real entries: a bare (1, dimension) array
    # has no element [1], which made the `[1][0]` lookup below raise
    # IndexError whenever padding was needed.
    empty_clusters = no_clusters - len(padded_centroids)
    if empty_clusters > 0:
        # assumes centroid keys are hashable cluster ids -- TODO confirm
        taken_keys = {entry[0] for entry in padded_centroids}
        free_keys = [key for key in range(no_clusters)
                     if key not in taken_keys]
        for key in free_keys[:empty_clusters]:
            padded_centroids.append((key, np.zeros(shape=(1, dimension))))

    print("Centroids")
    pprint([entry[1][0] for entry in padded_centroids])

    # NOTE(review): this function used to fit two separate 2-D t-SNE
    # embeddings (one for the points, one for the centroids) and then discard
    # both, passing the raw inputs to plot_clusters. The dead computation has
    # been removed; the arguments that reach plot_clusters are the same as
    # before. If a t-SNE projection is actually wanted, points and centroids
    # should be embedded together in one fit so they share a coordinate
    # space.

    # Group the positional index of every item under its cluster key:
    # [(cluster_key, [item_index, ...]), ...]
    point_to_cluster_assignment = (
        clusters
        .zipWithIndex()
        .map(lambda x: (x[0][0], x[1]))
        .groupByKey()
        .mapValues(list)
        .collect()
    )

    plot_clusters(collected_items, padded_centroids,
                  point_to_cluster_assignment, 'Constraints Based')
def plot_kernel(data_items, centroids, clusters):
    """Collect the kernel clustering result to the driver and plot it.

    Args:
        data_items: RDD of data items; each is paired with its position and
            collected as ``(index, item)``.
        centroids: RDD of centroids, collected as-is.
        clusters: RDD of cluster assignments, collected as-is.
    """
    def index_first(pair):
        # zipWithIndex yields (item, index); the plot wants (index, item).
        item, position = pair
        return position, item

    indexed_points = data_items.zipWithIndex().map(index_first).collect()
    collected_centroids = centroids.collect()
    collected_clusters = clusters.collect()

    plot_clusters(indexed_points, collected_centroids, collected_clusters,
                  'Kernel')
def plot_pso(data_items, centroids, clusters):
    """Prepare the particle-swarm clustering result and plot it.

    Args:
        data_items: key/value RDD of data items; sorted by key before being
            collected.
        centroids: iterable of centroids; each is paired with its position.
        clusters: RDD of cluster assignments; each is paired with its
            position and collected as ``(index, assignment)``.
    """
    def index_first(pair):
        # zipWithIndex yields (item, index); flip it to (index, item).
        item, position = pair
        return position, item

    sorted_points = data_items.sortByKey().collect()
    numbered_centroids = list(enumerate(centroids))
    numbered_clusters = clusters.zipWithIndex().map(index_first).collect()

    plot_clusters(sorted_points, numbered_centroids, numbered_clusters,
                  'Particle Swarm Optimization')
def plot_default(data_items, centroids, clusters, k):
    """Prepare the default clustering result and plot it.

    Args:
        data_items: RDD of data items; collected as ``(index, item)``.
        centroids: iterable of centroids; the first ``k`` are numbered
            ``0..k-1``.
        clusters: RDD whose elements carry the cluster key as their first
            component; item positions are grouped under that key.
        k: number of centroids to pair with an index.
    """
    def index_first(pair):
        # zipWithIndex yields (item, index); flip it to (index, item).
        item, position = pair
        return position, item

    def key_with_position(pair):
        # (assignment, index) -> (cluster key, index)
        assignment, position = pair
        return assignment[0], position

    def materialize_group(group):
        # groupByKey yields a lazy iterable of values; turn it into a list.
        cluster_key, positions = group
        return cluster_key, list(positions)

    indexed_points = data_items.zipWithIndex().map(index_first).collect()

    # zip stops at the shorter input, so at most k centroids are kept.
    indexed_centroids = list(zip(range(k), centroids))

    grouped_clusters = (
        clusters
        .zipWithIndex()
        .map(key_with_position)
        .groupByKey()
        .map(materialize_group)
        .collect()
    )

    plot_clusters(indexed_points, indexed_centroids, grouped_clusters,
                  'Default PySparkless')
def plot_mahalanobis(data_items, centroids, clusters):
    """Prepare the Mahalanobis-distance clustering result and plot it.

    Each prepared structure is pretty-printed as soon as it is built.

    Args:
        data_items: RDD of data items; collected as ``(index, item)``.
        centroids: iterable of centroids; each is paired with its position.
        clusters: RDD whose elements carry the cluster key as their first
            component; item positions are grouped under that key.
    """
    def index_first(pair):
        # zipWithIndex yields (item, index); flip it to (index, item).
        item, position = pair
        return position, item

    def key_with_position(pair):
        # (assignment, index) -> (cluster key, index)
        assignment, position = pair
        return assignment[0], position

    def materialize_group(group):
        # groupByKey yields a lazy iterable of values; turn it into a list.
        cluster_key, positions = group
        return cluster_key, list(positions)

    indexed_points = data_items.zipWithIndex().map(index_first).collect()
    pprint(indexed_points)

    numbered_centroids = list(enumerate(centroids))
    pprint(numbered_centroids)

    grouped_clusters = (
        clusters
        .zipWithIndex()
        .map(key_with_position)
        .groupByKey()
        .map(materialize_group)
        .collect()
    )
    pprint(grouped_clusters)

    plot_clusters(indexed_points, numbered_centroids, grouped_clusters,
                  'Mahalanobis Distance')