Example #1
0
    def plot_cop(data_items, centroids, clusters):
        """Plot the constraint-based clustering result using 2-D t-SNE embeddings.

        Args:
            data_items: Object exposing ``collect()`` (presumably a PySpark
                RDD); element ``[1]`` of every collected record is used as
                the point's feature vector.
            centroids: Sequence of records whose ``[1][0]`` element is the
                centroid vector. It is no longer modified in place.
            clusters: Object exposing ``zipWithIndex()`` (presumably a PySpark
                RDD); element ``[0]`` of every record is used as the cluster
                key.

        ``no_clusters``, ``dimension``, ``np``, ``pprint`` and
        ``plot_clusters`` are resolved from the enclosing scope.
        """
        from sklearn.manifold import TSNE

        # Project the feature vectors to two dimensions and key every
        # embedded point by its position in the collected data.
        point_vectors = [item[1] for item in data_items.collect()]
        points_embedded = TSNE(n_components=2).fit_transform(point_vectors)
        data_items_indexed = list(enumerate(points_embedded))

        # Clusters that ended up empty have no centroid; represent each one
        # with a zero vector so that ``no_clusters`` centroids are embedded.
        # The padding goes into a fresh list: appending bare arrays to the
        # caller's ``centroids`` both mutated the argument and broke the
        # ``[1][0]`` access used to extract the vectors.
        centroid_vectors = [centroid[1][0] for centroid in centroids]
        empty_clusters = no_clusters - len(centroids)
        centroid_vectors.extend(
            np.zeros(dimension) for _ in range(empty_clusters))
        print("Centroids")
        pprint(centroid_vectors)

        # NOTE(review): points and centroids are embedded by two independent
        # t-SNE fits, so their coordinates are not guaranteed to share a
        # common space -- confirm this is acceptable for the plot. Recent
        # scikit-learn releases may also reject the default perplexity when
        # there are only a few centroids -- TODO confirm.
        centroids_embedded = TSNE(
            n_components=2).fit_transform(centroid_vectors)
        centroids_indexed = list(enumerate(centroids_embedded))

        # Group the positions of the cluster records by their cluster key.
        point_to_cluster_assignment = clusters \
            .zipWithIndex() \
            .map(lambda record: (record[0][0], record[1])) \
            .groupByKey() \
            .mapValues(list) \
            .collect()

        # Hand over the embedded, indexed data (as the sibling plot_* helpers
        # do) instead of the raw inputs, so the embeddings are actually used.
        plot_clusters(data_items_indexed, centroids_indexed,
                      point_to_cluster_assignment, 'Constraints Based')
Example #2
0
    def plot_kernel(data_items, centroids, clusters):
        """Collect the kernel clustering result and pass it to ``plot_clusters``.

        Args:
            data_items: Object exposing ``zipWithIndex()`` (presumably a
                PySpark RDD) holding the data points.
            centroids: Object exposing ``collect()``; its records are passed
                through unchanged.
            clusters: Object exposing ``collect()``; its records are passed
                through unchanged.
        """
        # Swap each zipped pair so the second field (the position, assuming
        # RDD semantics) comes first.
        positioned_items = data_items.zipWithIndex().map(
            lambda pair: (pair[1], pair[0]))
        points = positioned_items.collect()

        centers = centroids.collect()
        assignments = clusters.collect()

        plot_clusters(points, centers, assignments, 'Kernel')
Example #3
0
    def plot_pso(data_items, centroids, clusters):
        """Prepare the particle-swarm clustering result and plot it.

        Args:
            data_items: Object exposing ``sortByKey()`` (presumably a keyed
                PySpark RDD); it is collected in key order.
            centroids: Iterable of centroids; each one is paired with its
                position in the iteration.
            clusters: Object exposing ``zipWithIndex()`` (presumably a PySpark
                RDD) holding the cluster records.
        """
        points = data_items.sortByKey().collect()

        # (position, centroid) pairs in iteration order.
        centers = list(enumerate(centroids))

        # Swap each zipped pair so the second field (the position, assuming
        # RDD semantics) comes first.
        positioned_clusters = clusters.zipWithIndex()
        assignments = positioned_clusters.map(
            lambda pair: (pair[1], pair[0])).collect()

        plot_clusters(points, centers, assignments,
                      'Particle Swarm Optimization')
Example #4
0
    def plot_default(data_items, centroids, clusters, k):
        """Prepare the default clustering result for plotting and plot it.

        Args:
            data_items: Object exposing ``zipWithIndex()`` (presumably a
                PySpark RDD) holding the data points.
            centroids: Iterable of centroids.
            clusters: Object exposing ``zipWithIndex()`` (presumably a PySpark
                RDD); element ``[0]`` of every record is used as the cluster
                key.
            k: Number of centroid indexes to generate.
        """
        # Swap each zipped pair so the second field (the position, assuming
        # RDD semantics) comes first.
        points = data_items.zipWithIndex().map(
            lambda pair: (pair[1], pair[0])).collect()

        # zip stops at the shorter input, so at most k centroids are kept.
        centers = list(zip(range(k), centroids))

        # Key every record's position by the record's cluster key, then
        # gather the positions belonging to each key into a list.
        keyed_positions = clusters.zipWithIndex().map(
            lambda pair: (pair[0][0], pair[1]))
        assignments = keyed_positions.groupByKey().map(
            lambda group: (group[0], list(group[1]))).collect()

        plot_clusters(points, centers, assignments, 'Default PySparkless')
    def plot_mahalanobis(data_items, centroids, clusters):
        """Prepare the Mahalanobis-distance clustering result and plot it.

        Each prepared structure is pretty-printed before plotting.

        Args:
            data_items: Object exposing ``zipWithIndex()`` (presumably a
                PySpark RDD) holding the data points.
            centroids: Iterable of centroids; each one is paired with its
                position in the iteration.
            clusters: Object exposing ``zipWithIndex()`` (presumably a PySpark
                RDD); element ``[0]`` of every record is used as the cluster
                key.
        """
        # Swap each zipped pair so the second field (the position, assuming
        # RDD semantics) comes first.
        points = data_items.zipWithIndex().map(
            lambda pair: (pair[1], pair[0])).collect()
        pprint(points)

        # (position, centroid) pairs in iteration order.
        centers = list(enumerate(centroids))
        pprint(centers)

        # Key every record's position by the record's cluster key, then
        # gather the positions belonging to each key into a list.
        keyed_positions = clusters.zipWithIndex().map(
            lambda pair: (pair[0][0], pair[1]))
        assignments = keyed_positions.groupByKey().map(
            lambda group: (group[0], list(group[1]))).collect()
        pprint(assignments)

        plot_clusters(points, centers, assignments, 'Mahalanobis Distance')