コード例 #1
0
def main(argv):
    """Cluster the input with the requested initialisation and write the result.

    ``DataProcessing.process_input(argv)`` supplies the data ``X``, the number
    of clusters ``K``, the initialisation mode ``init`` and ``movie_ids``.
    Depending on ``init`` the data is clustered with:

    * ``"random"``    -- ``KMeans.execute``
    * ``"k-means++"`` -- ``KMeanspp.execute``
    * ``"1d"``        -- ``OneDKmeans`` on the output of
      ``PCAHelper.pca_helper(X, 1)``

    A summary line is printed and the cluster assignment is passed to
    ``Utils.write_output_csv`` together with ``movie_ids`` under the name
    ``"output.csv"``.

    Args:
        argv: Command-line arguments, forwarded unchanged to
            ``DataProcessing.process_input``.

    Raises:
        ValueError: If ``init`` is not one of the supported modes.
    """
    X, K, init, movie_ids = DataProcessing.process_input(argv)

    if init == "random":
        clusters, _centroids, distances = KMeans.execute(X, K)
        print(
            "Ran k-means. Start Distance={:.0f}, End Distance={:.0f}. Clusters = {}."
            .format(distances[0], distances[-1], clusters))
    elif init == "k-means++":
        clusters, _centroids, distances = KMeanspp.execute(X, K)
        print(
            "Ran k-means++. Start Distance={:.0f}, End Distance={:.0f}. Clusters = {}."
            .format(distances[0], distances[-1], clusters))
    elif init == "1d":
        X = PCAHelper.pca_helper(X, 1)
        # NOTE(review): this branch used to call ``X.astype(np.float16)`` and
        # throw the result away, so the data was never actually down-cast.
        # The dead call is removed to keep results unchanged; if a float16
        # cast is really wanted, assign it: ``X = X.astype(np.float16)``.
        distances_by_k, clusters, _centroids = OneDKmeans(X, K).run()
        print("Ran 1d K-means. Distance={}".format(distances_by_k[-1]))
    else:
        # ``assert SomeException(...)`` never fails because the instance is
        # truthy; raise so a bad ``init`` is reported instead of ignored.
        raise ValueError(
            "init parameter was not inputted correctly: {!r} "
            "(expected 'random', 'k-means++' or '1d')".format(init))

    Utils.write_output_csv(clusters, "output.csv", movie_ids)
コード例 #2
0
    def main():
        """Run k-means and k-means++ on a CSV file and plot both clusterings.

        Expects ``sys.argv`` to be ``[<script>, <csv_path>, <K>]``. The CSV is
        loaded with pandas, passed through ``PCAHelper.parse_data``, clustered
        with both ``KMeans.execute`` and ``KMeanspp.execute``, and the final
        distance of each run is printed. The data is then reduced with
        ``PCAHelper.pca_helper(X, 2)`` and each cluster assignment is plotted
        via ``Utils.plot_data2``.

        Raises:
            ValueError: If the number of command-line arguments is wrong, or
                if ``K`` cannot be converted with ``int``.
        """
        # An ``assert`` on an exception instance never fails, so validate
        # explicitly. ValueError matches what the tuple unpacking below would
        # have raised for a wrong argument count, only with a usable message.
        if len(sys.argv) != 3:
            prog = sys.argv[0] if sys.argv else "<script>"
            raise ValueError(
                "usage: {} <csv_path> <K> (got {} argument(s))".format(
                    prog, max(len(sys.argv) - 1, 0)))

        _, csv_path, K = sys.argv
        K = int(K)
        X = pd.read_csv(csv_path).values
        X = PCAHelper.parse_data(X)  # Steps 1-5

        # k-means and k-means++ execution
        km_clusters, _, km_distances = KMeans.execute(X, K)
        kmpp_clusters, _, kmpp_distances = KMeanspp.execute(X, K)
        print("km dist={}, kmpp dist={}".format(km_distances[-1],
                                                kmpp_distances[-1]))

        # pca
        X = PCAHelper.pca_helper(X, 2)

        # plot: one figure per algorithm, same axes labels for both
        plots = (
            (km_clusters, "K-means clustering with PCA"),
            (kmpp_clusters, "K-means++ clustering with PCA"),
        )
        for clusters, title in plots:
            Utils.plot_data2(X,
                             K,
                             clusters,
                             title=title,
                             xaxis="First Principal Component",
                             yaxis="Second Principal Component")
コード例 #3
0
def find_optimum_k(iter_per_K=5, start_range=1, end_range=40, step=2):
    """Sweep over K and compare the average final distance of both algorithms.

    For every ``K`` in ``range(start_range, end_range, step)`` both
    ``KMeans.execute`` and ``KMeanspp.execute`` are run ``iter_per_K`` times
    on the data returned by ``process_input(sys.argv)``. The last entry of
    each run's distance sequence is averaged per ``K``; the averages are
    printed, handed to ``plot_data_opt_k`` and printed again as a summary.

    Args:
        iter_per_K: Number of runs of each algorithm per value of ``K``.
        start_range: First ``K`` to try.
        end_range: Exclusive upper bound for ``K``.
        step: Increment between consecutive ``K`` values.

    Raises:
        ValueError: If ``iter_per_K`` is smaller than 1 (an average over zero
            runs is undefined).
    """
    if iter_per_K < 1:
        raise ValueError("iter_per_K must be at least 1")

    print("Start find_optimum_k")

    # Process Input.
    X = process_input(sys.argv)

    # With the defaults this tries K = 1, 3, 5, ..., 39.
    K_values = list(range(start_range, end_range, step))
    print("Will try K = ", K_values)

    km_averages, kmpp_averages = [], []
    for K in K_values:
        print("Trying K = ", K)
        # Run both algorithms iter_per_K times and keep only the final
        # distance of every run; clusters and centroids are not needed here.
        km_final, kmpp_final = [], []
        for _run in range(iter_per_K):
            _clusters, _centroids, km_distances = KMeans.execute(X, K)
            _clusters, _centroids, kmpp_distances = KMeanspp.execute(X, K)
            km_final.append(km_distances[-1])
            kmpp_final.append(kmpp_distances[-1])

        km_ave_dist, kmpp_ave_dist = np.mean(km_final), np.mean(kmpp_final)
        print("Average Distance | K={} | km={}, kmpp = {}".format(
            K, km_ave_dist, kmpp_ave_dist))
        km_averages.append(km_ave_dist)
        kmpp_averages.append(kmpp_ave_dist)

    # Build the arrays once instead of re-allocating with np.append per K.
    km_distance_by_K = np.array(km_averages, dtype=float)
    kmpp_distance_by_K = np.array(kmpp_averages, dtype=float)

    print("Plotting Graph")
    # Plot average distance against K for both algorithms to pick the best K.
    plot_data_opt_k(km_distance_by_K, kmpp_distance_by_K, K_values)

    print("Results:")
    print("K range")
    print(K_values)
    print("K-Means Distances:")
    print(km_distance_by_K)
    print("K-Means++ Distances:")
    print(kmpp_distance_by_K)