Code example #1
File: q_15_16.py  Project: openopentw/2019-ML
def main():
    """ Main function. """
    # parse args
    parser = argparse.ArgumentParser()
    parser.add_argument('data', help='path to hw4_nolabel_train.dat')
    parser.add_argument('-t',
                        '--trial',
                        type=int,
                        default=500,
                        help='experiment times (default = 500)')
    parser.add_argument(
        '-o',
        '--output_to_png',
        default=False,
        action='store_true',
        help='Output image to files. (default is display on screen)')
    args = parser.parse_args()

    # get data
    data = get_data(args.data)

    # fit
    k_list = [2, 4, 6, 8, 10]
    avg_list = []
    var_list = []
    for k in k_list:
        err_list = []
        k_means = KMeans(k)
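        # refit k-means args.trial times and record the in-sample error of each run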
        for _ in range(args.trial):
            k_means.fit(data)
            err_list.append(k_means.calc_err())
        err_list = np.array(err_list)
        avg_list.append(err_list.mean())
        var_list.append(err_list.var())

    # plot
    plt.scatter(k_list, avg_list)
    plt.title('Average of $E_{in}$ vs. $k$')
    plt.xlabel('$k$')
    plt.ylabel('Average of $E_{in}$')
    if args.output_to_png:
        plt.savefig('q_15')
    else:
        plt.show()
    plt.clf()

    # plot
    plt.scatter(k_list, var_list)
    plt.title('Variance of $E_{in}$ vs. $k$')
    plt.xlabel('$k$')
    plt.ylabel('Variance of $E_{in}$')
    if args.output_to_png:
        plt.savefig('q_16')
    else:
        plt.show()
    plt.clf()
Code example #2
    def _initialize_params(self, data):
        """Initialize the mixture-model parameters from a k-means run on the data."""
        # use the k-means cluster centres as the initial component means
        km = KMeans(self.k)
        km.fit(data)
        self.dim = data.shape[-1]
        _, self.means = km.predict(data)
        self.means = np.unique(self.means, axis=0)
        # random mixing coefficients, normalized to sum to 1
        self.pis = np.random.uniform(0, 1, (self.k, ))
        self.pis = self.pis / np.sum(self.pis)
        # broad initial covariances (scaled identity) and zeroed responsibilities
        self.covariances = np.array([np.eye(self.dim)] * self.k) * 100000000
        self.gammas = np.zeros((data.shape[0], self.k))
Code example #3
def main(input_filepath, output_folder, k):
    """
    Receives the location of the tf-idf scores as a
    command-line Path argument.
    """
    logger = logging.getLogger(__name__)
    logger.info(
        'Training the K-Means clustering algorithm based on the TF-IDF scores')

    # Get the models/tf-idf-scores.csv file
    dataset = pd.read_csv(input_filepath)
    logger.info('Loaded data file ' + input_filepath + ' with ' +
                str(len(dataset)) + ' rows')

    # Removes the first column and formats it like a list
    x = dataset.drop(dataset.columns[0], axis=1).values
    vector_dict = generate_vector_dict(dataset)

    # Number of clusters and max. number of iterations
    km = KMeans(k=k, max_iterations=500)
    km.fit(x)
    clusters = km.get_clusters(vector_dict)

    # Based on the value of K used, change the destination filename
    filepath_list = (output_folder + MODEL_REPORT_FILENAME).rsplit('.', 1)
    output_filepath = filepath_list[0] + '-' + str(k) + '.' + filepath_list[1]

    # Calculate SSE and MSC
    sse_score = km.get_sse_score()
    logger.info('SSE Score: ' + str(sse_score))
    msc_score = km.get_msc_avg()
    logger.info('MSC Score: ' + str(msc_score))

    # Generate the results report
    generate_report(clusters, sse_score, msc_score, output_filepath)
    logger.info('Created report file on ' + output_filepath)

    # Generate / Update the results table for future plots
    if os.path.isfile(output_folder + PLOT_TABLE_FILENAME):
        # Update the existing file
        dataset = pd.read_csv(output_folder + PLOT_TABLE_FILENAME)
        dataset.set_index('K Size', inplace=True)
        k_means_results = update_plot_results_table(dataset,
                                                    (k, sse_score, msc_score))
    else:
        # Create and update the file
        dataset = create_plot_results_table()
        k_means_results = update_plot_results_table(dataset,
                                                    (k, sse_score, msc_score))
    k_means_results.to_csv(output_folder + PLOT_TABLE_FILENAME,
                           encoding='utf-8')
    logger.info('Updated report table on ' + output_folder +
                PLOT_TABLE_FILENAME)
Code example #4
File: driver.py  Project: av9ash/kmeanswithpca
def main():
    data = load_data()
    results = []
    np.random.seed(10)

    # pca_data = pca.pca(data, 2)[0]    # PCA implemented from scratch
    # pca_data = pca.pca_s(data, 2)     # PCA from the scikit-learn library

    # code for simple run where k=2
    # k=2
    # random_centroids = np.random.randint(0, 128, k)
    # km = KMeans(k)
    # km.fit(data, random_centroids)

    for k in range(2, 11):
        random_centroids = np.random.randint(0, 128, k)
        km = KMeans(k)
        results.append(km.fit(data,
                              random_centroids))  # use this line for the run without PCA
        # results.append(km.fit(pca_data, random_centroids))    # uncomment this line for the run with PCA
    plt.plot(results, list(range(2, 11)))
    # plt.show()
    plt.savefig('k_means.png')
Code example #5
    def fit(self, csr):
        """Apply bisecting k-means"""

        # initialize k-means with k=2 for bisection
        kmeans = KMeans(k=2, pct_change=self.k_means_pct_change,
                        max_iter=self.k_means_max_iter)

        # initialize list of clusters with all points
        clusters = [list(range(0, csr.shape[0]))]

        while len(clusters) < self.k:
            cluster = self.select_next_cluster(clusters)

            # bisect cluster iter times and select both clusters from split with lowest SSE
            lowest_sse = None
            best_split = None
            for i in range(self.n_iters):
                print('Bisecting run # %d/%d, iter # %d/%d' % (len(clusters)+1,
                                                               self.k-1, i+1,
                                                               self.n_iters))

                # split cluster in two using k-means of 2
                bisection = kmeans.fit(csr, cluster)
                split = lambda data, l: [cluster[j] for j, d in enumerate(data) if d == l]
                x, y = split(bisection, 0), split(bisection, 1)

                # calculate total SSE of both clusters and store if lowest so far
                sse_total = self.sse(csr[x, :]) + self.sse(csr[y, :])
                if lowest_sse is None or sse_total < lowest_sse:
                    lowest_sse = sse_total
                    best_split = (x, y)

            # add best cluster split to list
            clusters.extend(best_split)

        return self.label_clusters(csr, clusters)
Code example #6
File: p1_k_means_cluster.py  Project: wangyanhit/AI
import numpy as np
import matplotlib.pyplot as plt
from k_means import KMeans

k_means = KMeans(2)
X = np.loadtxt("realdata.txt")[:, 1:]
k_means.fit(X)
labels = k_means.labels_

plt.xlabel('Length')
plt.ylabel('Width')
handles = []
s1 = plt.scatter(X[labels == 0, 0],
                 X[labels == 0, 1],
                 color='r',
                 label="Cluter1",
                 marker='o')
handles.append(s1)
s2 = plt.scatter(X[labels == 1, 0],
                 X[labels == 1, 1],
                 color='k',
                 label="Cluter2",
                 marker='^')
handles.append(s2)

plt.legend(handles=handles)
plt.title('K-means')
plt.show()
Code example #7
File: test.py  Project: SalahBelila/naive_k-means
import numpy as np
from k_means import KMeans
from distance import euclidean
from mean import mean
import pickle

DATA_PATH = r'D:\datasets\mnist\large_dataset\mnist_train.csv'

print('Loading Data')
f = open(DATA_PATH, 'r')
data_list = []
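# each CSV row holds a label followed by pixel values; keep only the pixels, scaled to [0, 1]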
for line in f.readlines():
    observation = np.asfarray(line.split(',')[1:])
    data_list.append(observation / 255)
f.close()
print('Finished Loading')

print('Fitting Started')
model = KMeans()
clusters = model.fit(data_list, 10, euclidean, mean)
print('Fitting Finished')

print('Saving Clusters to ./clusters.pkl')
f = open('./clusters.pkl', 'wb')
pickle.dump(clusters, f, protocol=pickle.HIGHEST_PROTOCOL)
f.close()
Code example #8
        print(toGreen("Done"), "\nThe data shape: {}".format(data.shape))
        if args.save_name is not None:
            save_path = "../data/" + args.save_name + '.npy'
            np.save(save_path, data)
            print("The generated data has been stored in {}".format(save_path))
    else:
        print(toRed("2. Loading saved data..."))
        file_path = "../data/" + args.use_saved_data + '.npy'
        data = np.load(file_path)
        print("Done. The data shape: {}".format(data.shape))

    # Process data using the chosen algorithm(s)
    print(toRed("3. Starting clustering..."))
    if args.mode == "KM":
        kmeans = KMeans(args.cluster_num, data.shape[0], max_iter=100)
        labels, centers, it = kmeans.fit(data)
        print(toGreen('Done'), '\nPredicted labels:\n', labels, '\nPredicted cluster center points:\n', centers)
        print(toRed("4. Evaluating..."))
        score = evaluate(data, labels)
        print("score:{}".format(score))
        if args.visualize:
            print(toRed("5. Visualizing..."))
            print(toGreen("Done. Check the figure on screen, close it to exit the program."))
            visualize_single(args.cluster_num, data, labels, centers)
    elif args.mode == "KMPP":
        kmeansplus = KMeansPlus(args.cluster_num, data.shape[0], max_iter=100)
        labels, centers = kmeansplus.fit(data)
        print(toGreen('Done'), '\nPredicted labels:\n', labels, '\nPredicted cluster center points:\n', centers)
        print(toRed("4. Evaluating..."))
        score = evaluate(data, labels)
        print("score:{}".format(score))
Code example #9
    fig = plt.figure()
    # make 3D axis
    ax = fig.add_subplot(111, projection='3d')

    # scatter plots
    ax.scatter(images_proj[:, 0],
               images_proj[:, 1],
               images_proj[:, 2],
               zdir='z',
               s=10,
               c="blue",
               depthshade=True)

    # cluster and plot
    n_clusters = 345
    k_means = KMeans(n_clusters)
    error = k_means.fit(images_proj)
    labels = k_means.classify_centroids(images_proj, labels[:10000])
    centroids = k_means.centroids
    print(centroids)
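    # overlay the fitted centroids as large red markers on the same 3D axes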
    ax.scatter(centroids[:, 0],
               centroids[:, 1],
               centroids[:, 2],
               zdir='z',
               s=500,
               c="red",
               depthshade=True)

    plt.show()