Example #1
def km(tx, ty, rx, ry, add="", times=5):
    # build a reference assignment by clustering the true labels themselves
    errs = []

    checker = KM(n_clusters=2)
    checker.fit(ry.reshape(-1, 1))  # KMeans expects a 2-D array, so reshape the labels
    truth = checker.predict(ry.reshape(-1, 1))

    # sweep over a range of cluster counts
    for i in range(2,times):
        clusters = {x:[] for x in range(i)}
        clf = KM(n_clusters=i)
        clf.fit(tx)  #fit it to our data
        test = clf.predict(tx)
        result = clf.predict(rx)  # and test it on the testing set
        for index, val in enumerate(result):
            clusters[val].append(index)
        mapper = {x: round(sum(truth[v] for v in clusters[x])/float(len(clusters[x]))) if clusters[x] else 0 for x in range(i)}
        processed = [mapper[val] for val in result]
        errs.append(sum((processed-truth)**2) / float(len(ry)))
    plot([0, times, min(errs)-.1, max(errs)+.1],[range(2, times), errs, "ro"], "Number of Clusters", "Error Rate", "KMeans clustering error", "KM"+add)

    td = np.reshape(test, (test.size, 1))
    rd = np.reshape(result, (result.size, 1))
    newtx = np.append(tx, td, 1)
    newrx = np.append(rx, rd, 1)
    nn(newtx, ty, newrx, ry, add="onKM"+add)
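The `mapper` step above is easy to miss: each K-Means cluster ID is relabeled with the rounded mean of the true labels that land in it, which amounts to a majority vote for binary targets. A minimal standalone sketch of the same idea (the function name and vectorized form are mine, not from the snippet):

import numpy as np

def align_clusters_to_labels(pred_clusters, true_labels, n_clusters):
    # Relabel each cluster with the rounded mean of the true labels inside it,
    # i.e. a majority vote when the labels are 0/1.
    pred_clusters = np.asarray(pred_clusters)
    true_labels = np.asarray(true_labels)
    mapping = {}
    for c in range(n_clusters):
        members = true_labels[pred_clusters == c]
        mapping[c] = int(round(members.mean())) if members.size else 0
    return np.array([mapping[c] for c in pred_clusters])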
Example #2
 def KMeansRatio(self):
     '''
     Type: K-Means
     Y-axis: No Reaction
     X-axis: Reaction
     '''
     if self.authenticated:
         from sklearn.cluster import KMeans as KM
         algorithm = KM(n_clusters=2)
         categories = algorithm.fit_predict(self.allCoord)
         plt.scatter(self.allCoord[categories == 0, 0],
                     self.allCoord[categories == 0, 1],
                     c="green")
         plt.scatter(self.allCoord[categories == 1, 0],
                     self.allCoord[categories == 1, 1],
                     c="red")
         plt.scatter(algorithm.cluster_centers_[:, 0],
                     algorithm.cluster_centers_[:, 1],
                     c="black",
                     marker="*")
         for i, txt in enumerate(self.labels):
             plt.annotate(txt, (self.allCoord[i][0], self.allCoord[i][1]))
         plt.ylabel("NO REACTION")
         plt.xlabel("REACTION")
         plt.annotate("NO INFLAMMATION", algorithm.cluster_centers_[0])
         plt.annotate("CAUSES INFLAMMATION", algorithm.cluster_centers_[1])
         plt.title("K-Means: Reaction, No Reaction")
         plt.show()
Example #3
 def KMeansPartPercent(self):
     '''
     Type: K-Means
     Y-axis: % Reactions
     X-axis: # Reactions
     '''
     if self.authenticated:
         from sklearn.cluster import KMeans as KM
         algorithm = KM(n_clusters=2)
         # partPercent = np.array([np.array([x, percent]) for j in self.stuff for _, x, _, percent in j])
         categories = algorithm.fit_predict(self.partPercent)
         plt.scatter(self.partPercent[categories == 0, 0],
                     self.partPercent[categories == 0, 1],
                     c="green")
         plt.scatter(self.partPercent[categories == 1, 0],
                     self.partPercent[categories == 1, 1],
                     c="red")
         plt.scatter(algorithm.cluster_centers_[:, 0],
                     algorithm.cluster_centers_[:, 1],
                     c="black",
                     marker="*")
         for i, txt in enumerate(self.labels):
             plt.annotate(txt,
                          (self.partPercent[i][0], self.partPercent[i][1]))
         plt.ylabel("PERCENT")
         plt.xlabel("NUM OF INFLAMS")
         plt.annotate("NO INFLAMMATION", algorithm.cluster_centers_[0])
         plt.annotate("CAUSES INFLAMMATION", algorithm.cluster_centers_[1])
         plt.title("K-Means: # Reactions, % Reactions")
         plt.show()
Example #4
def extractColors(stream, maxColor):
    pixData = np.array(stream)
    h, w, d = pixData.shape
    data = np.reshape(pixData, (h * w, d))
    km = KM(n_clusters=maxColor)
    km.fit(data)
    theme = np.array(km.cluster_centers_, dtype=np.uint8).tolist()
    return getrgbAndHex(theme)
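A hypothetical usage sketch for extractColors, assuming `stream` is anything np.array() can turn into an (H, W, 3) array, e.g. a PIL image; note the snippet relies on a getrgbAndHex helper not shown here:

from PIL import Image

img = Image.open('photo.jpg').convert('RGB')  # hypothetical input file
palette = extractColors(img, maxColor=5)      # 5 dominant colors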
Example #5
    def cluster(self, inputs):
        t = time()
        helper._print('Training clusters (KMeans)...')
        kmeans = KM(n_clusters=self.num_clusters, init=self.cluster_init, max_iter=1000, tol=0.000001)
        cluster_pred = kmeans.fit_predict(inputs)
        helper._print(f'Done training clusters. Finished in {int((time() - t)/60)} minutes and {int((time() - t) % 60)} seconds!')

        return cluster_pred
Example #6
 def __init__(self, data, k, t, iter, maxE):
     #if t == 0:
     #   t = 'k-means++'
     #else:
     #   t = 'random'
     # recent scikit-learn makes these keyword-only; the positional order
     # here corresponded to n_clusters, init, n_init, max_iter
     self.kmean = KM(n_clusters=k, init=t, n_init=iter, max_iter=maxE).fit(np.array(data))
     self.labels = self.kmean.labels_
     self.clusters = self.kmean.cluster_centers_
Example #7
 def __init__(self, pixData, maxColor, useSklearn=True):
     super(KMeans, self).__init__()
     h, w, d = pixData.shape
     self.pixData = np.reshape(pixData, (h * w, d))
     self.maxColor = maxColor
     if useSklearn:
         self._KMeans = KM(n_clusters=maxColor)
     else:
         self._KMeans = KMDiy(n_clusters=maxColor)
Example #8
    def do_KM(self, letter, k, iter=200, dump=True, parr=-2):
        if type(letter) == str:
            l, v = self.find_by_start(letter)
            km_obj = KM(n_clusters=k, max_iter=iter, n_jobs=parr)
            results = km_obj.fit(v)
            if dump:
                filename = DT.now().strftime(
                    '%d%m%y-%H%M%S') + self.generate_filename() + '-k' + str(k)
                self.dump(l, results.labels_, filename)
            self.km_labels = l
            self.km_clusters = results.labels_
            print('Stored KM output in self.km_labels, self.km_clusters')

        if type(letter) == list:
            l, v = self.find_by_list(letter)
            km_obj = KM(n_clusters=k, max_iter=iter, n_jobs=parr)
            results = km_obj.fit(v)

            self.predicted_clusters = {}
            for key, label in zip(l, results.labels_):  # don't shadow k: it's reused for the filename
                self.predicted_clusters[key] = label
            self.pred_prep = sorted(self.predicted_clusters.items(),
                                    key=itemgetter(0))
            self.predicted = [i[1] for i in self.pred_prep]

            if dump:
                filename = DT.now().strftime(
                    '%d%m%y-%H%M%S') + self.generate_filename() + '-k' + str(k)
                self.dump(l, results.labels_, filename)
            self.km_labels = l
            self.km_clusters = results.labels_
            print('Stored KM output in self.km_labels, self.km_clusters')
Example #9
 def set_bin_ranges_for_property(
         self, property_to_entities: Dict[int, Set[Entity]],
         class_to_entities: Dict[int, Set[Entity]],
         property_to_timestamps: Dict[int,
                                      List[TimeStamp]], property_id: int):
     if not property_to_timestamps:
         self.load_property_to_timestamps(property_to_timestamps,
                                          property_id)
     property_values = np.array([
         ts.value for ts in property_to_timestamps[property_id]
     ]).reshape(-1, 1)
     kmeans = KM(n_clusters=self.bin_count - 1).fit(property_values)
     return list(
         sorted([centroid[0] for centroid in kmeans.cluster_centers_]))
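A standalone sketch of the same binning idea outside the class (the helper name and sample data are mine): fit bin_count - 1 one-dimensional clusters and use the sorted centroids as cut points.

import numpy as np
from sklearn.cluster import KMeans

def kmeans_bin_edges(values, bin_count):
    values = np.asarray(values, dtype=float).reshape(-1, 1)
    centers = KMeans(n_clusters=bin_count - 1).fit(values).cluster_centers_
    return sorted(c[0] for c in centers)

edges = kmeans_bin_edges([1.0, 1.2, 5.0, 5.1, 9.8, 10.0], bin_count=3)
bins = np.digitize([1.1, 5.05, 9.9], edges)  # -> bin index per value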
Example #10
def findClusterCenters(data):
    # prepare data
    nonzero_xy = np.nonzero(data)
    if len(nonzero_xy) != 2:
        return
    if nonzero_xy[0].size < 8:
        return
        
    processed_data = np.vstack((nonzero_xy[0], nonzero_xy[1])).transpose()
    
    # K-Means algorithm
    kmeans = KM(n_clusters=8, max_iter=20, n_init=5)
    kmeans.fit(processed_data)
    center_points = np.fliplr(kmeans.cluster_centers_)
    return center_points
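A hypothetical usage sketch for findClusterCenters with a synthetic sparse frame (the blob coordinates below are invented):

import numpy as np

rng = np.random.default_rng(0)
frame = np.zeros((64, 64))
for cy, cx in [(10, 12), (30, 40), (50, 20)]:
    ys = rng.integers(cy - 2, cy + 3, size=20)
    xs = rng.integers(cx - 2, cx + 3, size=20)
    frame[ys, xs] = 1.0
centers = findClusterCenters(frame)  # (x, y) pairs, thanks to the fliplr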
Example #11
 def KMeans(self):
     return1 = self.printPoints()  
     if not return1:
         return
     algorithm = KM(n_clusters=2)
     categories = algorithm.fit_predict(self.allCoord)
     print(self.allCoord)
     print(categories)
     plt.scatter(self.allCoord[categories == 0, 0], self.allCoord[categories == 0, 1], c= "green")
     plt.scatter(self.allCoord[categories == 1, 0],self.allCoord[categories == 1, 1], c="red")
     plt.scatter(algorithm.cluster_centers_[:, 0], algorithm.cluster_centers_[:, 1], c= "black", marker="*")
     print(len(self.labels), len(self.allCoord))
     for i, txt in enumerate(self.labels):
         plt.annotate(txt, (self.allCoord[i][0], self.allCoord[i][1]))
     plt.annotate("NO INFLAMMATION", algorithm.cluster_centers_[0])
     plt.annotate("CAUSES INFLAMMATION", algorithm.cluster_centers_[1])
     plt.savefig("static/" + NAME)
     self.src = NAME
Example #12
    def KMeansPercentTotal(self):
        '''
        Type: K-Means
        Y-axis: % Reactions
        X-axis: # Observations
        '''
        if self.authenticated:
            from sklearn.cluster import KMeans as KM
            algorithm = KM(n_clusters=2)
            fig = plt.figure()
            # partPercent = np.array([np.array([x, percent]) for j in self.stuff for _, x, _, percent in j])
            categories = algorithm.fit_predict(self.percentTotal)
            plt.scatter(self.percentTotal[categories == 0, 0],
                        self.percentTotal[categories == 0, 1],
                        c="green")
            plt.scatter(self.percentTotal[categories == 1, 0],
                        self.percentTotal[categories == 1, 1],
                        c="red")
            plt.scatter(algorithm.cluster_centers_[:, 0],
                        algorithm.cluster_centers_[:, 1],
                        c="black",
                        marker="*")
            for i, txt in enumerate(self.labels):
                plt.annotate(
                    txt, (self.percentTotal[i][0], self.percentTotal[i][1]))
            plt.ylabel("PERCENT")
            plt.xlabel("TOTAL")
            plt.annotate("NO INFLAMMATION", algorithm.cluster_centers_[0])
            plt.annotate("CAUSES INFLAMMATION", algorithm.cluster_centers_[1])
            plt.title("K-Means: # Observations, % Reactions")
            # plt.show()
            # mpld3.show()
            # plt.savefig()

            tmpfile = BytesIO()
            # plt.savefig('test.png')
            fig.savefig(tmpfile, format='png')
            encoded = base64.b64encode(tmpfile.getvalue())

            html = '<img src=\'data:image/png;base64,{}\'>'.format(
                encoded.decode("utf-8"))

            with open('KMeansPercentTotal.html', 'w') as f:
                f.write(html)
Example #13
def ttsf(ID, CONTENTS):
    length = len(ID)
    wei = []
    for content in CONTENTS:
        cut = jieba.cut(content, cut_all=False)
        words = (' '.join(cut))
        wei.append(words)
    vector = TfidfVectorizer()
    tfidf = vector.fit_transform(wei)
    d = tfidf.toarray()
    k = 3
    clf = KM(k)
    hehe = clf.fit_predict(d)
    results = []
    # collect the IDs of documents that share the first document's cluster
    for i in range(1, length):
        if hehe[i] == hehe[0]:
            results.append(ID[i])
    print(results)
    return results
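A hypothetical call, to make the contract concrete: ttsf returns the IDs of every later document that K-Means places in the same cluster as the first one (the IDs and texts below are invented; the snippet's own jieba/TfidfVectorizer/KM imports are assumed in scope):

ids = ['d1', 'd2', 'd3', 'd4']
docs = ['今天天气很好', '今天天气不错', '股票大跌', '股市下跌']
same_cluster_as_first = ttsf(ids, docs)  # e.g. ['d2'] if d2 clusters with d1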
Example #14
def passl_local_graph_partial(site, loc_param_indices, params):
    X = site.buff[loc_param_indices[0]]
    K, rbf_sigma, local_graph_index, n_cluster, centers_index, point_cluster_index, inter_graph_index, member_id_index = params
    nins = NN(K + 1, None, metric='euclidean').fit(X)
    W = nins.kneighbors_graph(nins._fit_X, K + 1, mode='distance')
    #W.data=W.data**2
    W.data = np.exp(-W.data**2 / rbf_sigma)
    W[np.diag_indices(W.shape[0])] = 0
    site.buff[local_graph_index] = W
    kins = KM(n_cluster)
    point_cluster = kins.fit_predict(X)
    site.buff[point_cluster_index] = point_cluster
    site.buff[centers_index] = kins.cluster_centers_
    #print(kins.cluster_centers_)
    site.buff[inter_graph_index] = {}
    member_id = []
    for i in range(n_cluster):
        member_id.append(np.where(point_cluster == i)[0])
        #print(member_id[-1])
    site.buff[member_id_index] = member_id
Example #15
    def cluster_list_k(self, k_list, data):
        data = sp.vstack(data, format='csr')
        # self.data = pd.DataFrame.sparse.from_spmatrix(v)
        # data = pd.DataFrame(sp.vstack(data, format='csr').toarray())
        # data = np.array(data)

        self.k_index = 0
        self.labels_list = []
        self.RSS_list = []
        self.centroids_list = []
        ks = []
        for k in k_list:
            # print('\n')
            print('\n' + str(k) + ":", end=" ")
            # # try:
            # labels, RSS, cents = KMeans.cluster_with_k(k, data)
            # self.RSS_list.append(RSS)
            # self.labels_list.append(np.array(labels))
            # self.centroids_list.append(cents)
            # ks.append(k)
            # # except:
            # #     print('Failed', end='')
            # try:
            sk = KM(k, n_init=1, max_iter=30).fit(data)
            self.labels_list.append(np.array(sk.labels_))
            self.centroids_list.append(sk.cluster_centers_)
            self.RSS_list.append(sk.inertia_)
            ks.append(k)
            # except:
            #     print('Failed', end='')

        plt.plot(ks, self.RSS_list)
        plt.savefig('./cluster_rss')
        plt.show()
        np.save("index" + file_post, [self.k_index])
        np.save("label" + file_post, self.labels_list)
        np.save("rss" + file_post, self.RSS_list)
        np.save("cent" + file_post, self.centroids_list)
Example #16
def em(tx, ty, rx, ry, reduced_data, add="", times=5, dataset="", alg=""):
    clf = EM(n_components=times)
    clf.fit(reduced_data)
    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02     # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired,
               aspect='auto', origin='lower')

    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    # centroids = clf.cluster_centers_
    # plt.scatter(centroids[:, 0], centroids[:, 1],
    #             marker='x', s=169, linewidths=3,
    #             color='w', zorder=10)
    plt.title(dataset + ': EM clustering (' + alg + '-reduced data)')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()

    clf = EM(n_components=times)
    clf.fit(tx)  #fit it to our data
    test = clf.predict(tx)
    result = clf.predict(rx)
    checker = EM(n_components=times)
    ry = ry.reshape(-1,1)
    checker.fit(ry)
    truth = checker.predict(ry)
    td = np.reshape(test, (test.size, 1))
    rd = np.reshape(result, (result.size, 1))
    # newtx = np.append(td)
    # newrx = np.append(rd)
    myNN(test, ty, result, ry, alg="EM_"+alg)
    errs = []
    scores = []
    # this is what we will compare to
    checker = EM(n_components=2)
    ry = ry.reshape(-1,1)
    checker.fit(ry)
    truth = checker.predict(ry)
    adj_rand = []
    v_meas = []
    mutual_info = []
    adj_mutual_info = []
    # so we do this a bunch of times
    for i in range(2,times):
        clusters = {x:[] for x in range(i)}
        # create a clusterer
        clf = EM(n_components=i)
        clf.fit(tx)  #fit it to our data
        test = clf.predict(tx)
        result = clf.predict(rx)  # and test it on the testing set
        for index, val in enumerate(result):
            clusters[val].append(index)
        mapper = {x: round(sum(truth[v] for v in clusters[x])/float(len(clusters[x]))) if clusters[x] else 0 for x in range(i)}
        processed = [mapper[val] for val in result]
        errs.append(sum((processed-truth)**2) / float(len(ry)))
        scores.append(clf.score(tx, ty))
        adj_rand.append(metrics.adjusted_rand_score(ry.ravel(), result))
        v_meas.append(metrics.v_measure_score(ry.ravel(), result))
        mutual_info.append(metrics.fowlkes_mallows_score(ry.ravel(), result))
        adj_mutual_info.append(metrics.homogeneity_score(ry.ravel(), result))
    # plot([0, times, min(scores)-.1, max(scores)+.1],[range(2, times), scores, "-"], "Number of Clusters", "Log Likelihood", dataset+": EM Log Likelihood - " + alg, dataset+"_EM_"+alg)

    # other metrics
    # names = ["Adjusted Random", "V Measure", "Mutual Info", "Adjusted Mutual Info"]
    plt.figure()
    plt.title(dataset+": EM Clustering measures - "+alg)
    plt.xlabel('Number of clusters')
    plt.ylabel('Score value')
    plt.plot(range(2,times),adj_rand, label="Adjusted Random")
    plt.plot(range(2,times),v_meas, label="V Measure")
    plt.plot(range(2,times),mutual_info, label = "Fowlkes Mallows Score")
    plt.plot(range(2,times),adj_mutual_info, label="Homogeneity Score")
    plt.legend()
    plt.savefig("EMMetrics"+dataset+"_"+alg+".png")

    kmeans = KM(n_clusters=2)
    kmeans.fit(reduced_data)

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02     # point in the mesh [x_min, x_max]x[y_min, y_max].

    # Plot the decision boundary. For that, we will assign a color to each
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # Obtain labels for each point in mesh. Use last trained model.
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired,
               aspect='auto', origin='lower')

    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    # Plot the centroids as a white X
    centroids = kmeans.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='x', s=169, linewidths=3,
                color='w', zorder=10)
    plt.title(dataset + ': K-Means clustering (' + alg + '-reduced data)\n'
              'Centroids are marked with a white cross')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()
Example #17
                                      categories=categories,
                                      shuffle=True,
                                      random_state=None,
                                      remove=('headers', 'footers'))
fp = open('Result/metrics.txt', 'w+')

# Question 1
data = dataset.data
target = dataset.target
data_matrix = tf_idf_matrix(data)
fp.write('tf_idf_matrix shape: {}\n'.format(data_matrix.shape))

# Question 2
target = [1 if i > 3 else 0 for i in target]

cluster = KM(n_clusters=2, random_state=0, max_iter=1000, n_init=30)
cluster.fit(data_matrix)
predict_target = cluster.labels_
fp.write("Contingency table: \n{}\n".format(
    contingency_matrix(target, predict_target)))

# Question 3
metrics_dict = {
    "homogeneity_score": homogeneity_score,
    'completeness_score': completeness_score,
    "v_measure_score": v_measure_score,
    "adjusted_rand_score": adjusted_rand_score,
    "adjusted_mutual_info_score": adjusted_mutual_info_score
}

for metrics_name in metrics_dict.keys():
Example #18
# SCALE THE DATA
scaler = SS()
scaler.fit(df)
scaled_data = scaler.transform(df)

# PCA
pca = PCA(n_components=2)
pca.fit(scaled_data)
x_pca = pca.transform(scaled_data)

# K MEANS - MAKES A LOT OF SENSE TO APPLY THIS ON TOP OF PCA

from sklearn.cluster import KMeans as KM

model = KM(n_clusters=2)
model.fit(df)
fig, axes = plt.subplots(1, 2, figsize=(10, 6))
fig.suptitle('Breast Cancer', fontsize=20)

axes[0].set_title('Diagnosis')
axes[0].scatter(
    x_pca[:, 0],
    x_pca[:, 1],
    c=cancer['target'],
    cmap='coolwarm',
)
axes[1].set_title('KMC')
axes[1].scatter(x_pca[:, 0], x_pca[:, 1], c=model.labels_, cmap='coolwarm')

plt.show()
Example #19
def km(tx, ty, rx, ry, reduced_data, add="", times=5, dataset="", alg=""):
    processed = []
    adj_rand = []
    v_meas = []
    mutual_info = []
    adj_mutual_info = []
    sil = []
    inertia = []
    for i in range(2,times):
        clusters = {x:[] for x in range(i)}
        clf = KM(n_clusters=i)
        clf.fit(tx)
        test = clf.predict(tx)
        result = clf.predict(rx)

        adj_rand.append(metrics.adjusted_rand_score(ry.ravel(), result))
        v_meas.append(metrics.v_measure_score(ry.ravel(), result))
        mutual_info.append(metrics.fowlkes_mallows_score(ry.ravel(), result))
        adj_mutual_info.append(metrics.homogeneity_score(ry.ravel(), result))
        inertia.append(clf.inertia_)
    plots = [adj_rand, v_meas, mutual_info, adj_mutual_info]
    plt.title(dataset+": KM Clustering measures - "+alg)
    plt.xlabel('Number of clusters')
    plt.ylabel('Score value')
    plt.plot(range(2,times), adj_rand, label="Adjusted Random")
    plt.plot(range(2,times), v_meas, label="V Measure")
    plt.plot(range(2,times), mutual_info, label = "Fowlkes Mallows Score")
    plt.plot(range(2,times), adj_mutual_info, label="Homogeneity Score")
    plt.legend()
    plt.ylim(ymin=-0.05, ymax=1.05)
    plt.savefig("KMeansMetric"+dataset+"_"+alg+".png")

    plt.figure()
    plt.title(dataset+": KMeans Inertia - "+alg)
    plt.xlabel('Number of clusters')
    plt.ylabel('Inertia')
    plt.plot(range(2,times), inertia)
    plt.savefig("KM-Inertia-"+dataset+"-"+alg+".png")

    td = np.reshape(test, (test.size, 1))
    rd = np.reshape(result, (result.size, 1))
    newtx = np.append(tx, td, 1)
    newrx = np.append(rx, rd, 1)

    h = .02     # point in the mesh [x_min, x_max]x[y_min, y_max].
    # Plot the decision boundary. For that, we will assign a color to each
    X = tx  # assumption: X was undefined here; visualize clusters on the training data
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    best_clusterer = KM(n_clusters=4)
    best_clusterer.fit(X)
    Z = best_clusterer.predict(X)
    print(len(Z))
    print(len(X))
    plt.figure(1)
    plt.clf()
    colors = ['r', 'g', 'b', 'y', 'c', 'm', '#eeefff', '#317c15', '#4479b4',
              '#6b2b9c', '#63133b', '#6c0d22', '#0c7c8c', '#67c50e', '#c5670e',
              '#946c47', '#58902a', '#54b4e4', '#e4549e', '#2b2e85']
    for i in range(0, len(X)):
        plt.plot(X[i][0], X[i][1], marker='.', color=colors[Z[i]], markersize=2)
    #plt.plot(X[:, 0], X[:, 1], 'k.', markersize=2)
    # Plot the centroids as a white X
    centroids = best_clusterer.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='x', s=169, linewidths=3,
                color='k', zorder=10)
    plt.title('K-means Clusters ' + alg)
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()
    kmeans = KM(n_clusters=3)
    kmeans.fit(tx)
    result = pd.DataFrame(kmeans.transform(tx), columns=['KM%i' % i for i in range(3)])
    my_color = pd.Series(ty).astype('category').cat.codes
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(result['KM0'], result['KM1'], result['KM2'], c=my_color, cmap="Dark2_r", s=60)
    plt.show()
    reduced_data = PCA(n_components=2).fit_transform(tx)
    kmeans = KM(n_clusters=4)
    kmeans.fit(reduced_data)
    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02     # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired,
               aspect='auto', origin='lower')

    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    centroids = kmeans.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='x', s=169, linewidths=3,
                color='w', zorder=10)
    plt.title(dataset + ': K-means clustering (' + alg + '-reduced data)\n'
              'Centroids are marked with a white cross')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()

    checker = KM(n_clusters=2)
    ry = ry.reshape(-1,1)
    checker.fit(ry)
    truth = checker.predict(ry)
    clusters = {x:[] for x in range(4)}
    clf = KM(n_clusters=4)
    clf.fit(tx)  #fit it to our data
    test = clf.predict(tx)
    result = clf.predict(rx)  # and test it on the testing set
    for index, val in enumerate(result):
        clusters[val].append(index)
    mapper = {x: round(sum(truth[v] for v in clusters[x])/float(len(clusters[x]))) if clusters[x] else 0 for x in range(4)}
    processed = [mapper[val] for val in result]
    print(sum((processed-truth)**2) / float(len(ry)))
    clf = KM(n_clusters=times)
    clf.fit(tx)  #fit it to our data
    test = clf.predict(tx)
    result = clf.predict(rx)
    checker = KM(n_clusters=times)
    ry = ry.reshape(-1,1)
    checker.fit(ry)
    truth = checker.predict(ry)
    td = np.reshape(test, (test.size, 1))
    rd = np.reshape(result, (result.size, 1))
    newtx = np.append(tx, td, 1)  # np.append needs the base array and an axis
    newrx = np.append(rx, rd, 1)
    myNN(test, ty, result, ry, alg="KM_"+alg)
    nn(newtx, ty, newrx, ry, add="onKM"+add)
Example #20
            best_model = model
            best_error = model.error
        else:
            if best_error > model.error:
                best_error = model.error
                best_model = model

    label = best_model.predict(X)
    center = best_model.center

    ####### plot ######################
    plt.figure(1)
    plt.subplot(121)
    plt.title('my KMeans')
    plt.scatter(X[:, 0], X[:, 1], c=label)
    plt.scatter(center[:, 0], center[:, 1], c='r', marker='+')

    ####### compare to scikit ########
    from sklearn.cluster import KMeans as KM

    km = KM(n_clusters=k)
    km.fit(X)
    plt.subplot(122)
    plt.title('scikit KMeans')
    plt.scatter(X[:, 0], X[:, 1], c=km.labels_)
    plt.scatter(km.cluster_centers_[:, 0],
                km.cluster_centers_[:, 1],
                c='r',
                marker='+')
    plt.show()
Example #21
 def __init__(self, imgPixels, K):
     self.imgPixels = imgPixels
     self.KM = KM(n_clusters=K, random_state=0).fit(self.imgPixels)
Example #22
def kmtable(tx, ty, rx, ry, dataset=""):
    processed = []
    adj_rand = []
    v_meas = []
    mutual_info = []
    adj_mutual_info = []
    sil = []
    inertia = []


    compressor = PCA(n_components=tx[1].size // 2)  # n_components must be an int
    compressor.fit(tx, y=ty)
    pcatx = compressor.transform(tx)
    pcarx = compressor.transform(rx)
    p = []

    compressor = ICA(n_components=tx[1].size // 2)
    compressor.fit(tx, y=ty)
    icatx = compressor.transform(tx)
    icarx = compressor.transform(rx)
    ic = []

    compressor = RandomProjection(tx[1].size // 2)
    compressor.fit(tx, y=ty)
    rptx = compressor.transform(tx)
    rprx = compressor.transform(rx)
    r = []

    compressor = best(k=tx[1].size // 2)
    compressor.fit(tx, y=ty)
    kbtx = compressor.transform(tx)
    kbrx = compressor.transform(rx)
    k = []
    for i in range(2,8):
        # clusters = {x:[] for x in range(i)}
        clf = KM(n_clusters=i)
        clf.fit(pcatx)
        test = clf.predict(pcatx)
        result = clf.predict(pcarx)
        p.append(metrics.v_measure_score(ry.ravel(), result))

        clf = KM(n_clusters=i)
        clf.fit(icatx)
        test = clf.predict(icatx)
        result = clf.predict(icarx)
        ic.append(metrics.v_measure_score(ry.ravel(), result))

        clf = KM(n_clusters=i)
        clf.fit(rptx)
        test = clf.predict(rptx)
        result = clf.predict(rprx)
        r.append(metrics.v_measure_score(ry.ravel(), result))

        clf = KM(n_clusters=i)
        clf.fit(kbtx)
        test = clf.predict(kbtx)
        result = clf.predict(kbrx)
        k.append(metrics.v_measure_score(ry.ravel(), result))
        # adj_rand.append(metrics.adjusted_rand_score(ry.ravel(), result))
        # v_meas.append(metrics.v_measure_score(ry.ravel(), result))
        # mutual_info.append(metrics.fowlkes_mallows_score(ry.ravel(), result))
        # adj_mutual_info.append(metrics.homogeneity_score(ry.ravel(), result))
    plt.figure()
    plt.title(dataset+": KM Clustering & DR")
    plt.xlabel('Number of clusters')
    plt.ylabel('V Measure Score value')
    plt.plot(range(2,8), p, label="PCA")
    plt.plot(range(2,8), ic, label="ICA")
    plt.plot(range(2,8), r, label = "RP")
    plt.plot(range(2,8), k, label="KB")
    plt.legend()
    plt.ylim(ymin=-0.05, ymax=0.5)
    plt.savefig("KM_DR_"+dataset+"_VM.png", dpi=300)
Example #23
lower conversion price and more dilution to BankAmerica stock
holders, noted Daniel Williams, analyst with Sutro Group.
    Several analysts said that while they believe the Brazilian
debt problem will continue to hang over the banking industry
through the quarter, the initial shock reaction is likely to
ease over the coming weeks.
    Nevertheless, BankAmerica, which holds about 2.70 billion
dlrs in Brazilian loans, stands to lose 15-20 mln dlrs if the
interest rate is reduced on the debt, and as much as 200 mln
dlrs if Brazil pays no interest for a year, said Joseph
Arsenio, analyst with Birr, Wilson and Co.
    He noted, however, that any potential losses would not show
up in the current quarter.
'''

kmeans = KM(n_clusters=32, init='random', n_init=1, verbose=1)
kmeans.fit(features)

print(kmeans.labels_)

new_post_vector = vectorizer.transform([new_post_2])
new_post_label = kmeans.predict(new_post_vector)[0]

print "new posts label", new_post_label

similar_indices = (kmeans.labels_ == new_post_label).nonzero()[0]

similar = []

for i in similar_indices:
    dist = sp.linalg.norm((new_post_vector - features[i]).toarray())
Example #24

if __name__ == '__main__':
    dir_from = '../drosophila_kc167_1_images'
    dir_to = '../KMeansResults'
    number_clusters = list(range(2, 70, 2))
    for file_name in os.listdir(dir_from):
        array_time = []
        path_file = dir_from + '/' + file_name
        name_without_extension = file_name[:-4]
        dir_save = dir_to + '/' + name_without_extension + '/'
        # os.mkdir(dir_save)
        data = KMeans(path_file).pixels
        sse = []
        for K in number_clusters:
            path_file = '../drosophila_kc167_1_images/CPvalid1_48_40x_Tiles_p0003DAPI.TIF'
            #label, center = KMeans(path_file, K).kmeans()
            km = KM(K)
            km.fit(data)
            sse.append(km.inertia_)
        # Plot sse against k
        plt.figure(figsize=(6, 6))
        plt.plot(number_clusters, sse, '-o')
        plt.xlabel('Number of clusters $k$')
        plt.ylabel('Sum of squared distance')
        plt.show()
        exit()
            # center = np.uint8(center)
            # res = center[label].reshape((512, 512, 3))
            # Image.fromarray(res, 'RGB').save(dir_save + str(K) + '.bmp')
Example #25
X_train_people_nmf = nmf_decomp.transform(X_train_people)
X_test_people_nmf = nmf_decomp.transform(X_test_people)

nmf_decomp = NMF(n_components=5, init='nndsvd',
                 random_state=37).fit(X_train_energy)
X_train_energy_nmf = nmf_decomp.transform(X_train_energy)
X_test_energy_nmf = nmf_decomp.transform(X_test_energy)

nmf_decomp = NMF(n_components=100, random_state=37).fit(X_train_mnist)
X_train_mnist_nmf = nmf_decomp.transform(X_train_mnist)
X_test_mnist_nmf = nmf_decomp.transform(X_test_mnist)
print('nmf')

#k-Means Clustering
from sklearn.cluster import KMeans as KM
km_cluster = KM(n_clusters=10, random_state=37).fit(X_train_people)
X_train_people_km = km_cluster.transform(X_train_people)
X_test_people_km = km_cluster.transform(X_test_people)

km_cluster = KM(n_clusters=10, n_init=5, random_state=37).fit(X_train_energy)
X_train_energy_km = km_cluster.transform(X_train_energy)
X_test_energy_km = km_cluster.transform(X_test_energy)

km_cluster = KM(n_clusters=10, random_state=37).fit(X_train_mnist)
X_train_mnist_km = km_cluster.transform(X_train_mnist)
X_test_mnist_km = km_cluster.transform(X_test_mnist)
print('km\n')
"""
##############################Classification##############################
"""
from sklearn.neighbors import KNeighborsClassifier as KNC
Example #26
 def __init__(self):
     self.ma1 = TF.fit_transform(ma)
     self.model = KM(6)
     self.res = self.model.fit_predict(self.ma1)
Example #27
# K-Means idea: randomly drop a few points into the data as initial centers,
# then compute every point's distance to each center. Each center collects the
# points nearest to it, moves to the mean position of those points, and a new
# assignment is formed; after several rounds of this loop the centers stabilize.
import pandas as pda
import numpy as np
from sklearn.cluster import KMeans as KM
if __name__ == '__main__':
    data = pda.read_csv('luqu.csv')
    x = data.iloc[:, 1:4].to_numpy()  # .as_matrix() was removed in pandas 1.0
    km = KM(n_clusters=2)  # the n_jobs argument was removed from KMeans in scikit-learn 1.0
    print(km.fit_predict(x))
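The comment at the top of this example describes Lloyd's algorithm in words; below is a minimal NumPy sketch of that loop (mine, for illustration; not the scikit-learn implementation):

import numpy as np

def lloyd_kmeans(X, k, iters=100, seed=0):
    rng = np.random.default_rng(seed)
    centers = X[rng.choice(len(X), size=k, replace=False)]
    for _ in range(iters):
        # assign every point to its nearest center
        dists = ((X[:, None, :] - centers[None, :, :]) ** 2).sum(axis=-1)
        labels = dists.argmin(axis=1)
        # move each center to the mean of its members
        new_centers = np.array([X[labels == j].mean(axis=0)
                                if (labels == j).any() else centers[j]
                                for j in range(k)])
        if np.allclose(new_centers, centers):
            break
        centers = new_centers
    return labels, centers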
Example #28
    for key, name in webcolors.css21_hex_to_names.items():
        r_c, g_c, b_c = webcolors.hex_to_rgb(key)
        rd = (r_c - rgb_[0])**2
        gd = (g_c - rgb_[1])**2
        bd = (b_c - rgb_[2])**2
        min_colors[(rd + gd + bd)] = name
    return min_colors[min(min_colors.keys())]


# Construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("--image", required=True, help="Input Image Path")
ap.add_argument("--clusters", required=True, type=int, help="# of clusters")
args = vars(ap.parse_args())

image = cv2.imread(args["image"])
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# Reshape the image to be a list of pixels
image = image.reshape((image.shape[0] * image.shape[1], 3))

# K-Means Clustering
clt = KM(n_clusters=args["clusters"])
clt.fit(image)

# Get dominant colors
colors = clt.cluster_centers_.astype("uint8").tolist()
for rgb in colors:
    color_name = get_color_name(rgb)
    print("Dominant color :", color_name)
Example #29
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# One hot encode target values
one_hot = OneHotEncoder()

y_train_hot = one_hot.fit_transform(y_train.reshape(-1, 1)).todense()
y_test_hot = one_hot.transform(y_test.reshape(-1, 1)).todense()

labels = y_train

dim = [2, 3, 4, 5]

km = KM(random_state=42)
gmm = GMM(random_state=42)

Score = defaultdict(list)
adjMI = defaultdict(list)
S_homog = defaultdict(list)
S_adjMI = defaultdict(list)
S_vm = defaultdict(list)

for i in dim:
    reduced_X = PCA(n_components=i,
                    random_state=42).fit_transform(X_train_scaled)
    k = 30
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)
    km.fit(reduced_X)
Example #30
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans as KM

file = pd.read_csv("Mall_Customers.csv")
X = file.iloc[:, 3:5].values

elbow = []
for i in range(10):
    km = KM(n_clusters=i + 1)
    km.fit(X)
    elbow.append(km.inertia_)

plt.plot(range(1, 11), elbow)
plt.xlabel("No. of Clusters")
plt.ylabel("Cost")
plt.show()

km = KM(n_clusters=5)
res = km.fit_predict(X)

colors = ["red", "blue", "green", "yellow", "silver"]

for i in range(5):
    plt.scatter(X[res == i, 0], X[res == i, 1], c=colors[i])
plt.gca().get_xaxis().set_visible(False)  # plt.axes() would create a new Axes
plt.gca().get_yaxis().set_visible(False)
plt.show()
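As a complement to the elbow curve above, candidate values of k can also be scored with the silhouette coefficient; a minimal sketch on the same X (assuming the imports above are still in scope):

from sklearn.metrics import silhouette_score

sil = []
ks = range(2, 11)
for k in ks:
    labels = KM(n_clusters=k).fit_predict(X)
    sil.append(silhouette_score(X, labels))
print("best k by silhouette:", ks[int(np.argmax(sil))])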