def km(tx, ty, rx, ry, add="", times=5):
    # this does the exact same thing as the above
    errs = []
    # cluster the true labels to get a reference clustering to compare against
    # (KMeans expects 2-D input, so reshape the label vector)
    checker = KM(n_clusters=2)
    checker.fit(ry.reshape(-1, 1))
    truth = checker.predict(ry.reshape(-1, 1))
    # so we do this a bunch of times
    for i in range(2, times):
        clusters = {x: [] for x in range(i)}
        clf = KM(n_clusters=i)
        clf.fit(tx)  # fit it to our data
        test = clf.predict(tx)
        result = clf.predict(rx)  # and test it on the testing set
        for index, val in enumerate(result):
            clusters[val].append(index)
        # map each cluster to the majority truth label of its members
        mapper = {x: round(sum(truth[v] for v in clusters[x]) / float(len(clusters[x])))
                  if clusters[x] else 0
                  for x in range(i)}
        # must be an ndarray, not a list, for the elementwise arithmetic below
        processed = np.array([mapper[val] for val in result])
        errs.append(sum((processed - truth) ** 2) / float(len(ry)))
    plot([0, times, min(errs) - .1, max(errs) + .1],
         [range(2, times), errs, "ro"],
         "Number of Clusters", "Error Rate",
         "KMeans clustering error", "KM" + add)
    # append the cluster assignments as an extra feature column
    td = np.reshape(test, (test.size, 1))
    rd = np.reshape(result, (result.size, 1))
    newtx = np.append(tx, td, 1)
    newrx = np.append(rx, rd, 1)
    nn(newtx, ty, newrx, ry, add="onKM" + add)
def KMeansRatio(self):
    '''
    Type:   K-Means
    Y-axis: No Reaction
    X-axis: Reaction
    '''
    if self.authenticated:
        from sklearn.cluster import KMeans as KM
        algorithm = KM(n_clusters=2)
        categories = algorithm.fit_predict(self.allCoord)
        plt.scatter(self.allCoord[categories == 0, 0],
                    self.allCoord[categories == 0, 1], c="green")
        plt.scatter(self.allCoord[categories == 1, 0],
                    self.allCoord[categories == 1, 1], c="red")
        plt.scatter(algorithm.cluster_centers_[:, 0],
                    algorithm.cluster_centers_[:, 1], c="black", marker="*")
        for i, txt in enumerate(self.labels):
            plt.annotate(txt, (self.allCoord[i][0], self.allCoord[i][1]))
        plt.ylabel("NO REACTION")
        plt.xlabel("REACTION")
        plt.annotate("NO INFLAMMATION", algorithm.cluster_centers_[0])
        plt.annotate("CAUSES INFLAMMATION", algorithm.cluster_centers_[1])
        plt.title("K-Means: Reaction, No Reaction")
        plt.show()
def KMeansPartPercent(self):
    '''
    Type:   K-Means
    Y-axis: % Reactions
    X-axis: # Reactions
    '''
    if self.authenticated:
        from sklearn.cluster import KMeans as KM
        algorithm = KM(n_clusters=2)
        # partPercent = np.array([np.array([x, percent]) for j in self.stuff for _, x, _, percent in j])
        categories = algorithm.fit_predict(self.partPercent)
        plt.scatter(self.partPercent[categories == 0, 0],
                    self.partPercent[categories == 0, 1], c="green")
        plt.scatter(self.partPercent[categories == 1, 0],
                    self.partPercent[categories == 1, 1], c="red")
        plt.scatter(algorithm.cluster_centers_[:, 0],
                    algorithm.cluster_centers_[:, 1], c="black", marker="*")
        for i, txt in enumerate(self.labels):
            plt.annotate(txt, (self.partPercent[i][0], self.partPercent[i][1]))
        plt.ylabel("PERCENT")
        plt.xlabel("NUM OF INFLAMS")
        plt.annotate("NO INFLAMMATION", algorithm.cluster_centers_[0])
        plt.annotate("CAUSES INFLAMMATION", algorithm.cluster_centers_[1])
        plt.title("K-Means: # Reactions, % Reactions")
        plt.show()
def extractColors(stream, maxColor):
    # cluster the image's pixels and use the centroids as the color theme
    pixData = np.array(stream)
    h, w, d = pixData.shape
    data = np.reshape(pixData, (h * w, d))
    km = KM(n_clusters=maxColor)
    km.fit(data)
    theme = np.array(km.cluster_centers_, dtype=np.uint8).tolist()
    return getrgbAndHex(theme)
def cluster(self, inputs):
    t = time()
    helper._print('Training clusters (KMeans)...')
    kmeans = KM(n_clusters=self.num_clusters, init=self.cluster_init,
                max_iter=1000, tol=0.000001)
    cluster_pred = kmeans.fit_predict(inputs)
    helper._print(f'Done training clusters. Finished in {int((time() - t)/60)} minutes '
                  f'and {int((time() - t) % 60)} seconds!')
    return cluster_pred
def __init__(self, data, k, t, iter, maxE):
    # if t == 0:
    #     t = 'k-means++'
    # else:
    #     t = 'random'
    # Pass the arguments by keyword (the mapping below is inferred from the
    # parameter names); recent scikit-learn no longer accepts KMeans
    # parameters positionally.
    self.kmean = KM(n_clusters=k, init=t, max_iter=iter, tol=maxE).fit(np.array(data))
    self.labels = self.kmean.labels_
    self.clusters = self.kmean.cluster_centers_
def __init__(self, pixData, maxColor, useSklearn=True):
    super(KMeans, self).__init__()
    h, w, d = pixData.shape
    self.pixData = np.reshape(pixData, (h * w, d))
    self.maxColor = maxColor
    if useSklearn:
        self._KMeans = KM(n_clusters=maxColor)
    else:
        self._KMeans = KMDiy(n_clusters=maxColor)
def do_KM(self, letter, k, iter=200, dump=True, parr=-2):
    if type(letter) == str:
        l, v = self.find_by_start(letter)
        km_obj = KM(n_clusters=k, max_iter=iter, n_jobs=parr)
        results = km_obj.fit(v)
        if dump:
            filename = (DT.now().strftime('%d%m%y-%H%M%S')
                        + self.generate_filename() + '-k' + str(k))
            self.dump(l, results.labels_, filename)
        self.km_labels = l
        self.km_clusters = results.labels_
        print('Stored KM output in self.km_labels, self.km_clusters')
    if type(letter) == list:
        l, v = self.find_by_list(letter)
        km_obj = KM(n_clusters=k, max_iter=iter, n_jobs=parr)
        results = km_obj.fit(v)
        self.predicted_clusters = {}
        # use fresh loop names: reusing v and k here would clobber the data
        # and the cluster count (str(k) is still needed for the dump filename)
        for label, cluster in zip(l, results.labels_):
            self.predicted_clusters[label] = cluster
        self.pred_prep = sorted(self.predicted_clusters.items(), key=itemgetter(0))
        self.predicted = [i[1] for i in self.pred_prep]
        if dump:
            filename = (DT.now().strftime('%d%m%y-%H%M%S')
                        + self.generate_filename() + '-k' + str(k))
            self.dump(l, results.labels_, filename)
        self.km_labels = l
        self.km_clusters = results.labels_
        print('Stored KM output in self.km_labels, self.km_clusters')
def set_bin_ranges_for_property(
        self, property_to_entities: Dict[int, Set[Entity]],
        class_to_entities: Dict[int, Set[Entity]],
        property_to_timestamps: Dict[int, List[TimeStamp]],
        property_id: int):
    if not property_to_timestamps:
        self.load_property_to_timestamps(property_to_timestamps, property_id)
    property_values = np.array([
        ts.value for ts in property_to_timestamps[property_id]
    ]).reshape(-1, 1)
    # use the sorted cluster centroids as the cut points between bins
    kmeans = KM(n_clusters=self.bin_count - 1).fit(property_values)
    return sorted(centroid[0] for centroid in kmeans.cluster_centers_)
def findClusterCenters(data):
    # prepare data
    nonzero_xy = np.nonzero(data)
    if len(nonzero_xy) != 2:
        return
    if nonzero_xy[0].size < 8:
        return
    processed_data = np.vstack((nonzero_xy[0], nonzero_xy[1])).transpose()
    # K-Means algorithm
    kmeans = KM(n_clusters=8, max_iter=20, n_init=5)
    kmeans.fit(processed_data)
    # flip (row, col) centroids back to (x, y) order
    center_points = np.fliplr(kmeans.cluster_centers_)
    return center_points
def KMeans(self):
    return1 = self.printPoints()
    if not return1:
        return
    algorithm = KM(n_clusters=2)
    categories = algorithm.fit_predict(self.allCoord)
    print(self.allCoord)
    print(categories)
    plt.scatter(self.allCoord[categories == 0, 0],
                self.allCoord[categories == 0, 1], c="green")
    plt.scatter(self.allCoord[categories == 1, 0],
                self.allCoord[categories == 1, 1], c="red")
    plt.scatter(algorithm.cluster_centers_[:, 0],
                algorithm.cluster_centers_[:, 1], c="black", marker="*")
    print(len(self.labels), len(self.allCoord))
    for i, txt in enumerate(self.labels):
        plt.annotate(txt, (self.allCoord[i][0], self.allCoord[i][1]))
    plt.annotate("NO INFLAMMATION", algorithm.cluster_centers_[0])
    plt.annotate("CAUSES INFLAMMATION", algorithm.cluster_centers_[1])
    plt.savefig("static/" + NAME)
    self.src = NAME
def KMeansPercentTotal(self):
    '''
    Type:   K-Means
    Y-axis: % Reactions
    X-axis: # Observations
    '''
    if self.authenticated:
        from sklearn.cluster import KMeans as KM
        algorithm = KM(n_clusters=2)
        fig = plt.figure()
        # partPercent = np.array([np.array([x, percent]) for j in self.stuff for _, x, _, percent in j])
        categories = algorithm.fit_predict(self.percentTotal)
        plt.scatter(self.percentTotal[categories == 0, 0],
                    self.percentTotal[categories == 0, 1], c="green")
        plt.scatter(self.percentTotal[categories == 1, 0],
                    self.percentTotal[categories == 1, 1], c="red")
        plt.scatter(algorithm.cluster_centers_[:, 0],
                    algorithm.cluster_centers_[:, 1], c="black", marker="*")
        for i, txt in enumerate(self.labels):
            plt.annotate(txt, (self.percentTotal[i][0], self.percentTotal[i][1]))
        plt.ylabel("PERCENT")
        plt.xlabel("TOTAL")
        plt.annotate("NO INFLAMMATION", algorithm.cluster_centers_[0])
        plt.annotate("CAUSES INFLAMMATION", algorithm.cluster_centers_[1])
        plt.title("K-Means: # Observations, % Reactions")
        # render the figure to an inline base64 <img> instead of plt.show()
        tmpfile = BytesIO()
        fig.savefig(tmpfile, format='png')
        encoded = base64.b64encode(tmpfile.getvalue())
        html = '<img src=\'data:image/png;base64,{}\'>'.format(
            encoded.decode("utf-8"))
        with open('KMeansPercentTotal.html', 'w') as f:
            f.write(html)
def ttsf(ID, CONTENTS):
    length = len(ID)
    wei = []
    for content in CONTENTS:
        # segment the text with jieba, then join with spaces for the vectorizer
        cut = jieba.cut(content, cut_all=False)
        words = ' '.join(cut)
        wei.append(words)
    vector = TfidfVectorizer()
    tfidf = vector.fit_transform(wei)
    d = tfidf.toarray()
    k = 3
    clf = KM(k)
    hehe = clf.fit_predict(d)
    # collect the IDs that land in the same cluster as the first document
    results = []
    for i in range(length):
        if hehe[i] == hehe[0] and i != 0:
            results.append(ID[i])
    print(results)
    return results
def passl_local_graph_partial(site, loc_param_indices, params):
    X = site.buff[loc_param_indices[0]]
    (K, rbf_sigma, local_graph_index, n_cluster, centers_index,
     point_cluster_index, inter_graph_index, member_id_index) = params
    # k-NN graph with RBF-weighted edges and zeroed diagonal
    nins = NN(K + 1, None, metric='euclidean').fit(X)
    W = nins.kneighbors_graph(nins._fit_X, K + 1, mode='distance')
    # W.data = W.data**2
    W.data = np.exp(-W.data**2 / rbf_sigma)
    W[np.diag_indices(W.shape[0])] = 0
    site.buff[local_graph_index] = W
    kins = KM(n_cluster)
    point_cluster = kins.fit_predict(X)
    site.buff[point_cluster_index] = point_cluster
    site.buff[centers_index] = kins.cluster_centers_
    # print(kins.cluster_centers_)
    site.buff[inter_graph_index] = {}
    member_id = []
    for i in range(n_cluster):
        member_id.append(np.where(point_cluster == i)[0])
        # print(member_id[-1])
    site.buff[member_id_index] = member_id
def cluster_list_k(self, k_list, data):
    data = sp.vstack(data, format='csr')
    self.k_index = 0
    self.labels_list = []
    self.RSS_list = []
    self.centroids_list = []
    ks = []
    for k in k_list:
        print('\n' + str(k) + ":", end=" ")
        # alternative hand-rolled path, disabled in favor of sklearn:
        # labels, RSS, cents = KMeans.cluster_with_k(k, data)
        sk = KM(k, n_init=1, max_iter=30).fit(data)
        self.labels_list.append(np.array(sk.labels_))
        self.centroids_list.append(sk.cluster_centers_)
        self.RSS_list.append(sk.inertia_)
        ks.append(k)
    plt.plot(ks, self.RSS_list)
    plt.savefig('./cluster_rss')
    plt.show()
    np.save("index" + file_post, [self.k_index])
    np.save("label" + file_post, self.labels_list)
    np.save("rss" + file_post, self.RSS_list)
    np.save("cent" + file_post, self.centroids_list)
def em(tx, ty, rx, ry, reduced_data, add="", times=5, dataset="", alg=""):
    clf = EM(n_components=times)
    clf.fit(reduced_data)
    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02  # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired, aspect='auto', origin='lower')
    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    # centroids = clf.cluster_centers_
    # plt.scatter(centroids[:, 0], centroids[:, 1],
    #             marker='x', s=169, linewidths=3,
    #             color='w', zorder=10)
    plt.title(dataset + ': EM clustering (' + alg + '-reduced data)')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()

    clf = EM(n_components=times)
    clf.fit(tx)  # fit it to our data
    test = clf.predict(tx)
    result = clf.predict(rx)
    checker = EM(n_components=times)
    ry = ry.reshape(-1, 1)
    checker.fit(ry)
    truth = checker.predict(ry)
    td = np.reshape(test, (test.size, 1))
    rd = np.reshape(result, (result.size, 1))
    # newtx = np.append(tx, td, 1)
    # newrx = np.append(rx, rd, 1)
    myNN(test, ty, result, ry, alg="EM_" + alg)

    errs = []
    scores = []
    # this is what we will compare to
    checker = EM(n_components=2)
    checker.fit(ry)
    truth = checker.predict(ry)
    adj_rand = []
    v_meas = []
    mutual_info = []
    adj_mutual_info = []
    # so we do this a bunch of times
    for i in range(2, times):
        clusters = {x: [] for x in range(i)}
        # create a clusterer
        clf = EM(n_components=i)
        clf.fit(tx)  # fit it to our data
        test = clf.predict(tx)
        result = clf.predict(rx)  # and test it on the testing set
        for index, val in enumerate(result):
            clusters[val].append(index)
        mapper = {x: round(sum(truth[v] for v in clusters[x]) / float(len(clusters[x])))
                  if clusters[x] else 0
                  for x in range(i)}
        # ndarray, not list, so the elementwise arithmetic below works
        processed = np.array([mapper[val] for val in result])
        errs.append(sum((processed - truth) ** 2) / float(len(ry)))
        scores.append(clf.score(tx, ty))
        adj_rand.append(metrics.adjusted_rand_score(ry.ravel(), result))
        v_meas.append(metrics.v_measure_score(ry.ravel(), result))
        mutual_info.append(metrics.fowlkes_mallows_score(ry.ravel(), result))
        adj_mutual_info.append(metrics.homogeneity_score(ry.ravel(), result))

    # plot([0, times, min(scores)-.1, max(scores)+.1], [range(2, times), scores, "-"],
    #      "Number of Clusters", "Log Likelihood",
    #      dataset + ": EM Log Likelihood - " + alg, dataset + "_EM_" + alg)

    # other metrics
    # names = ["Adjusted Random", "V Measure", "Mutual Info", "Adjusted Mutual Info"]
    plt.figure()
    plt.title(dataset + ": EM Clustering measures - " + alg)
    plt.xlabel('Number of clusters')
    plt.ylabel('Score value')
    plt.plot(range(2, times), adj_rand, label="Adjusted Random")
    plt.plot(range(2, times), v_meas, label="V Measure")
    plt.plot(range(2, times), mutual_info, label="Fowlkes Mallows Score")
    plt.plot(range(2, times), adj_mutual_info, label="Homogeneity Score")
    plt.legend()
    plt.savefig("EMMetrics" + dataset + "_" + alg + ".png")

    kmeans = KM(n_clusters=2)
    kmeans.fit(reduced_data)
    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02  # point in the mesh [x_min, x_max]x[y_min, y_max].
    # Plot the decision boundary. For that, we will assign a color to each point.
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Obtain labels for each point in mesh. Use last trained model.
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired, aspect='auto', origin='lower')
    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    # Plot the centroids as a white X
    centroids = kmeans.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='x', s=169, linewidths=3,
                color='w', zorder=10)
    plt.title(dataset + ': EM clustering (' + alg + '-reduced data)\n'
              'Centroids are marked with a white cross')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()
    categories=categories, shuffle=True, random_state=None,
    remove=('headers', 'footers'))

fp = open('Result/metrics.txt', 'w+')

# Question 1
data = dataset.data
target = dataset.target
data_matrix = tf_idf_matrix(data)
fp.write('tf_idf_matrix shape: {}\n'.format(data_matrix.shape))

# Question 2: collapse the categories into two classes and cluster
target = [1 if i > 3 else 0 for i in target]
cluster = KM(n_clusters=2, random_state=0, max_iter=1000, n_init=30)
cluster.fit(data_matrix)
predict_target = cluster.labels_
fp.write("Contingency table: \n{}\n".format(
    contingency_matrix(target, predict_target)))

# Question 3
metrics_dict = {
    "homogeneity_score": homogeneity_score,
    "completeness_score": completeness_score,
    "v_measure_score": v_measure_score,
    "adjusted_rand_score": adjusted_rand_score,
    "adjusted_mutual_info_score": adjusted_mutual_info_score
}
for metrics_name in metrics_dict.keys():
# SCALE THE DATA
scaler = SS()
scaler.fit(df)
scaled_data = scaler.transform(df)

# PCA
pca = PCA(n_components=2)
pca.fit(scaled_data)
x_pca = pca.transform(scaled_data)

# K MEANS - MAKES A LOT OF SENSE TO APPLY THIS ON TOP OF PCA
from sklearn.cluster import KMeans as KM

model = KM(n_clusters=2)
model.fit(df)

fig, axes = plt.subplots(1, 2, figsize=(10, 6))
fig.suptitle('Breast Cancer', fontsize=20)
# left panel: the true diagnosis labels; right panel: the k-means clusters
# (the colorings were swapped relative to the titles in the original)
axes[0].set_title('Diagnosis')
axes[0].scatter(x_pca[:, 0], x_pca[:, 1], c=cancer['target'], cmap='coolwarm')
axes[1].set_title('KMC')
axes[1].scatter(x_pca[:, 0], x_pca[:, 1], c=model.labels_, cmap='coolwarm')
plt.show()
def km(tx, ty, rx, ry, reduced_data, add="", times=5, dataset="", alg=""):
    adj_rand = []
    v_meas = []
    mutual_info = []
    adj_mutual_info = []
    inertia = []
    for i in range(2, times):
        clf = KM(n_clusters=i)
        clf.fit(tx)
        test = clf.predict(tx)
        result = clf.predict(rx)
        adj_rand.append(metrics.adjusted_rand_score(ry.ravel(), result))
        v_meas.append(metrics.v_measure_score(ry.ravel(), result))
        mutual_info.append(metrics.fowlkes_mallows_score(ry.ravel(), result))
        adj_mutual_info.append(metrics.homogeneity_score(ry.ravel(), result))
        inertia.append(clf.inertia_)

    plt.figure()
    plt.title(dataset + ": KM Clustering measures - " + alg)
    plt.xlabel('Number of clusters')
    plt.ylabel('Score value')
    plt.plot(range(2, times), adj_rand, label="Adjusted Random")
    plt.plot(range(2, times), v_meas, label="V Measure")
    plt.plot(range(2, times), mutual_info, label="Fowlkes Mallows Score")
    plt.plot(range(2, times), adj_mutual_info, label="Homogeneity Score")
    plt.legend()
    plt.ylim(-0.05, 1.05)
    plt.savefig("KMeansMetric" + dataset + "_" + alg + ".png")

    plt.figure()
    plt.title(dataset + ": KMeans Inertia - " + alg)
    plt.xlabel('Number of clusters')
    plt.ylabel('Inertia')
    plt.plot(range(2, times), inertia)
    plt.savefig("KM-Inertia-" + dataset + "-" + alg + ".png")

    td = np.reshape(test, (test.size, 1))
    rd = np.reshape(result, (result.size, 1))
    newtx = np.append(tx, td, 1)
    newrx = np.append(rx, rd, 1)

    h = .02  # point in the mesh [x_min, x_max]x[y_min, y_max].
    # Plot the decision boundary. For that, we will assign a color to each
    # point of the 2-D projection. (The original used an undefined X; the
    # reduced_data argument is assumed to be what was meant.)
    X = reduced_data
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    best_clusterer = KM(n_clusters=4)
    best_clusterer.fit(X)
    Z = best_clusterer.predict(X)
    print(len(Z))
    print(len(X))
    plt.figure(1)
    plt.clf()
    colors = ['r', 'g', 'b', 'y', 'c', 'm', '#eeefff', '#317c15', '#4479b4',
              '#6b2b9c', '#63133b', '#6c0d22', '#0c7c8c', '#67c50e', '#c5670e',
              '#946c47', '#58902a', '#54b4e4', '#e4549e', '#2b2e85']
    for i in range(0, len(X)):
        plt.plot(X[i][0], X[i][1], marker='.', color=colors[Z[i]], markersize=2)
    # plt.plot(X[:, 0], X[:, 1], 'k.', markersize=2)
    # Plot the centroids as a black X
    centroids = best_clusterer.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='x', s=169, linewidths=3,
                color='k', zorder=10)
    plt.title('K-means Clusters ' + alg)
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()

    kmeans = KM(n_clusters=3)
    kmeans.fit(tx)
    result = pd.DataFrame(kmeans.transform(tx),
                          columns=['KM%i' % i for i in range(3)])
    my_color = pd.Series(ty).astype('category').cat.codes
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(result['KM0'], result['KM1'], result['KM2'],
               c=my_color, cmap="Dark2_r", s=60)
    plt.show()

    reduced_data = PCA(n_components=2).fit_transform(tx)
    kmeans = KM(n_clusters=4)
    kmeans.fit(reduced_data)
    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02  # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired, aspect='auto', origin='lower')
    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    centroids = kmeans.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='x', s=169, linewidths=3,
                color='w', zorder=10)
    plt.title(dataset + ': K-means clustering (' + alg + '-reduced data)\n'
              'Centroids are marked with a white cross')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()

    checker = KM(n_clusters=2)
    ry = ry.reshape(-1, 1)
    checker.fit(ry)
    truth = checker.predict(ry)
    clusters = {x: [] for x in range(4)}
    clf = KM(n_clusters=4)
    clf.fit(tx)  # fit it to our data
    test = clf.predict(tx)
    result = clf.predict(rx)  # and test it on the testing set
    for index, val in enumerate(result):
        clusters[val].append(index)
    mapper = {x: round(sum(truth[v] for v in clusters[x]) / float(len(clusters[x])))
              if clusters[x] else 0
              for x in range(4)}
    # ndarray, not list, so the elementwise arithmetic below works
    processed = np.array([mapper[val] for val in result])
    print(sum((processed - truth) ** 2) / float(len(ry)))

    clf = KM(n_clusters=times)
    clf.fit(tx)  # fit it to our data
    test = clf.predict(tx)
    result = clf.predict(rx)
    checker = KM(n_clusters=times)
    checker.fit(ry)
    truth = checker.predict(ry)
    td = np.reshape(test, (test.size, 1))
    rd = np.reshape(result, (result.size, 1))
    # np.append needs the base array and an axis, as in the earlier function
    newtx = np.append(tx, td, 1)
    newrx = np.append(rx, rd, 1)
    myNN(test, ty, result, ry, alg="KM_" + alg)
    nn(newtx, ty, newrx, ry, add="onKM" + add)
        best_model = model
        best_error = model.error
    else:
        if best_error > model.error:
            best_error = model.error
            best_model = model

label = best_model.predict(X)
center = best_model.center

####### plot ######################
plt.figure(1)
plt.subplot(121)
plt.title('my KMeans')
plt.scatter(X[:, 0], X[:, 1], c=label)
plt.scatter(center[:, 0], center[:, 1], c='r', marker='+')

####### compare to scikit ########
from sklearn.cluster import KMeans as KM

km = KM(n_clusters=k)
km.fit(X)
plt.subplot(122)
plt.title('scikit KMeans')
plt.scatter(X[:, 0], X[:, 1], c=km.labels_)
plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
            c='r', marker='+')
plt.show()
def __init__(self, imgPixels, K):
    self.imgPixels = imgPixels
    self.KM = KM(n_clusters=K, random_state=0).fit(self.imgPixels)
def kmtable(tx, ty, rx, ry, dataset=""):
    # n_components must be an int: tx[1].size / 2 is a float in Python 3,
    # so use floor division
    n_comp = tx[1].size // 2

    compressor = PCA(n_components=n_comp)
    compressor.fit(tx, y=ty)
    pcatx = compressor.transform(tx)
    pcarx = compressor.transform(rx)
    p = []

    compressor = ICA(n_components=n_comp)
    compressor.fit(tx, y=ty)
    icatx = compressor.transform(tx)
    icarx = compressor.transform(rx)
    ic = []

    compressor = RandomProjection(n_comp)
    compressor.fit(tx, y=ty)
    rptx = compressor.transform(tx)
    rprx = compressor.transform(rx)
    r = []

    compressor = best(k=n_comp)
    compressor.fit(tx, y=ty)
    kbtx = compressor.transform(tx)
    kbrx = compressor.transform(rx)
    k = []

    # score each dimensionality-reduction method by V-measure across k
    for i in range(2, 8):
        clf = KM(n_clusters=i)
        clf.fit(pcatx)
        result = clf.predict(pcarx)
        p.append(metrics.v_measure_score(ry.ravel(), result))

        clf = KM(n_clusters=i)
        clf.fit(icatx)
        result = clf.predict(icarx)
        ic.append(metrics.v_measure_score(ry.ravel(), result))

        clf = KM(n_clusters=i)
        clf.fit(rptx)
        result = clf.predict(rprx)
        r.append(metrics.v_measure_score(ry.ravel(), result))

        clf = KM(n_clusters=i)
        clf.fit(kbtx)
        result = clf.predict(kbrx)
        k.append(metrics.v_measure_score(ry.ravel(), result))

    plt.figure()
    plt.title(dataset + ": KM Clustering & DR")
    plt.xlabel('Number of clusters')
    plt.ylabel('V Measure Score value')
    plt.plot(range(2, 8), p, label="PCA")
    plt.plot(range(2, 8), ic, label="ICA")
    plt.plot(range(2, 8), r, label="RP")
    plt.plot(range(2, 8), k, label="KB")
    plt.legend()
    plt.ylim(-0.05, 0.5)
    plt.savefig("KM_DR_" + dataset + "_VM.png", dpi=300)
lower conversion price and more dilution to BankAmerica stock holders,
noted Daniel Williams, analyst with Sutro Group. Several analysts said
that while they believe the Brazilian debt problem will continue to hang
over the banking industry through the quarter, the initial shock reaction
is likely to ease over the coming weeks. Nevertheless, BankAmerica, which
holds about 2.70 billion dlrs in Brazilian loans, stands to lose 15-20
mln dlrs if the interest rate is reduced on the debt, and as much as 200
mln dlrs if Brazil pays no interest for a year, said Joseph Arsenio,
analyst with Birr, Wilson and Co. He noted, however, that any potential
losses would not show up in the current quarter.
'''

kmeans = KM(n_clusters=32, init='random', n_init=1, verbose=1)
kmeans.fit(features)
print(kmeans.labels_)

new_post_vector = vectorizer.transform([new_post_2])
new_post_label = kmeans.predict(new_post_vector)[0]
print("new posts label", new_post_label)

# collect the posts that fall in the same cluster as the new post
similar_indices = (kmeans.labels_ == new_post_label).nonzero()[0]
similar = []
for i in similar_indices:
    dist = sp.linalg.norm((new_post_vector - features[i]).toarray())
if __name__ == '__main__':
    dir_from = '../drosophila_kc167_1_images'
    dir_to = '../KMeansResults'
    number_clusters = list(range(2, 70, 2))  # [2, 4, 6, 8, 10, 12, 14, 16]
    for file_name in os.listdir(dir_from):
        array_time = []
        path_file = dir_from + '/' + file_name
        name_without_extension = file_name[:-4]
        dir_save = dir_to + '/' + name_without_extension + '/'
        # os.mkdir(dir_save)
        data = KMeans(path_file).pixels
        sse = []
        for K in number_clusters:
            path_file = '../drosophila_kc167_1_images/CPvalid1_48_40x_Tiles_p0003DAPI.TIF'
            # label, center = KMeans(path_file, K).kmeans()
            km = KM(K)
            km.fit(data)
            sse.append(km.inertia_)
        # Plot sse against k
        plt.figure(figsize=(6, 6))
        plt.plot(number_clusters, sse, '-o')
        plt.xlabel(r'Number of clusters *k*')
        plt.ylabel('Sum of squared distance')
        plt.show()
        exit()
        # center = np.uint8(center)
        # res = center[label].reshape((512, 512, 3))
        # Image.fromarray(res, 'RGB').save(dir_save + str(K) + '.bmp')
X_train_people_nmf = nmf_decomp.transform(X_train_people)
X_test_people_nmf = nmf_decomp.transform(X_test_people)

nmf_decomp = NMF(n_components=5, init='nndsvd', random_state=37).fit(X_train_energy)
X_train_energy_nmf = nmf_decomp.transform(X_train_energy)
X_test_energy_nmf = nmf_decomp.transform(X_test_energy)

nmf_decomp = NMF(n_components=100, random_state=37).fit(X_train_mnist)
X_train_mnist_nmf = nmf_decomp.transform(X_train_mnist)
X_test_mnist_nmf = nmf_decomp.transform(X_test_mnist)
print('nmf')

# k-Means Clustering
from sklearn.cluster import KMeans as KM

km_cluster = KM(n_clusters=10, random_state=37).fit(X_train_people)
X_train_people_km = km_cluster.transform(X_train_people)
X_test_people_km = km_cluster.transform(X_test_people)

km_cluster = KM(n_clusters=10, n_init=5, random_state=37).fit(X_train_energy)
X_train_energy_km = km_cluster.transform(X_train_energy)
X_test_energy_km = km_cluster.transform(X_test_energy)

km_cluster = KM(n_clusters=10, random_state=37).fit(X_train_mnist)
X_train_mnist_km = km_cluster.transform(X_train_mnist)
X_test_mnist_km = km_cluster.transform(X_test_mnist)
print('km\n')

"""
##############################Classification##############################
"""
from sklearn.neighbors import KNeighborsClassifier as KNC
def __init__(self):
    self.ma1 = TF.fit_transform(ma)
    self.model = KM(6)
    self.res = self.model.fit_predict(self.ma1)
# Randomly drop a few points into the data to serve as centers, then compute
# each point's distance to those centers. Each center collects a boundary of
# nearest points; the center is then shifted to the mean position of its
# members, forming a new boundary. After several iterations the centers
# stabilize.
import pandas as pda
import numpy as np
from sklearn.cluster import KMeans as KM

if __name__ == '__main__':
    data = pda.read_csv('luqu.csv')
    # as_matrix() was removed from pandas; to_numpy() is the replacement
    x = data.iloc[:, 1:4].to_numpy()
    # n_jobs was removed from KMeans in scikit-learn 1.0, so it is dropped here
    km = KM(n_clusters=2)
    print(km.fit_predict(x))
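# The comment above describes Lloyd's iteration in words. The sketch below is
# a minimal from-scratch version of that loop (naive_kmeans is a hypothetical
# helper, not part of the original code): scatter random centers, assign each
# point to its nearest center, move each center to the mean of its members,
# and repeat until the centers stop moving.
import numpy as np

def naive_kmeans(x, n_clusters=2, n_iter=100, seed=0):
    rng = np.random.default_rng(seed)
    # drop the initial centers on randomly chosen data points
    centers = x[rng.choice(len(x), n_clusters, replace=False)]
    for _ in range(n_iter):
        # assign every point to its nearest center
        dists = np.linalg.norm(x[:, None, :] - centers[None, :, :], axis=2)
        labels = dists.argmin(axis=1)
        # shift each center to the mean of its members (keep a center in
        # place if it ended up with no members)
        new_centers = np.array([
            x[labels == i].mean(axis=0) if np.any(labels == i) else centers[i]
            for i in range(n_clusters)
        ])
        if np.allclose(new_centers, centers):
            break  # the centers have stabilized
        centers = new_centers
    return labels, centers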
    for key, name in webcolors.css21_hex_to_names.items():
        r_c, g_c, b_c = webcolors.hex_to_rgb(key)
        rd = (r_c - rgb_[0]) ** 2
        gd = (g_c - rgb_[1]) ** 2
        bd = (b_c - rgb_[2]) ** 2
        min_colors[(rd + gd + bd)] = name
    return min_colors[min(min_colors.keys())]


# Construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("--image", required=True, help="Input Image Path")
ap.add_argument("--clusters", required=True, type=int, help="# of clusters")
args = vars(ap.parse_args())

image = cv2.imread(args["image"])
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# Reshape the image to be a list of pixels
image = image.reshape((image.shape[0] * image.shape[1], 3))

# K-Means Clustering
clt = KM(n_clusters=args["clusters"])
clt.fit(image)

# Get dominant colors
colors = clt.cluster_centers_.astype("uint8").tolist()
for rgb in colors:
    color_name = get_color_name(rgb)
    print("Dominant color :", color_name)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# One-hot encode target values
one_hot = OneHotEncoder()
y_train_hot = one_hot.fit_transform(y_train.reshape(-1, 1)).todense()
y_test_hot = one_hot.transform(y_test.reshape(-1, 1)).todense()
labels = y_train

dim = [2, 3, 4, 5]
km = KM(random_state=42)
gmm = GMM(random_state=42)

Score = defaultdict(list)
adjMI = defaultdict(list)
S_homog = defaultdict(list)
S_adjMI = defaultdict(list)
S_vm = defaultdict(list)

for i in dim:
    reduced_X = PCA(n_components=i, random_state=42).fit_transform(X_train_scaled)
    k = 30
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)
    km.fit(reduced_X)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans as KM

file = pd.read_csv("Mall_Customers.csv")
X = file.iloc[:, 3:5].values

# Elbow method: plot the inertia (cost) for k = 1..10
elbow = []
for i in range(10):
    km = KM(n_clusters=i + 1)
    km.fit(X)
    elbow.append(km.inertia_)
plt.plot(range(1, 11), elbow)
plt.xlabel("No. of Clusters")
plt.ylabel("Cost")
plt.show()

km = KM(n_clusters=5)
res = km.fit_predict(X)
colors = ["red", "blue", "green", "yellow", "silver"]
for i in range(5):
    plt.scatter(X[res == i, 0], X[res == i, 1], c=colors[i])
# hide the axes on the current figure; calling plt.axes() here would create
# a new, empty Axes on top of the scatter plot
ax = plt.gca()
ax.get_xaxis().set_visible(False)
ax.get_yaxis().set_visible(False)
plt.show()