def test_pairwise_distances_argmin_min():
    # Check pairwise minimum distances computation for any metric
    X = [[0], [1]]
    Y = [[-1], [2]]

    Xsp = dok_matrix(X)
    Ysp = csr_matrix(Y, dtype=np.float32)

    # euclidean metric
    D, E = pairwise_distances_argmin_min(X, Y, metric="euclidean")
    D2 = pairwise_distances_argmin(X, Y, metric="euclidean")
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(D2, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # sparse matrix case
    Dsp, Esp = pairwise_distances_argmin_min(Xsp, Ysp, metric="euclidean")
    assert_array_equal(Dsp, D)
    assert_array_equal(Esp, E)
    # We don't want np.matrix here
    assert_equal(type(Dsp), np.ndarray)
    assert_equal(type(Esp), np.ndarray)

    # Non-euclidean scikit-learn metric
    D, E = pairwise_distances_argmin_min(X, Y, metric="manhattan")
    D2 = pairwise_distances_argmin(X, Y, metric="manhattan")
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(D2, [0, 1])
    assert_array_almost_equal(E, [1., 1.])
    D, E = pairwise_distances_argmin_min(Xsp, Ysp, metric="manhattan")
    D2 = pairwise_distances_argmin(Xsp, Ysp, metric="manhattan")
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Non-euclidean Scipy distance (callable)
    D, E = pairwise_distances_argmin_min(X, Y, metric=minkowski,
                                         metric_kwargs={"p": 2})
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Non-euclidean Scipy distance (string)
    D, E = pairwise_distances_argmin_min(X, Y, metric="minkowski",
                                         metric_kwargs={"p": 2})
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Compare with naive implementation
    rng = np.random.RandomState(0)
    X = rng.randn(97, 149)
    Y = rng.randn(111, 149)

    dist = pairwise_distances(X, Y, metric="manhattan")
    dist_orig_ind = dist.argmin(axis=0)
    dist_orig_val = dist[dist_orig_ind, range(len(dist_orig_ind))]

    dist_chunked_ind, dist_chunked_val = pairwise_distances_argmin_min(
        X, Y, axis=0, metric="manhattan", batch_size=50)
    np.testing.assert_almost_equal(dist_orig_ind, dist_chunked_ind, decimal=7)
    np.testing.assert_almost_equal(dist_orig_val, dist_chunked_val, decimal=7)
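
For orientation, a minimal self-contained sketch of the API these tests exercise; the toy data mirrors the test above, so the printed values follow directly from it.

import numpy as np
from sklearn.metrics import pairwise_distances_argmin, pairwise_distances_argmin_min

X = [[0], [1]]
Y = [[-1], [2]]

# For each row of X: the index of the closest row in Y, and that distance.
idx, dists = pairwise_distances_argmin_min(X, Y, metric="euclidean")
print(idx)    # [0 1] -> X[0] is closest to Y[0], X[1] to Y[1]
print(dists)  # [1. 1.]

# pairwise_distances_argmin returns only the indices.
assert np.array_equal(pairwise_distances_argmin(X, Y), idx)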
Example #2
def test_pairwise_distances_argmin_min():
    """ Check pairwise minimum distances computation for any metric"""
    X = [[0], [1]]
    Y = [[-1], [2]]

    Xsp = dok_matrix(X)
    Ysp = csr_matrix(Y)

    # euclidean metric
    D, E = pairwise_distances_argmin_min(X, Y, metric="euclidean")
    D2 = pairwise_distances_argmin(X, Y, metric="euclidean")
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(D2, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # sparse matrix case
    Dsp, Esp = pairwise_distances_argmin_min(Xsp, Ysp, metric="euclidean")
    assert_array_equal(Dsp, D)
    assert_array_equal(Esp, E)
    # We don't want np.matrix here
    assert_equal(type(Dsp), np.ndarray)
    assert_equal(type(Esp), np.ndarray)

    # Non-euclidean sklearn metric
    D, E = pairwise_distances_argmin_min(X, Y, metric="manhattan")
    D2 = pairwise_distances_argmin(X, Y, metric="manhattan")
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(D2, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # sparse matrix case
    assert_raises(ValueError,
                  pairwise_distances_argmin_min, Xsp, Ysp, metric="manhattan")

    # Non-euclidean Scipy distance (callable)
    D, E = pairwise_distances_argmin_min(X, Y, metric=minkowski,
                                         metric_kwargs={"p": 2})
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Non-euclidean Scipy distance (string)
    D, E = pairwise_distances_argmin_min(X, Y, metric="minkowski",
                                         metric_kwargs={"p": 2})
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Compare with naive implementation
    rng = np.random.RandomState(0)
    X = rng.randn(97, 149)
    Y = rng.randn(111, 149)

    dist = pairwise_distances(X, Y, metric="manhattan")
    dist_orig_ind = dist.argmin(axis=0)
    dist_orig_val = dist[dist_orig_ind, range(len(dist_orig_ind))]

    dist_chunked_ind, dist_chunked_val = pairwise_distances_argmin_min(
        X, Y, axis=0, metric="manhattan", batch_size=50)
    np.testing.assert_almost_equal(dist_orig_ind, dist_chunked_ind, decimal=7)
    np.testing.assert_almost_equal(dist_orig_val, dist_chunked_val, decimal=7)
Example #3
    def fit(self, x_):
        self.cluster_centers_ = x_.sample(self.k_)
        n_iter_ = 0
        while n_iter_ < self.max_iter_:
            # 1. clustering: x -> ck
            if "cluster" in x_.columns.tolist():
                self.labels_ = x_["cluster"] = pairwise_distances_argmin(
                    x_.drop(["cluster"], axis=1),
                    self.cluster_centers_,
                    metric="euclidean")
            else:
                self.labels_ = x_["cluster"] = pairwise_distances_argmin(
                    x_, self.cluster_centers_, metric="euclidean")

            # 2. recalculate means_
            cluster_centers_ = x_.groupby(by=["cluster"]).mean().sort_values(
                by=x_.columns.tolist()[0])
            # distance(centers)
            dis_ = sum(
                sum((self.cluster_centers_.values -
                     cluster_centers_.values)**2))
            if dis_ < self.tol_:
                print("n_iter_ is %d, means_ is %s" %
                      (n_iter_, self.cluster_centers_))
                return self.cluster_centers_
            else:
                self.cluster_centers_ = cluster_centers_
            n_iter_ += 1

        print("n_iter_ is %d cluster_centers_ is %s" %
              (n_iter_, self.cluster_centers_.values))
        return self.cluster_centers_
    def run(self):
        self.running = True
        self.colors = ['#4EACC5', '#FF9C34', '#4E9A06']
        # KMeans
        k_means_cluster_centers = np.sort(self.k_means.cluster_centers_,
                                          axis=0)
        k_means_labels = pairwise_distances_argmin(self.X,
                                                   k_means_cluster_centers)
        for k, col in zip(range(self.n_clusters), self.colors):
            my_members = k_means_labels == k
            cluster_center = k_means_cluster_centers[k]

        self.t_mini_batch = time.time() - self.t0
        lastN_diff = 0
        while self.running:
            # MiniBatchKMeans
            mbk_means_cluster_centers = np.sort(self.mbk.cluster_centers_,
                                                axis=0)
            mbk_means_labels = pairwise_distances_argmin(
                self.X, mbk_means_cluster_centers)
            order = pairwise_distances_argmin(k_means_cluster_centers,
                                              mbk_means_cluster_centers)
            for k, col in zip(range(self.n_clusters), self.colors):
                my_members = mbk_means_labels == order[k]
                cluster_center = mbk_means_cluster_centers[order[k]]

            # Initialise the "different" mask to all False (no label equals 4)
            different = (mbk_means_labels == 4)
            nbK = np.zeros(self.n_clusters)
            err = np.zeros(self.n_clusters)
            nbL = np.zeros(self.n_clusters)

            for k in range(self.n_clusters):
                different += ((k_means_labels == k) !=
                              (mbk_means_labels == order[k]))
                i = 0
                for s in mbk_means_labels:
                    if s == self.labels_true[i]:
                        nbK[k] += 1
                    if self.labels_true[i] == k:
                        nbL[k] += 1
                    i += 1

                err[k] = nbK[k] / nbL[k]

            identic = np.logical_not(different)

            n_diff = len(self.X[different, ])
            if lastN_diff != n_diff and (abs(lastN_diff - n_diff) <
                                         len(self.X) / 2):
                print('')
                # for k in range(self.n_clusters):
                #     print('Error cluster %d : %f' % (k, nbK[k] / nbL[k]))

                print('Difference K-Mean - Mini-batch: %d' % n_diff)
                ratio = n_diff / len(mbk_means_labels)
                print('Ratio: %f' % ratio)
                lastN_diff = n_diff
Example #5
def test_pairwise_distances_argmin_min():
    """ Check pairwise minimum distances computation for any metric"""
    X = [[0], [1]]
    Y = [[-1], [2]]

    # euclidean metric
    D, E = pairwise_distances_argmin_min(X, Y, metric="euclidean")
    D2 = pairwise_distances_argmin(X, Y, metric="euclidean")
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(D2, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Non-euclidean sklearn metric
    D, E = pairwise_distances_argmin_min(X, Y, metric="manhattan")
    D2 = pairwise_distances_argmin(X, Y, metric="manhattan")
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(D2, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Non-euclidean Scipy distance (callable)
    D, E = pairwise_distances_argmin_min(X,
                                         Y,
                                         metric=minkowski,
                                         metric_kwargs={"p": 2})
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Non-euclidean Scipy distance (string)
    D, E = pairwise_distances_argmin_min(X,
                                         Y,
                                         metric="minkowski",
                                         metric_kwargs={"p": 2})
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Compare with naive implementation
    rng = np.random.RandomState(0)
    X = rng.randn(97, 149)
    Y = rng.randn(111, 149)

    dist = pairwise_distances(X, Y, metric="manhattan")
    dist_orig_ind = dist.argmin(axis=0)
    dist_orig_val = dist[dist_orig_ind, range(len(dist_orig_ind))]

    dist_chunked_ind, dist_chunked_val = pairwise_distances_argmin_min(
        X, Y, axis=0, metric="manhattan", batch_size=50)
    np.testing.assert_almost_equal(dist_orig_ind, dist_chunked_ind, decimal=7)
    np.testing.assert_almost_equal(dist_orig_val, dist_chunked_val, decimal=7)
def computeHoGFeatures(XData):
    results = Parallel(n_jobs=num_cores)(delayed(getHOGFeatures)(XData[i])
                                         for i in range(XData.shape[0]))
    hog_descriptor = np.array(results)
    print("hog: ", hog_descriptor.shape)
    #    return results

    #Nxpx36
    hog_descriptor = np.reshape(hog_descriptor, (XData.shape[0], 16, 9))
    input_dim = hog_descriptor.shape[0]
    N = hog_descriptor.shape[1]
    pixels_per_image = hog_descriptor.shape[2]
    hog_ordered = np.transpose(hog_descriptor, (2, 0, 1))
    reshapedResponses = np.reshape(hog_ordered,
                                   (pixels_per_image, input_dim * N)).T
    K = 50
    clusters = ComputeKMeans(reshapedResponses, K)
    print("Cluster center dimension: ", clusters.shape)
    labels = pairwise_distances_argmin(reshapedResponses, clusters)
    labels = np.reshape(labels, (input_dim, N))

    bins = np.linspace(1, K, K)
    idx = np.searchsorted(bins, labels, 'right')

    scaled_idx = K * np.arange(labels.shape[0])[:, None] + idx
    limit = K * labels.shape[0]
    counts = np.bincount(scaled_idx.ravel(), minlength=limit + 1)[:-1]
    hist_hog = np.reshape(counts, (labels.shape[0], K))

    return hist_hog
def computeImageFeatures(XData, K=50):
    """ choosing fewer training samples to run the code faster.
    Run the entire dataset on Ada """
    num_samples = XData.shape[0]

    print("Input data dimensions: ", XData[:num_samples].shape)
    responseAllImages = []

    responseAllImages = getFilterResponses(XData[:num_samples])
    print("Filter Responses dimension: ", responseAllImages.shape)

    input_dim = responseAllImages.shape[0]
    N = responseAllImages.shape[1]
    pixels_per_image = responseAllImages.shape[2]
    reshapedResponses = np.reshape(responseAllImages,
                                   (input_dim, N * pixels_per_image)).T
    clusters = ComputeKMeans(reshapedResponses, K)
    print("Cluster center dimension: ", clusters.shape)
    labels = pairwise_distances_argmin(reshapedResponses, clusters)
    labels = np.reshape(labels, (N, pixels_per_image))
    """ parallelizing the code to make histogram computation more efficient
    ref: https://stackoverflow.com/questions/44152436/calculate-histograms-along-axis/44155607#44155607 
    """
    bins = np.linspace(1, K, K)
    idx = np.searchsorted(bins, labels, 'right')

    scaled_idx = K * np.arange(labels.shape[0])[:, None] + idx
    limit = K * labels.shape[0]
    counts = np.bincount(scaled_idx.ravel(), minlength=limit + 1)[:-1]
    hist = np.reshape(counts, (labels.shape[0], K))

    print("Histogram dimension: ", hist.shape)

    return hist
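
Both functions above compute one histogram per row with a vectorized offset-and-bincount trick. A small self-contained sketch of that trick (assuming the labels are already integer bin ids, so the searchsorted step is skipped):

import numpy as np

K = 4
labels = np.array([[0, 1, 1, 3],
                   [2, 2, 0, 0]])
# Offset each row's labels into a disjoint range, then a single bincount
# call produces all per-row histograms at once.
scaled = K * np.arange(labels.shape[0])[:, None] + labels
counts = np.bincount(scaled.ravel(), minlength=K * labels.shape[0])
hist = counts.reshape(labels.shape[0], K)
print(hist)  # [[1 2 0 1]
             #  [2 0 2 0]]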
    def get_best_thread(self, question, tag_name):
        """
        Returns id of the most similar thread for the question.

        The search is performed across the threads with a given tag.

        Parameters
        ----------
        question : str
            The question asked
        tag_name : str
            The tag for the question

        Returns
        -------
        int
            The id of the most similar thread of the question
        """

        thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name)

        question_vec = question_to_vec(question=question,
                                       embeddings=self.word_embeddings,
                                       dim=thread_embeddings.shape[1])

        best_thread = pairwise_distances_argmin(question_vec[np.newaxis, ...],
                                                thread_embeddings,
                                                metric='cosine')

        return thread_ids[best_thread][0]
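
A toy sketch of the retrieval step performed here: embed the query, then take the argmin over cosine distances to the candidate embeddings (the vectors below are made up for illustration):

import numpy as np
from sklearn.metrics import pairwise_distances_argmin

candidates = np.array([[1.0, 0.0], [0.0, 1.0], [0.7, 0.7]])
query = np.array([[0.9, 0.1]])
best = pairwise_distances_argmin(query, candidates, metric='cosine')[0]
print(best)  # 0 -> the candidate pointing in almost the same direction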
Example #9
    def predict(self, X):
        """Predict the closest cluster for each sample in X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_query, n_features), \
                or (n_query, n_indexed) if metric == 'precomputed'
            New data to predict.

        Returns
        -------
        labels : array, shape = (n_query,)
            Index of the cluster each sample belongs to.
        """
        X = check_array(X, accept_sparse=["csr", "csc"])

        if self.metric == "precomputed":
            check_is_fitted(self, "medoid_indices_")
            return np.argmin(X[:, self.medoid_indices_], axis=1)
        else:
            check_is_fitted(self, "cluster_centers_")

            # Return data points to clusters based on which cluster assignment
            # yields the smallest distance
            return pairwise_distances_argmin(X,
                                             Y=self.cluster_centers_,
                                             metric=self.metric)
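
The final assignment step relies on pairwise_distances_argmin behaving like an explicit argmin over the full distance matrix; a quick self-contained check of that equivalence on toy data:

import numpy as np
from sklearn.metrics import pairwise_distances, pairwise_distances_argmin

rng = np.random.RandomState(0)
X = rng.randn(10, 3)
centers = rng.randn(4, 3)
labels_fast = pairwise_distances_argmin(X, centers)
labels_full = pairwise_distances(X, centers).argmin(axis=1)
assert np.array_equal(labels_fast, labels_full)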
    def get_best_thread(self, question, tag_name):
        """ Returns id of the most similar thread for the question.
            The search is performed across the threads with a given tag.
            
            Load the candidate set for the given tag,
            convert the question into a vector,
            and find the most similar thread.
        """
        thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name)
        print("thread_ids:", thread_ids)
        print("thread_embeddings:", thread_embeddings)

        # HINT: you have already implemented a similar routine in the 3rd assignment.
        #### YOUR CODE HERE ####
        question_vec = question_to_vec(question, self.word_embeddings,
                                       self.embeddings_dim)
        #### YOUR CODE HERE ####
        # find the most similar one among the 383456 candidate threads
        #print(question_vec)
        #print(thread_embeddings)
        question_vec = question_vec.reshape(1, -1)  # single sample: reshape into one row
        best_thread = pairwise_distances_argmin(question_vec,
                                                thread_embeddings)

        print("best_thread:", best_thread[0])
        # print thread_ids to inspect its structure
        return thread_ids[best_thread[0]]
Example #11
    def get_best_thread(self, question, tag_name):
        """ Returns id of the most similar thread for the question.
            The search is performed across the threads with a given tag.
        """
        thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name)

        # HINT: you have already implemented a similar routine in the 3rd assignment.

        question_vec = question_to_vec(
            question, self.word_embeddings,
            self.embeddings_dim)  #### YOUR CODE HERE ####
        """
        question_vec = np.reshape(question_vec,(1,self.embeddings_dim))
        sim_list = []
        for ind, can_emb in zip(thread_ids, thread_embeddings):
            sim = cosine_similarity(question_vec,can_emb)
            sim_list.append(sim[0][0])
        
        
        sort_ind = np.argsort(sim_list)[::-1]
        """
        print(thread_embeddings.shape)
        print(thread_ids.shape)

        best_thread = pairwise_distances_argmin(
            thread_embeddings, question_vec.reshape(1, -1),
            axis=0)[0]  # sort_ind[0] #### YOUR CODE HERE ####
        print(best_thread)
        return thread_ids[best_thread]
    def aggpool(self):
        # get the node features
        # self_vec = Ememory.node_feature_list()  # in principle a helper could do this directly
        self_vec = []  # dataset = []
        for node in list(self.Gmemory.nodes()):
            self_vec.append(self.Gmemory.nodes[node]['attributes'])
        # print("self_vec", self_vec)

        # self_vec_matrix = np.array(self_vec)
        # self_vec_matrix3 = self_vec_matrix[:,np.newaxis]

        # print("self_vec", self_vec_matrix)
        # get the representations of all nodes
        Gnodes = [n for n in self.Gmemory.nodes()]
        # run random walks on the graph
        n_walks = 4
        pairs, feature_matrix = self.run_random_walks(self.Gmemory, Gnodes, self.memory_word_size, n_walks)  # find 5 two-hop neighbours per node, keyed by node index
        # print("feature_matrix", feature_matrix)  # (n, n_walks, dim_obs)
        # delta_feature = feature_matrix - self_vec_matrix3
        # build the feature tensor
        neigh_vecs = tf.cast(feature_matrix, tf.float32)  # columns: nodes, rows: neighbours, each element a vector
        # neigh_vecs = tf.cast(delta_feature, tf.float32)

        # aggregate first
        # print("selfvec", self_vec, "neigh_vecs", neigh_vecs)
        # outputs = self.aggregator(self_vec, neigh_vecs)  # with trainable parameters
        outputs = self.aggregator.aggwithoutpara(self_vec, neigh_vecs)  # parameter-free: sum and average
        # print("output", outputs)
        # then pool
        # k = 0.3
        # subgraph, center_list = self.sagpooling(self.Gmemory, Gnodes, outputs, k)
        # finally obtain the nodes of the subgraph
        # center_list = subgraph.nodes()

        # use clustering to downsample
        FeatureDict = dict(zip(Gnodes, list(outputs)))

        # print("FeatureDict", keys)

        # for node in Gnodes:
        #     FeatureDict[node]
        t1 = 120
        t2 = 100
        self.gc = Cluster()  # needs adapting from its original function; cannot be dropped in here as-is
        self.gc.setThreshold(t1, t2)
        canopies = self.gc.clustering(FeatureDict)
        # print("canopies", len(canopies))
        center_list = []
        # for i in range(len(canopies)):
        #     center_list.append(canopies[i][0])  # this step must map features back onto node labels
        k_means = KMeans(n_clusters=len(canopies))
        k_means.fit_predict(outputs)
        k_means_cluster_centers = k_means.cluster_centers_
        # print("k_means_cluster_centers", k_means_cluster_centers)
        argmin = pairwise_distances_argmin(k_means_cluster_centers, outputs, metric='euclidean')
        # print("argmin", argmin)
        for t in argmin:
            center_list.append(Gnodes[t])

        return center_list
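
The last step above is a medoid-style selection: each k-means center is mapped back to the nearest actual sample. A self-contained sketch of just that step (synthetic features; the names are illustrative):

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin

rng = np.random.RandomState(0)
feats = rng.randn(50, 8)
km = KMeans(n_clusters=5, n_init=10).fit(feats)
rep_idx = pairwise_distances_argmin(km.cluster_centers_, feats, metric='euclidean')
print(rep_idx)  # indices of the 5 samples closest to the 5 centers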
    def get_best_thread(self, question, tag_name):
        """ Returns id of the most similar thread for the question.
            The search is performed across the threads with a given tag.
        """
        thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name)

        # HINT: you have already implemented a similar routine in the 3rd assignment.

        question_vec = question_to_vec(
            question, self.word_embeddings,
            self.embeddings_dim)  #### YOUR CODE HERE ####
        #        tag_w2v = unpickle_file(os.path.join(RESOURCE_PATH['THREAD_EMBEDDINGS_FOLDER'], os.path.normpath('%s.pkl' % tag_name))
        #        flag = 0

        [best_thread] = pairwise_distances_argmin(X=question_vec.reshape(
            1, self.embeddings_dim),
                                                  Y=thread_embeddings,
                                                  metric='cosine')
        #        for i in range(len(thread_ids)):
        #            if i == 0:
        #                mx_sim = cos_sim(question_vec, thread_embeddings[0])
        #                best_thread = 0
        #                continue
        #            if cos_sim(question_vec, thread_embeddings[i]) > mx_sim:
        #                best_thread = i  #### YOUR CODE HERE ####
        #        print(best_thread)
        return thread_ids[best_thread]
Example #14
    def get_best_thread(self, question, tag_name):
        """ Returns id of the most similar thread for the question.
            The search is performed across the threads with a given tag.
        """
        thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name)

        # HINT: you have already implemented a similar routine in the 3rd assignment.

        question_vec = np.reshape(
            question_to_vec(question, self.word_embeddings,
                            self.embeddings_dim),
            (1, self.embeddings_dim))  #### YOUR CODE HERE ####
        print(thread_embeddings)
        print('Qvec')
        print(question_vec)
        print('dist_argmin_ans',
              pairwise_distances_argmin(thread_embeddings, question_vec))
        sim_vals = cosine_similarity(thread_embeddings, question_vec)

        best_thread = np.argmax(sim_vals[:, 0])  #### YOUR CODE HERE ####
        print('best_thread', best_thread)
        print('sim_vals shape', sim_vals.shape)
        print('0th element of thread_ids', thread_ids.iloc[0])
        print('answer', thread_ids.iloc[best_thread])
        return thread_ids.iloc[best_thread]  #thread_ids[best_thread]
    def get_best_thread(self, question, tag_name):
        """ Returns id of the most similar thread for the question.
            The search is performed across the threads with a given tag.
        """
        thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name)

        # HINT: you have already implemented a similar routine in the 3rd assignment.

        #print('Thread Ranker : question',question)
        #print('Thread Ranker : tag_name',tag_name)

        question_vec = question_to_vec(question, self.word_embeddings,
                                       300).reshape(1,
                                                    -1)  # YOUR CODE HERE ####

        #print('Thread Ranker : question_vec',question_vec)

        # print(question_vec.shape)
        # print(thread_embeddings.shape)
        # print(thread_ids.shape)

        best_thread = pairwise_distances_argmin(question_vec,
                                                thread_embeddings)[0]

        # print(best_thread)

        # print(thread_ids[best_thread:best_thread+1])

        return thread_ids.values[best_thread]
Example #16
    def get_best_thread(self, question, tag_name):

        #Returns id of the most similar thread for the question.
        #The search is performed across the threads with a given tag.

        thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name)
        question_vec = question_to_vec(question, self.word_embeddings,
                                       self.embeddings_dim)
        best_thread = pairwise_distances_argmin(X=question_vec.reshape(
            1, self.embeddings_dim),
                                                Y=thread_embeddings,
                                                metric='cosine')
        best_thread_similarity = np.min(
            pairwise_distances(X=question_vec.reshape(1, self.embeddings_dim),
                               Y=thread_embeddings,
                               metric='cosine'))
        #print(best_thread_similarity)
        reply = self.programming.Main(question)
        if reply != "Please refer kammand prompt discord or ask you mentor for more info :)":
            return reply
        else:
            if best_thread_similarity >= 0.45:
                return f'I think its about {tag_name}\n This thread might help you: https://stackoverflow.com/questions/{thread_ids[best_thread][0]}'
            else:
                return "Please refer to kammand prompt discord or ask for your mentor"
Example #17
    def get_best_thread(self, question, tag_name):
        """ Returns id of the most similar thread for the question.
            The search is performed across the threads with a given tag.
        """
        print("@@@@@@@@")
        print(question)
        print(tag_name)
        print(type(tag_name))
        print("@@@@@@@@")
        thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name)

        # HINT: you have already implemented a similar routine in the 3rd assignment.

        # question_vec =question_to_vec(question, embeddings=self.word_embeddings, dim=50)
        # best_thread = rank_candidates(question_vec, thread_embeddings, dim=50)
        question = text_prepare(question)
        question_vec = question_to_vec(question, self.word_embeddings,
                                       50).reshape(1, -1)
        print('---')
        print("vecs:")
        print(question_vec[:, :5])
        print('~~')
        print(thread_embeddings[:3, :5])
        print('---')

        best_thread = pairwise_distances_argmin(question_vec,
                                                thread_embeddings,
                                                metric='cosine')[0]
        print(best_thread)
        print(thread_ids[best_thread])
        return thread_ids[best_thread]
def doKmeans(matrix, k, metric='cosine', batch_size=1024):
    mbk = MiniBatchKMeans(init='k-means++', n_clusters=k, batch_size=batch_size,
                          n_init=10, max_no_improvement=10, verbose=0)

    mbk.fit(matrix)

    return pairwise_distances_argmin(X=matrix, Y=mbk.cluster_centers_, metric=metric)
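
A hypothetical call, assuming the imports this snippet relies on (they are not shown in the snippet itself):

import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import pairwise_distances_argmin

rng = np.random.RandomState(0)
labels = doKmeans(rng.rand(200, 8), k=5)
print(labels.shape)  # (200,) -- one cluster id per row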
Example #19
def k_means_function(clust_data, figsize=(18, 6), **kwargs):
    k_means = KMeans(**kwargs)
    k_means.fit(clust_data)
    k_means_cluster_centers = k_means.cluster_centers_
    k_means_labels = pairwise_distances_argmin(clust_data,
                                               k_means_cluster_centers)
    plots_count = clust_data.shape[1] - 1
    f, ax = plt.subplots(1, plots_count, figsize=figsize)
    for i in range(kwargs.get('n_clusters')):
        my_members = k_means_labels == i
        cluster_center = k_means_cluster_centers[i]
        for j in range(plots_count):
            axis = ax if plots_count == 1 else ax[j]
            axis.plot(clust_data[my_members, j],
                      clust_data[my_members, j + 1],
                      'p',
                      markerfacecolor=colors[i],
                      marker='o',
                      markeredgecolor=colors[i],
                      markersize=4)
            axis.plot(cluster_center[j],
                      cluster_center[j + 1],
                      'o',
                      markerfacecolor=colors[i],
                      markeredgecolor='k',
                      markersize=8)
    return k_means_labels, ax
Example #20
    def get_best_thread(self, question, tag_name):
        """ Returns id of the most similar thread for the question.
            The search is performed across the threads with a given tag.
        """
        thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name)

        if MAX_TRDS_TO_LOAD is not None:  # sample a predefined number of tags
            indices = range(0, thread_ids.shape[0])
            random_indices_choice = np.random.choice(indices,
                                                     size=min(
                                                         len(indices),
                                                         MAX_TRDS_TO_LOAD),
                                                     replace=False)
            thread_ids = thread_ids[random_indices_choice, ]
            thread_embeddings = thread_embeddings[random_indices_choice, ]

        question_vec = question_to_vec(question, self.word_embeddings,
                                       self.embeddings_dim)

        min_dist = pairwise_distances_argmin(question_vec.reshape(1, -1),
                                             thread_embeddings,
                                             axis=1,
                                             metric='cosine')

        best_thread = min_dist[0]

        return thread_ids[best_thread]
Example #21
    def fit(
        self,
        x_,
    ):
        all_label = set(x_["label"].values)
        vectors_ = x_.sample(self.q_)
        # cover every class of samples
        while set(vectors_["label"].values) != all_label:
            vectors_ = x_.sample(self.q_)

        self.vectors_ = vectors_.drop(["label"], axis=1).values
        self.labels_ = vectors_["label"].values

        n_iter_ = 0
        while n_iter_ < self.max_iter_:
            #
            sample_ = x_.sample(1).values[0]
            label_ = self.labels_[pairwise_distances_argmin([sample_[:-1]],
                                                            self.vectors_)[0]]
            if label_ == sample_[-1]:
                vectors_ = self.vectors_ + self.eta_ * (self.vectors_ -
                                                        sample_[:-1])
            else:
                vectors_ = self.vectors_ - self.eta_ * (self.vectors_ -
                                                        sample_[:-1])

            if sum(sum((self.vectors_ - sample_[:-1])**2)) < self.tol_:
                print(n_iter_, self.vectors_)
                return self.vectors_

            self.vectors_ = vectors_

            n_iter_ += 1
        print(n_iter_, self.vectors_)
        return self.vectors_
Example #22
def main():
    parser = argparse.ArgumentParser(description='K-Means in Python')
    parser.add_argument('-f',
                        '--filename',
                        help='Name of the file',
                        required=True)
    parser.add_argument('-k',
                        '--k',
                        help='The number of clusters',
                        required=True,
                        type=int)
    args = parser.parse_args()

    filename = args.filename
    k = args.k

    df = pd.read_csv(filename, converters={'date_time': parse_dates})
    date_time = df['date_time']
    df = df.drop('date_time', 1)

    start = time.time()
    k_means = KMeans(init='random', n_clusters=k).fit(df)  # init='k-means++'
    print("[KMEANS] Finish all in {} seconds".format(time.time() - start))

    k_means_cluster_centers = np.sort(k_means.cluster_centers_, axis=0)
    k_means_labels = pairwise_distances_argmin(df.values,
                                               k_means_cluster_centers)

    df['date_time'] = date_time
    df['cluster'] = k_means_labels

    output_name = "/var/www/project/k_means_result_{}.txt".format(k)
    transform_save(df, output_name)
Example #23
    def simple_center_adjustment(self, cluster_labels, cluster_sizes):

        labels = cluster_labels.copy()
        sizes = cluster_sizes.copy()

        centers = list(
            map(lambda x: self.calculate_geo_cluster_center(cluster_labels, x),
                list(range(self.num_cluster))))
        centroids_dist_min = pairwise.pairwise_distances_argmin(
            self.geo_data, centers)

        filtering = [
            list(np.where(labels == x)[0]) for x in cluster_sizes.index
        ]
        order_index = [item for elem in filtering for item in elem]

        for i in order_index:
            bestcl = centroids_dist_min[i].astype(int)
            actualcl = labels[i].astype(int)

            if actualcl != bestcl and sizes[bestcl] < self.cluster_max_size:
                sizes[bestcl] += 1
                sizes[actualcl] -= 1
                labels[i] = bestcl

        return labels, sizes
def filter_level_pruning(percents):
    global args, best_prec1
    args = parser.parse_args()

    model = torch.nn.DataParallel(resnet.__dict__[args.arch]())
    model.cuda()

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.evaluate, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    # print(model)
    val_loader = torch.utils.data.DataLoader(datasets.CIFAR10(
        root='./data',
        train=False,
        transform=transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]),
        download=True),
                                             batch_size=128,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    criterion = nn.CrossEntropyLoss().cuda()

    num_clusters_full = calc_num_clusters(model, None, percents)
    print(num_clusters_full)
    i = 0
    with torch.no_grad():
        for name, param in model.named_parameters():
            if "conv" in name:
                num_channels, b, c, d = param.shape
                num_clusters = num_clusters_full[i]
                X = param.view(num_channels, -1).cpu()
                k_means = KMeans(init='k-means++',
                                 n_clusters=num_clusters,
                                 n_init=10)
                k_means.fit(X)
                cluster_centers = torch.from_numpy(k_means.cluster_centers_)
                cluster_ids_x = pairwise_distances_argmin(X, cluster_centers)
                for n, channel in enumerate(param):
                    param[n] = cluster_centers[cluster_ids_x[n]].view(b, c, d)
                i += 1
    return validate(val_loader, model, criterion)
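
The clustering step above implements weight sharing: every flattened filter is replaced by its nearest cluster center. A minimal numpy/sklearn sketch of that idea, detached from the PyTorch specifics:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin

rng = np.random.RandomState(0)
W = rng.randn(16, 9)  # 16 filters, 9 weights each
km = KMeans(init='k-means++', n_clusters=4, n_init=10).fit(W)
ids = pairwise_distances_argmin(W, km.cluster_centers_)
W_shared = km.cluster_centers_[ids]  # each filter replaced by its center
print(W_shared.shape)  # (16, 9)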
Example #25
def nearest_neighbor(s_samples, s_classes):
    """
    :param s_samples: (n_samples, attributes_dim)
    :param s_classes: (n_classes, attributes_dim)
    :return: (n_samples,), the index of the nearest class in S-space for each test sample
    """
    min_dist_pos = pairwise_distances_argmin(s_samples, s_classes)
    return np.array(min_dist_pos)
Example #26
def array_to_label(X_array,n_clusters=3):
    batch_size = 45
    mbk = MiniBatchKMeans(init='k-means++', n_clusters=n_clusters, batch_size=batch_size,
                        n_init=10, max_no_improvement=10, verbose=0)
    mbk.fit(X_array)
    mbk_means_cluster_centers = np.sort(mbk.cluster_centers_, axis=0)
    mbk_means_labels = pairwise_distances_argmin(X_array, mbk_means_cluster_centers)
    return mbk_means_labels
Example #27
def test_pairwise_distances_argmin_min():
    """ Check pairwise minimum distances computation for any metric"""
    X = [[0], [1]]
    Y = [[-1], [2]]

    # euclidean metric
    D, E = pairwise_distances_argmin_min(X, Y, metric="euclidean")
    D2 = pairwise_distances_argmin(X, Y, metric="euclidean")
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(D2, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Non-euclidean sklearn metric
    D, E = pairwise_distances_argmin_min(X, Y, metric="manhattan")
    D2 = pairwise_distances_argmin(X, Y, metric="manhattan")
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(D2, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Non-euclidean Scipy distance (callable)
    D, E = pairwise_distances_argmin_min(X, Y, metric=minkowski,
                                         metric_kwargs={"p": 2})
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Non-euclidean Scipy distance (string)
    D, E = pairwise_distances_argmin_min(X, Y, metric="minkowski",
                                         metric_kwargs={"p": 2})
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Compare with naive implementation
    rng = np.random.RandomState(0)
    X = rng.randn(97, 149)
    Y = rng.randn(111, 149)

    dist = pairwise_distances(X, Y, metric="manhattan")
    dist_orig_ind = dist.argmin(axis=0)
    dist_orig_val = dist[dist_orig_ind, range(len(dist_orig_ind))]

    dist_chunked_ind, dist_chunked_val = pairwise_distances_argmin_min(
        X, Y, axis=0, metric="manhattan", batch_size=50)
    np.testing.assert_almost_equal(dist_orig_ind, dist_chunked_ind, decimal=7)
    np.testing.assert_almost_equal(dist_orig_val, dist_chunked_val, decimal=7)
    def get_best_scheme(self, question, scheme_name):
       
        scheme_ids, scheme_embeddings = self.__load_embeddings_by_scheme(scheme_name)

        
        question_vec = question_to_vec(question, self.word_embeddings, self.embeddings_dim).reshape(1,-1)
        best_scheme = pairwise_distances_argmin(question_vec, scheme_embeddings)[0]
        
        return scheme_ids[best_scheme]
Example #29
def find_clusters(x, n):
    # find the clusters
    k_means = KMeans(n_clusters=n, n_init=100)
    k_means.fit(x)

    # determine which elements fall into each cluster
    k_means_centers = np.sort(k_means.cluster_centers_, axis=0)
    k_means_indexes = pairwise_distances_argmin(x, k_means_centers)
    return k_means_centers, k_means_indexes
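
Hypothetical usage on synthetic blobs (make_blobs is an assumption for the demo, not part of the snippet). Note that np.sort(..., axis=0) sorts each coordinate independently, so for multi-dimensional data the returned "centers" need not be actual cluster centers; this pattern recurs across several of these examples.

import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import pairwise_distances_argmin

x, _ = make_blobs(n_samples=300, centers=3, random_state=42)
centers, indexes = find_clusters(x, 3)
print(centers.shape, indexes.shape)  # (3, 2) (300,)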
Example #30
    def get_best_thread(self, question, tag_name):
        """ Returns id of the most similar thread for the question.
            The search is performed across the threads with a given tag.
        """
        thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name)
        question_vec = question_to_vec(question, self.word_embeddings, self.embeddings_dim)[np.newaxis, :]
        best_thread = pairwise_distances_argmin(question_vec, thread_embeddings, metric='cosine')[0]

        return thread_ids[best_thread]
    def get_best_category(self, question, category_name):
        
        category_ids, category_embeddings = self.__load_embeddings_by_category(category_name)

        
        question_vec = question_to_vec(question, self.word_embeddings, self.embeddings_dim).reshape(1,-1)
      
        best_category = pairwise_distances_argmin(question_vec, category_embeddings)[0]
        
        return category_ids[best_category]
Example #32
 def find_nearest(self, X, means):
     nearest = []
     for _ in means:
         nearest.append([])
     #print(means)
     index = pairwise_distances_argmin(X, means)
     #print(index)
     for i, ind in enumerate(index):
         nearest[ind].append(X[i])
     return nearest
Example #33
 def _k_means_discriminator(self, batch_size=45):
     from sklearn.cluster import MiniBatchKMeans
     from sklearn.metrics.pairwise import pairwise_distances_argmin
     
     mbk = MiniBatchKMeans(init='k-means++', n_clusters=2, batch_size=batch_size,
                   n_init=10, max_no_improvement=10, verbose=0)
     #t0 = time.time()
     X = np.log10(self.ax.reshape(-1, 1))
     mbk.fit(X)
     cc = np.sort(mbk.cluster_centers_,axis=0)
     self.clusters = pairwise_distances_argmin(X,cc)
 def patch_partition(self, patch_list, n_clusters=30,patch_size=(21,21)):
     """
      Takes a list of patches; returns the cluster centers and the cluster labels.
     :param patch_list:
     :param n_clusters:
     :return:
     """
     patch_data = np.array(patch_list)
     patch_data = np.reshape(patch_data, (-1, self.patch_size * self.patch_size * self.patch_depth))
     k_means = KMeans(n_clusters=n_clusters)
     t0 = time.time()
     k_means.fit(patch_data)
     k_means_cluster_centers = np.sort(k_means.cluster_centers_, axis=0)
     k_means_labels = pairwise_distances_argmin(patch_data, k_means_cluster_centers)
     t_batch = time.time() - t0
     print ("%d个patch进行Kmean聚类,分成%d类,耗时%d秒" % (patch_data.shape[0], n_clusters, t_batch))
     return np.reshape(np.array(k_means_cluster_centers), (-1, patch_size[0], patch_size[1])), k_means_labels
Example #35
 def _k_means_discriminator(self, batch_size=45):
     from sklearn.cluster import MiniBatchKMeans
     from sklearn.metrics.pairwise import pairwise_distances_argmin
     
     mbk = MiniBatchKMeans(init='k-means++', n_clusters=self.nsources+1, 
                           batch_size=batch_size,
                           n_init=10, max_no_improvement=10, verbose=0)
     #t0 = time.time()
     X = np.log10(self.ax)
     mbk.fit(X)
     cc = np.zeros(mbk.cluster_centers_.shape)
     # index of cluster corresponding to silence
     idx_silence = np.argmin(np.sum(mbk.cluster_centers_,axis=1))
     cc[0,:] = mbk.cluster_centers_[idx_silence,:]
      idx_free = list(range(cc.shape[0]))  # a list, so .remove() works under Python 3
     idx_free.remove(idx_silence)
     cred = mbk.cluster_centers_-cc[0,:]
     # remaining indexes, sort them by channel
     used_chan=[]
     nchan = cc.shape[1]
     last_unmatched=0
     while idx_free:
         crem = cred[idx_free,:]
         r,idx_chan = np.unravel_index(crem.argmax(),crem.shape)
         idx_center = idx_free[r]
         if idx_chan not in used_chan:
             this_center = idx_chan+1
         else:
             # append to end of list
             this_center = cc.shape[0]-last_unmatched-1
             sys.stderr.write('Cluster {} not matched to channel\n'.format(idx_center))
         cc[this_center,:]=mbk.cluster_centers_[idx_center,:]
         used_chan.append(idx_chan)
         idx_free.remove(idx_center)
     
     cc[1:,:] = np.delete(mbk.cluster_centers_,idx_silence,axis=0)
     #cc = mbk.cluster_centers_[idxs,:]
     self.clusters = pairwise_distances_argmin(X,cc)
     self.centers = cc


plt.plot(tsne_emb[:, 0], tsne_emb[:, 1], 'k.', markersize=2)
# Plot the centroids as a white X
centroids = k_means.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1],
            marker='x', s=169, linewidths=3,
            color='w', zorder=10)




k_means_cluster_centers = np.sort(k_means.cluster_centers_, axis=0)
k_means_labels = pairwise_distances_argmin(tsne_emb, k_means_cluster_centers)
# KMeans
# for k in range(n_clusters):
#     my_members = k_means_labels == k
#     cluster_center = k_means_cluster_centers[k]
#     plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=10)




mc_words = frequency.most_common(200)
mc_words = [w[0] for w in mc_words]

final_points = []
final_voc = []
for word, values in dico.items():
def Gross_K_means(Data_list, n_clusters, n_init, max_iter, weight):

	# K-means
	#n_clusters = 9
	#n_init = 100
	#max_iter = 100
	# top 11 colors
	colors = ['firebrick', 'orange', 'red', 'yellow','green', 'tan', 'skyblue', 'blue', 'violet', 'grey','magenta']

	##############################################################################
	# convert to numpy array
	temp_data = Data_list
	Data_TH = np.array(temp_data[1:][:])
	# rewrite array format into data points
	DataPoints = []
	for i in range(len(Data_TH[0])):
		DataPoints.append([Data_TH[0,i],Data_TH[1,i],Data_TH[2,i],weight*Data_TH[3,i],weight*Data_TH[4,i],
			weight*Data_TH[5,i],weight*Data_TH[6,i]])
	DataPoints = np.array(DataPoints)

	##############################################################################
	# plot data for demonstration
	fig = plt.figure(figsize=(16, 16))
	fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
	
	# Score1 and Score2
	ax = fig.add_subplot(2, 3, 1)
	ax.plot(DataPoints[:, 0], DataPoints[:, 1], 'w', markerfacecolor='blue', marker='.')
	ax.set_title('Score1 vs Score2')
	label_range = [i-2 for i in range(14)]
	ax.set_xticks(label_range)
	ax.set_xticklabels(label_range)
	ax.set_yticks(label_range)
	ax.set_yticklabels(label_range)	

	# Muser_degree_abs and Muser_degree_ws
	ax = fig.add_subplot(2, 3, 2)
	ax.plot(DataPoints[:, 3], DataPoints[:, 4], 'w', markerfacecolor='blue', marker='.')
	ax.set_title('User_Degree_abs vs User_Degree_weighted')
	label_range = [weight*(i+2) for i in range(10)]
	ax.set_xticks(label_range)
	ax.set_xticklabels(label_range)
	ax.set_yticks(label_range)
	ax.set_yticklabels(label_range)	

	# Tag_Degree_abs and Tag_Degree_weighted
	ax = fig.add_subplot(2, 3, 3)
	ax.plot(DataPoints[:, 5], DataPoints[:, 6], 'w', markerfacecolor='blue', marker='.')
	ax.set_title('Tag_Degree_abs vs Tag_Degree_weighted')
	label_range = [weight*(i+2) for i in range(10)]
	ax.set_xticks(label_range)
	ax.set_xticklabels(label_range)
	ax.set_yticks(label_range)
	ax.set_yticklabels(label_range)	

	# Score1 and Norm_totalAction
	ax = fig.add_subplot(2, 3, 4)
	ax.plot(DataPoints[:, 0], DataPoints[:, 2], 'w', markerfacecolor='blue', marker='.')
	ax.set_title('Score1 vs Norm_totalAction')
	label_x = [i-2 for i in range(14)]
	label_y = [i+2 for i in range(10)]
	ax.set_xticks(label_x)
	ax.set_xticklabels(label_x)
	ax.set_yticks(label_y)
	ax.set_yticklabels(label_y)	

	# Score2 and Norm_totalAction
	ax = fig.add_subplot(2, 3, 5)
	ax.plot(DataPoints[:, 1], DataPoints[:, 2], 'w', markerfacecolor='blue', marker='.')
	ax.set_title('Score2 vs Norm_totalAction')
	label_x = [i-2 for i in range(14)]
	label_y = [i+2 for i in range(10)]
	ax.set_xticks(label_x)
	ax.set_xticklabels(label_x)
	ax.set_yticks(label_y)
	ax.set_yticklabels(label_y)	
	
	# tag_degree_abs and user_degree_abs
	ax = fig.add_subplot(2, 3, 6)
	ax.plot(DataPoints[:, 5], DataPoints[:, 3], 'w', markerfacecolor='blue', marker='.')
	ax.set_title('tag_degree_abs vs user_degree_abs')
	label_x = [weight*(i+2) for i in range(10)]
	label_y = [weight*(i+2) for i in range(10)]
	ax.set_xticks(label_x)
	ax.set_xticklabels(label_x)
	ax.set_yticks(label_y)
	ax.set_yticklabels(label_y)	

	plt.savefig('../output/Tag_DataDemo.png')
	plt.show()


	##############################################################################

	##############################################################################

	# K-means with Scores ONLY
	print "K-means with Scores ONLY"
	k_means = KMeans(init='k-means++', n_clusters=n_clusters, n_init=n_init, max_iter = max_iter)
	k_means.fit(DataPoints[:,:2])
	Scores_labels = k_means.labels_
	Scores_cluster_centers = k_means.cluster_centers_
	Scores_labels_unique = np.unique(Scores_labels)

	# K-means with Scores and user_degree_abs and tag_degree_abs
	print "K-means with Scores and user_degree_abs and tag_degree_abs"
	index = [0,1,3,5]
	k_means.fit(DataPoints[:,index])

	Scores_bothDegree_labels = k_means.labels_
	Scores_bothDegree_centers = k_means.cluster_centers_
	Scores_bothDegree_unique = np.unique(Scores_bothDegree_labels)

	##############################################################################
	# We want to have the same colors for the same cluster from the
	# MiniBatchKMeans and the KMeans algorithm. Let's pair the cluster centers per
	# closest one.
	
	order_STU = pairwise_distances_argmin(Scores_cluster_centers, Scores_bothDegree_centers[:,0:2])

	print "\nScores_cluster_centers:\n ", Scores_cluster_centers
	print "\nScores_bothDegree_centers:\n ", Scores_bothDegree_centers[order_STU]

	##############################################################################

	##############################################################################

	index = [0,1,3,5]

	# PCA on Scores plus user_degree_abs and tag_degree_abs
	pca = PCA(n_components=4)
	X_r = pca.fit(DataPoints[:,index]).transform(DataPoints[:,index])

	##############################################################################

	##############################################################################
	# Plot result
	fig = plt.figure(figsize=(16, 16))
	fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
	
	# Scores
	ax = fig.add_subplot(2, 3, 1)
	for k, col in zip(range(n_clusters), colors):
		my_members = Scores_labels == k
		cluster_center = Scores_cluster_centers[k]	
		ax.plot(DataPoints[:, 0], DataPoints[:, 1], 'o', markerfacecolor='blue',
				markeredgecolor='k', markersize=6)

	ax.set_title('Full Data Set, Score1 vs Score2')
	label_range = [i-2 for i in range(14)]
	ax.set_xticks(label_range)
	ax.set_xticklabels(label_range)
	ax.set_yticks(label_range)
	ax.set_yticklabels(label_range)	

	# Scores and both_Degrees
	ax = fig.add_subplot(2, 3, 2)
	for k, col in zip(range(n_clusters), colors):
		my_members = Scores_bothDegree_labels == order_STU[k]
		cluster_center = Scores_bothDegree_centers[order_STU[k]]	
		ax.plot(DataPoints[my_members, 0], DataPoints[my_members, 1], 'o', markerfacecolor=col,
				markeredgecolor='k', markersize=6)

	ax.set_title('K-Means by Scores & Degrees, Score1 vs Score2')
	label_range = [i-2 for i in range(14)]
	ax.set_xticks(label_range)
	ax.set_xticklabels(label_range)
	ax.set_yticks(label_range)
	ax.set_yticklabels(label_range)	

	# user_degree vs tag_degree
	ax = fig.add_subplot(2, 3, 3)
	for k, col in zip(range(n_clusters), colors):
		my_members = Scores_bothDegree_labels == order_STU[k]
		cluster_center = Scores_bothDegree_centers[order_STU[k]]	
		ax.plot(DataPoints[my_members, 3], DataPoints[my_members, 5], 'o', markerfacecolor=col,
				markeredgecolor='k', markersize=6)

	ax.set_title('K-Means by Scores & Degrees, user_degree vs tag_degree')
	label_range = [weight*(i+2) for i in range(10)]
	ax.set_xticks(label_range)
	ax.set_xticklabels(label_range)
	ax.set_yticks(label_range)
	ax.set_yticklabels(label_range)	

	# user_degree vs tag_degree (full data set)
	ax = fig.add_subplot(2, 3, 4)
	for k, col in zip(range(n_clusters), colors):
		my_members = Scores_labels == k
		ax.plot(DataPoints[:, 3], DataPoints[:, 5], 'o', markerfacecolor='blue',
				markeredgecolor='k', markersize=6)
	ax.set_title('Full Data Set, user_degree vs tag_degree')

	# PCA 1 vs 2
	ax = fig.add_subplot(2, 3, 5)
	for k, col in zip(range(n_clusters), colors):
		my_members = Scores_bothDegree_labels == order_STU[k]
		ax.plot(X_r[my_members, 0], X_r[my_members, 1], 'o', markerfacecolor=col,
				markeredgecolor='k', markersize=6)
	ax.set_title('K-Means by Score & Degrees, PCA component 1 vs 2')

	# PCA 1 vs 2
	ax = fig.add_subplot(2, 3, 6)
	ax.plot(X_r[:, 0], X_r[:, 1], 'o', markerfacecolor='blue',
				markeredgecolor='k', markersize=6)
	ax.set_title('Full Data Set, PCA component 1 vs 2')

	####################################################################
	plt.savefig('../output/Tag_Kmeans_N{}_W{}.png'.format(n_clusters, weight))
	plt.show()
Example #38
    def _k_mean(self,samples,pits):

        #print samples
        k_means = KMeans(init='k-means++', n_clusters=N_CLUSTERS, n_init=100)
        k_means.fit(samples)
        k_means_labels = k_means.labels_

        #print k_means.labels_
        k_means_cluster_centers = k_means.cluster_centers_
        classif=k_means.predict(samples)
        
        ####SAVE THE MODEL 
        
        #joblib.dump(k_means, 'kmean.pkl')
        
        ###RELOAD THE MODEL 
        
        #k_means = joblib.load('kmean.pkl') 
        
        batch_size = 100
        
        self._scores = classif
        
        
        classif_df=pd.DataFrame(classif,index=np.arange(1,len(classif)+1))
        classif_df.columns = ['scores']

        pit_df=pd.DataFrame(pits,index=np.arange(1,len(pits)+1))
        pit_df.columns = ['pits']

        result_per_pit = pd.concat([classif_df, pit_df], axis=1,verify_integrity=False)
        #print "result per pit in kmean function"
        #print result_per_pit
        #result_per_pit=pd.DataFrame(result_per_pit,index=np.arange(0,len(result_per_pit)))
        self.result = pd.concat([result_per_pit.groupby('pits')['scores'].sum(), result_per_pit.groupby('pits')['scores'].count()], axis=1,verify_integrity=False)
        self.result.columns = ['scores','count']
        self.result.to_csv(os.path.join(OUTPUT_DIR, "final_result.csv"),sep=",")
        # result_per_pit.columns = ['pits','scores','count']
        result_per_pit.to_csv(os.path.join(OUTPUT_DIR, "pre_result_simple_k_means_freedman.csv"),sep=",")
        #print result_per_pit.reindex(range(119))
        #print result_per_pit[:1]
        
        result_per_pit= result_per_pit[['scores','pits']].values
        result_per_pit_df=pd.DataFrame(result_per_pit,index=np.arange(len(result_per_pit)))
        result_per_pit_df.columns =['scores','pits']
        #result_per_pit_df = result_per_pit_df.set_index('index')
        
        grouped = result_per_pit_df.groupby('pits')
        
        for pit, cluster in grouped:
            print(cluster)
        print(result_per_pit_df)
        
        
        ############################################################################################
        # Compute clustering with MiniBatchKMeans
        ############################################################################################
        
        mbk = MiniBatchKMeans(init='k-means++', n_clusters=N_CLUSTERS, batch_size=batch_size,
                              n_init=100, max_no_improvement=10, verbose=0)
        
        t0 = time()

        mbk.fit(samples)
        t_mini_batch = time() - t0
        mbk_means_labels = mbk.labels_
        mbk_means_cluster_centers = mbk.cluster_centers_

        mbk_means_labels_unique = np.unique(mbk_means_labels)
        
        
        print(mbk.labels_)
        classif = mbk.predict(samples)
        print(classif)
        
        classif_df=pd.DataFrame(classif,index=np.arange(1,len(classif)+1))
        classif_df.columns = ['scores']

        pit_df=pd.DataFrame(pits,index=np.arange(1,len(pits)+1))
        pit_df.columns = ['pits']

        result_per_pit = pd.concat([classif_df, pit_df], axis=1,verify_integrity=False)
        print "result per pit in Mini Batch KMeans function"
        print result_per_pit
        result_per_pit.to_csv(os.path.join(OUTPUT_DIR, "pre_result_batch_k_means_freedman.csv"),sep=",")

        result_per_pit= result_per_pit[['scores','pits']].values
        result_per_pit_df=pd.DataFrame(result_per_pit,index=np.arange(len(result_per_pit)))
        result_per_pit_df.columns =['scores','pits']
        #result_per_pit_df = result_per_pit_df.set_index('index')
        
        grouped = result_per_pit_df.groupby('pits')
        
        for pit, cluster in grouped:
            print(cluster)
        print(result_per_pit_df)
        ##############################################################################
        # Plot result

        fig = plt.figure(figsize=(8, 3))
        fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
        colors = ['#4EACC5', '#FF9C34', '#4E9A06','#4E9A06']#,'#555555']

        # KMeans
        ax = fig.add_subplot(1, 3, 1)
        for k, col in zip(range(N_CLUSTERS), colors):
            my_members = k_means_labels == k
            cluster_center = k_means_cluster_centers[k]
            ax.plot(samples[my_members, 0], samples[my_members, 1], 'w',
                    markerfacecolor=col, marker='.')
            ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
                    markeredgecolor='k', markersize=6)
        ax.set_title('KMeans')
        ax.set_xticks(())
        ax.set_yticks(())
        
        
        # We want to have the same colors for the same cluster from the
        # MiniBatchKMeans and the KMeans algorithm. Let's pair the cluster centers per
        # closest one.

        order = pairwise_distances_argmin(k_means_cluster_centers,
                                          mbk_means_cluster_centers)
        
        
        # MiniBatchKMeans
        ax = fig.add_subplot(1, 3, 2)
        for k, col in zip(range(N_CLUSTERS), colors):
            my_members = mbk_means_labels == order[k]
            cluster_center = mbk_means_cluster_centers[order[k]]
            ax.plot(samples[my_members, 0], samples[my_members, 1], 'w',
                    markerfacecolor=col, marker='.')
            ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
                    markeredgecolor='k', markersize=6)
        ax.set_title('MiniBatchKMeans')
        ax.set_xticks(())
        ax.set_yticks(())
        plt.text(-3.5, 1.8, 'train time: %.2fs\ninertia: %f' %
                 (t_mini_batch, mbk.inertia_))
        
    
        plt.show()
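
The center-pairing trick used above generalizes: pairwise_distances_argmin maps each row of one array of cluster centers to the index of its nearest row in the other, which is what lets cluster k of one model reuse the color of its closest counterpart in the second. A minimal, self-contained sketch (the toy centers are illustrative, not taken from the pipeline above):

import numpy as np
from sklearn.metrics import pairwise_distances_argmin

centers_a = np.array([[0.0, 0.0], [5.0, 5.0], [10.0, 0.0]])
centers_b = np.array([[10.1, 0.2], [0.1, -0.1], [4.8, 5.2]])  # same centers, shuffled

# centers_a[k] is paired with centers_b[order[k]]
order = pairwise_distances_argmin(centers_a, centers_b)
print(order)  # [1 2 0]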
def test_pairwise_distances_argmin_min():
    # Check pairwise minimum distances computation for any metric
    X = [[0], [1]]
    Y = [[-2], [3]]

    Xsp = dok_matrix(X)
    Ysp = csr_matrix(Y, dtype=np.float32)

    expected_idx = [0, 1]
    expected_vals = [2, 2]
    expected_vals_sq = [4, 4]

    # euclidean metric
    idx, vals = pairwise_distances_argmin_min(X, Y, metric="euclidean")
    idx2 = pairwise_distances_argmin(X, Y, metric="euclidean")
    assert_array_almost_equal(idx, expected_idx)
    assert_array_almost_equal(idx2, expected_idx)
    assert_array_almost_equal(vals, expected_vals)
    # sparse matrix case
    idxsp, valssp = pairwise_distances_argmin_min(Xsp, Ysp, metric="euclidean")
    assert_array_almost_equal(idxsp, expected_idx)
    assert_array_almost_equal(valssp, expected_vals)
    # We don't want np.matrix here
    assert_equal(type(idxsp), np.ndarray)
    assert_equal(type(valssp), np.ndarray)

    # euclidean metric squared
    idx, vals = pairwise_distances_argmin_min(X, Y, metric="euclidean",
                                              metric_kwargs={"squared": True})
    assert_array_almost_equal(idx, expected_idx)
    assert_array_almost_equal(vals, expected_vals_sq)

    # Non-euclidean scikit-learn metric
    idx, vals = pairwise_distances_argmin_min(X, Y, metric="manhattan")
    idx2 = pairwise_distances_argmin(X, Y, metric="manhattan")
    assert_array_almost_equal(idx, expected_idx)
    assert_array_almost_equal(idx2, expected_idx)
    assert_array_almost_equal(vals, expected_vals)
    # sparse matrix case
    idxsp, valssp = pairwise_distances_argmin_min(Xsp, Ysp, metric="manhattan")
    assert_array_almost_equal(idxsp, expected_idx)
    assert_array_almost_equal(valssp, expected_vals)

    # Non-euclidean Scipy distance (callable)
    idx, vals = pairwise_distances_argmin_min(X, Y, metric=minkowski,
                                              metric_kwargs={"p": 2})
    assert_array_almost_equal(idx, expected_idx)
    assert_array_almost_equal(vals, expected_vals)

    # Non-euclidean Scipy distance (string)
    idx, vals = pairwise_distances_argmin_min(X, Y, metric="minkowski",
                                              metric_kwargs={"p": 2})
    assert_array_almost_equal(idx, expected_idx)
    assert_array_almost_equal(vals, expected_vals)

    # Compare with naive implementation
    rng = np.random.RandomState(0)
    X = rng.randn(97, 149)
    Y = rng.randn(111, 149)

    dist = pairwise_distances(X, Y, metric="manhattan")
    dist_orig_ind = dist.argmin(axis=0)
    dist_orig_val = dist[dist_orig_ind, range(len(dist_orig_ind))]

    dist_chunked_ind, dist_chunked_val = pairwise_distances_argmin_min(
        X, Y, axis=0, metric="manhattan")
    np.testing.assert_almost_equal(dist_orig_ind, dist_chunked_ind, decimal=7)
    np.testing.assert_almost_equal(dist_orig_val, dist_chunked_val, decimal=7)
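
For reference, the invariant this test exercises can be stated as a minimal usage sketch: pairwise_distances_argmin_min returns, for each row of X, the index of its nearest row in Y together with that minimum distance, matching the naive full-distance-matrix computation (the random shapes below are illustrative):

import numpy as np
from sklearn.metrics import pairwise_distances, pairwise_distances_argmin_min

rng = np.random.RandomState(42)
X = rng.randn(5, 3)
Y = rng.randn(7, 3)

idx, vals = pairwise_distances_argmin_min(X, Y, metric="euclidean")
dist = pairwise_distances(X, Y, metric="euclidean")
assert np.array_equal(idx, dist.argmin(axis=1))
assert np.allclose(vals, dist.min(axis=1))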
def Gross_K_means():

	#######################################################################

	MySQL_DBkey2 = {'host':'localhost', 'user':'', 'password':'', 'db':'','charset':'utf8mb4'}

	# command
	comd_Score_TH = "\
	select tagScore_T, tagSCore_H, step1_score_t, step1_score_h\n\
	from tag_compare_all\n\
	where step1_score_t > 0 or step1_score_h > 0 or tagScore_t >= 5 or tagScore_h >= 5;\n"

	temp_data = [[],[],[],[]]

	# Connect to the database
	connection = pymysql.connect(host=MySQL_DBkey2['host'],
								 user=MySQL_DBkey2['user'],
								 password=MySQL_DBkey2['password'],
								 db=MySQL_DBkey2['db'],
								 charset=MySQL_DBkey2['charset'],
								 cursorclass=pymysql.cursors.DictCursor)
	try:
		with connection.cursor() as cursor:
			cursor.execute(comd_Score_TH)
			result = cursor.fetchall()
			# result is a list of dicts keyed by column name
			for item in result:
				temp_data[0].append(item['tagScore_T'])
				temp_data[1].append(item['tagScore_H'])
				temp_data[2].append(item['step1_score_t'])
				temp_data[3].append(item['step1_score_h'])
	finally:
		connection.close()

	#######################################################################
	# data check

	Data_TH = np.array(temp_data)

	plt.scatter(Data_TH[0],Data_TH[1],color='black')
	axes = plt.gca()
	axes.set_xlim([-1,11])
	axes.set_ylim([-1,11])

	plt.show()

	plt.scatter(Data_TH[2],Data_TH[3],color='black')
	axes = plt.gca()
	axes.set_xlim([-1,11])
	axes.set_ylim([-1,11])

	plt.show()

	##################################################################
	# rewrite array format into data points

	# tagScores only; x: Trump, y: Hillary
	Data_tagScore_TH = []
	for i in range(len(Data_TH[1])):
		Data_tagScore_TH.append([Data_TH[0,i],Data_TH[1,i]])
	Data_tagScore_TH = np.array(Data_tagScore_TH)

	# StepScores only; x: Trump, y: Hillary
	Data_stepScore_TH = []
	for i in range(len(Data_TH[1])):
		Data_stepScore_TH.append([Data_TH[2,i],Data_TH[3,i]])
	Data_stepScore_TH = np.array(Data_stepScore_TH)

	# all four dimensions; x Trump, y Hillary: tagScore pair, then stepScore pair
	Data_4D_TH = []
	for i in range(len(Data_TH[1])):
		Data_4D_TH.append([Data_TH[0,i],Data_TH[1,i],Data_TH[2,i],Data_TH[3,i]])
	Data_4D_TH = np.array(Data_4D_TH)

	# re-fill empty step-score dimensions
	for i in range(len(Data_stepScore_TH)):
		# Data_stepScore_TH
		if Data_stepScore_TH[i,0] == 0:
			Data_stepScore_TH[i,0] = Data_tagScore_TH[i,0]
		if Data_stepScore_TH[i,1] == 0:
			Data_stepScore_TH[i,1] = Data_tagScore_TH[i,1]
		
	for i in range(len(Data_4D_TH)):
		# Data_4D_TH
		if Data_4D_TH[i,2] == 0:
			Data_4D_TH[i,2] = Data_tagScore_TH[i,0]
		if Data_4D_TH[i,3] == 0:
			Data_4D_TH[i,3] = Data_tagScore_TH[i,1]
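
	# The two refill loops above can also be written vectorized; an equivalent
	# np.where sketch (same arrays, same zero-means-missing convention):
	#   Data_stepScore_TH = np.where(Data_stepScore_TH == 0, Data_tagScore_TH, Data_stepScore_TH)
	#   Data_4D_TH[:, 2:4] = np.where(Data_4D_TH[:, 2:4] == 0, Data_4D_TH[:, 0:2], Data_4D_TH[:, 2:4])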

	##############################################################################
	# post-refill data check
	fig = plt.figure()

	ax = fig.add_subplot(3, 1, 1)
	ax.plot(Data_tagScore_TH[:, 0], Data_tagScore_TH[:, 1], 'w', markerfacecolor='blue', marker='.')
	ax.set_title('tagScore')
	ax.set_xticks(())
	ax.set_yticks(())

	ax = fig.add_subplot(3, 1, 2)
	ax.plot(Data_stepScore_TH[:, 0], Data_stepScore_TH[:, 1], 'w', markerfacecolor='blue', marker='.')
	ax.set_title('StepScore')
	ax.set_xticks(())
	ax.set_yticks(())

	ax = fig.add_subplot(3, 1, 3)
	ax.plot(Data_4D_TH[:, 2], Data_4D_TH[:, 3], 'w', markerfacecolor='blue', marker='.')
	ax.set_title('StepScore (refilled, from the 4D data)')
	ax.set_xticks(())
	ax.set_yticks(())

	plt.show()

	##############################################################################
	# K-means
	n_clusters = 8
	n_init = 500
	max_iter = 500
	# top 11 colors
	colors = ['firebrick', 'red', 'orange', 'yellow','tan', 'green', 'skyblue', 'blue', 'violet', 'magenta','black']

	##############################################################################
	# Compute tagScores with K-means
	k_means = KMeans(init='k-means++', n_clusters=n_clusters, n_init=n_init, max_iter=max_iter)
	k_means.fit(Data_tagScore_TH)
	TS_labels = k_means.labels_
	TS_cluster_centers = k_means.cluster_centers_
	TS_labels_unique = np.unique(TS_labels)

	##############################################################################
	# Compute StepScores with K-means
	k_means.fit(Data_stepScore_TH)

	SS_labels = k_means.labels_
	SS_cluster_centers = k_means.cluster_centers_
	SS_labels_unique = np.unique(SS_labels)

	##############################################################################
	# Compute in 4D with K-means
	k_means.fit(Data_4D_TH)

	full4D_labels = k_means.labels_
	full4D_cluster_centers = k_means.cluster_centers_
	full4D_labels_unique = np.unique(full4D_labels)
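
	# note: the same KMeans estimator is refit for each data set, and fit()
	# rebinds labels_ and cluster_centers_, which is why each fit's results
	# are captured in their own variables before the next fit.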

	##############################################################################
	# We want the same colors for corresponding clusters across the three fits,
	# so pair cluster centers by nearest neighbor: tagScore vs StepScore centers,
	# and tagScore vs the first two (tagScore) dimensions of the 4D centers.

	order_SS = pairwise_distances_argmin(TS_cluster_centers, SS_cluster_centers)
	order_4D = pairwise_distances_argmin(TS_cluster_centers, full4D_cluster_centers[:, 0:2])

	print "tagScore centers: ", TS_cluster_centers
	print "StepScore centers: ", SS_cluster_centers
	print "full 4D centers: ", full4D_cluster_centers

	##############################################################################
	# Plot result
	fig = plt.figure(figsize=(16, 16))
	fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
	# tagScore
	ax = fig.add_subplot(2, 3, 1)
	for k, col in zip(range(n_clusters), colors):
		my_members = TS_labels == k
		cluster_center = TS_cluster_centers[k]	
		ax.plot(Data_tagScore_TH[my_members, 0], Data_tagScore_TH[my_members, 1], 'w',
				markerfacecolor=col, marker='.')
		ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
				markeredgecolor='k', markersize=6)

	ax.set_title('tagScore')
	ax.set_xticks(())
	ax.set_yticks(())

	# StepScore
	ax = fig.add_subplot(2, 3, 2)
	for k, col in zip(range(n_clusters), colors):
		my_members = SS_labels == order_SS[k]
		cluster_center = SS_cluster_centers[order_SS[k]]
		ax.plot(Data_stepScore_TH[my_members, 0], Data_stepScore_TH[my_members, 1], 'w',
				markerfacecolor=col, marker='.')
		ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
				markeredgecolor='k', markersize=6)

	ax.set_title('StepScore')
	ax.set_xticks(())
	ax.set_yticks(())

	####################################################################
	# migrating blocks
	ax = fig.add_subplot(2, 3, 3)
	# migration
	for k, col in zip(range(n_clusters), colors):
		my_members = TS_labels == k
		ax.plot(Data_4D_TH[my_members, 2], Data_4D_TH[my_members, 3], 'w',
				markerfacecolor=col, marker='.')
	ax.set_title('Tag.S clusters migrating into the Step.S frame')
	ax.set_xticks(())
	ax.set_yticks(())

	####################################################################
	# 4D, TagScore
	ax = fig.add_subplot(2, 3, 4)
	for k, col in zip(range(n_clusters), colors):
		my_members = full4D_labels == order_4D[k]
		cluster_center = full4D_cluster_centers[order_4D[k]]
		ax.plot(Data_4D_TH[my_members, 0], Data_4D_TH[my_members, 1], 'w',
				markerfacecolor=col, marker='.')
		ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
				markeredgecolor='k', markersize=6)

	ax.set_title('full 4D, in TagScore')
	ax.set_xticks(())
	ax.set_yticks(())

	# 4D, StepScore
	ax = fig.add_subplot(2, 3, 5)
	for k, col in zip(range(n_clusters), colors):
		my_members = full4D_labels == order_4D[k]
		cluster_center = full4D_cluster_centers[order_4D[k]]
		ax.plot(Data_4D_TH[my_members, 2], Data_4D_TH[my_members, 3], 'w',
				markerfacecolor=col, marker='.')
		ax.plot(cluster_center[2], cluster_center[3], 'o', markerfacecolor=col,
				markeredgecolor='k', markersize=6)

	ax.set_title('full 4D, in StepScore')
	ax.set_xticks(())
	ax.set_yticks(())

	####################################################################
	# migrating blocks

	ax = fig.add_subplot(2, 3, 6)

	# start from an all-False mask (no label ever equals n_clusters + 1)
	different = (full4D_labels == n_clusters + 1)

	for k in range(n_clusters):
		different |= ((TS_labels == k) != (full4D_labels == order_4D[k]))

	identic = np.logical_not(different)
	ax.plot(Data_4D_TH[identic, 0], Data_4D_TH[identic, 1], 'w',
			markerfacecolor='#bbbbbb', marker='.')
	ax.plot(Data_4D_TH[different, 0], Data_4D_TH[different, 1], 'w',
			markerfacecolor='m', marker='.')

	ax.set_title('4D.S diff Tag.S in Tag.S frame')
	ax.set_xticks(())
	ax.set_yticks(())

	plt.savefig('../output/Gross_Kmeans_setNcluster_{}.png'.format(n_clusters))
	plt.show()
	####################################################################

	return '../output/Gross_Kmeans_setNcluster_{}.png'.format(n_clusters)
Beispiel #41
t0 = time.time()
mbk.fit(X)
t_mini_batch = time.time() - t0

# #############################################################################
# Plot result

fig = plt.figure(figsize=(8, 3))
fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
colors = ['#4EACC5', '#FF9C34', '#4E9A06']

# We want to have the same colors for the same cluster from the
# MiniBatchKMeans and the KMeans algorithm. Let's pair the cluster centers per
# closest one.
k_means_cluster_centers = np.sort(k_means.cluster_centers_, axis=0)
mbk_means_cluster_centers = np.sort(mbk.cluster_centers_, axis=0)
k_means_labels = pairwise_distances_argmin(X, k_means_cluster_centers)
mbk_means_labels = pairwise_distances_argmin(X, mbk_means_cluster_centers)
order = pairwise_distances_argmin(k_means_cluster_centers,
                                  mbk_means_cluster_centers)

# KMeans
ax = fig.add_subplot(1, 3, 1)
for k, col in zip(range(n_clusters), colors):
    my_members = k_means_labels == k
    cluster_center = k_means_cluster_centers[k]
    ax.plot(X[my_members, 0], X[my_members, 1], 'w',
            markerfacecolor=col, marker='.')
    ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
            markeredgecolor='k', markersize=6)
ax.set_title('KMeans')
ax.set_xticks(())
ax.set_yticks(())
    # predict class assignments from the two fitted Gaussians
    norm1 = multivariate_normal(mu1, sigma1)
    norm2 = multivariate_normal(mu2, sigma2)
    tau1 = norm1.pdf(data)
    tau2 = norm2.pdf(data)

    fig = plt.figure(figsize=(10, 5), facecolor='w')
    ax = fig.add_subplot(121, projection='3d')
    ax.scatter(data[:, 0], data[:, 1], data[:, 2], c='b', s=30, marker='o', edgecolors='k', depthshade=True)
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('Z')
    ax.set_title('Original data', fontsize=15)
    ax = fig.add_subplot(122, projection='3d')
    order = pairwise_distances_argmin([mu1_fact, mu2_fact], [mu1, mu2], metric='euclidean')
    print(order)
    if order[0] == 0:
        c1 = tau1 > tau2
    else:
        c1 = tau1 < tau2
    c2 = ~c1
    acc = np.mean(y == c1)
    print('Accuracy: %.2f%%' % (100*acc))
    ax.scatter(data[c1, 0], data[c1, 1], data[c1, 2], c='r', s=30, marker='o', edgecolors='k', depthshade=True)
    ax.scatter(data[c2, 0], data[c2, 1], data[c2, 2], c='g', s=30, marker='^', edgecolors='k', depthshade=True)
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('Z')
    ax.set_title('EM classification', fontsize=15)
    plt.suptitle('Implementation of the EM algorithm', fontsize=18)
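
The mean-matching step in the EM snippet above is worth isolating: pairwise_distances_argmin maps each ground-truth mean to the index of the nearest estimated mean, so class assignments can be flipped into the right order before computing accuracy. A small sketch with hypothetical means (mu_true and mu_est are illustrative names, not from the snippet):

import numpy as np
from sklearn.metrics import pairwise_distances_argmin

mu_true = np.array([[0.0, 0.0, 0.0], [5.0, 5.0, 5.0]])
mu_est = np.array([[4.9, 5.1, 5.0], [0.1, -0.1, 0.0]])  # components recovered in swapped order

# true mean i corresponds to estimated component order[i]
order = pairwise_distances_argmin(mu_true, mu_est, metric='euclidean')
print(order)  # [1 0]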
mbk_means_labels = mbk.labels_
mbk_means_cluster_centers = mbk.cluster_centers_
mbk_means_labels_unique = np.unique(mbk_means_labels)

##############################################################################
# Plot result

fig = plt.figure(figsize=(8, 3))
fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
colors = ['#4EACC5', '#FF9C34', '#4E9A06']

# We want to have the same colors for the same cluster from the
# MiniBatchKMeans and the KMeans algorithm. Let's pair the cluster centers per
# closest one.

order = pairwise_distances_argmin(k_means_cluster_centers,
                                  mbk_means_cluster_centers)

# KMeans
ax = fig.add_subplot(1, 3, 1)
for k, col in zip(range(n_clusters), colors):
    my_members = k_means_labels == k
    cluster_center = k_means_cluster_centers[k]
    ax.plot(X[my_members, 0], X[my_members, 1], 'w',
            markerfacecolor=col, marker='.')
    ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
            markeredgecolor='k', markersize=6)
ax.set_title('KMeans')
ax.set_xticks(())
ax.set_yticks(())
plt.text(-3.5, 1.8,  'train time: %.2fs\ninertia: %f' % (
    t_batch, k_means.inertia_))
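
Several snippets here remap predicted labels through such an order before scoring; the masks are collected first and applied afterwards so that already-rewritten labels are not matched again. A compact sketch of that remapping pattern (the arrays are illustrative):

import numpy as np

order = np.array([2, 0, 1])        # predicted component order[i] corresponds to true class i
y_hat = np.array([0, 2, 1, 2, 0])  # raw predicted labels

masks = [y_hat == order[i] for i in range(len(order))]
for i, mask in enumerate(masks):
    y_hat[mask] = i
print(y_hat)  # [1 0 2 0 1]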
    y = pd.Categorical(data[4]).codes

    n_components = 3
    feature_pairs = [[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]
    plt.figure(figsize=(8, 6), facecolor='w')
    for k, pair in enumerate(feature_pairs, start=1):
        x = x_prime[pair]
        m = np.array([np.mean(x[y == i], axis=0) for i in range(3)])  # actual per-class means
        print('Actual means = \n', m)

        gmm = GaussianMixture(n_components=n_components, covariance_type='full', random_state=0)
        gmm.fit(x)
        print('Predicted means = \n', gmm.means_)
        print('Predicted covariances = \n', gmm.covariances_)
        y_hat = gmm.predict(x)
        order = pairwise_distances_argmin(m, gmm.means_, axis=1, metric='euclidean')
        print('Order:\t', order)

        n_sample = y.size
        n_types = 3
        change = np.empty((n_types, n_sample), dtype=bool)
        for i in range(n_types):
            change[i] = y_hat == order[i]
        for i in range(n_types):
            y_hat[change[i]] = i
        acc = 'Accuracy: %.2f%%' % (100*np.mean(y_hat == y))
        print(acc)

        cm_light = mpl.colors.ListedColormap(['#FF8080', '#77E0A0', '#A0A0FF'])
        cm_dark = mpl.colors.ListedColormap(['r', 'g', '#6060FF'])
        x1_min, x2_min = x.min()