Example #1
def test_k_means_function():
    # test calling the k_means function directly
    # catch output
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        cluster_centers, labels, inertia = k_means(X, n_clusters=n_clusters,
                                                   verbose=True)
    finally:
        sys.stdout = old_stdout
    centers = cluster_centers
    assert_equal(centers.shape, (n_clusters, n_features))

    assert_equal(np.unique(labels).shape[0], n_clusters)

    # check that the label assignment is perfect (up to a permutation)
    assert_equal(v_measure_score(true_labels, labels), 1.0)
    assert_greater(inertia, 0.0)

    # check warning when centers are passed
    with warnings.catch_warnings(record=True) as w:
        k_means(X, n_clusters=n_clusters, init=centers)
        assert_equal(len(w), 1)

    # too many clusters desired
    assert_raises(ValueError, k_means, X, n_clusters=X.shape[0] + 1)
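The fixtures X, n_clusters, n_features and true_labels above live at module level in scikit-learn's test suite. For reference, a self-contained sketch of the same functional API on synthetic data (make_blobs stands in for the fixtures; names are illustrative):

import numpy as np
from sklearn.cluster import k_means
from sklearn.datasets import make_blobs

# three well-separated blobs; k_means returns (centers, labels, inertia)
X, true_labels = make_blobs(n_samples=300, centers=3, random_state=0)
centers, labels, inertia = k_means(X, n_clusters=3, random_state=0)
assert centers.shape == (3, X.shape[1])
assert np.unique(labels).shape[0] == 3
assert inertia > 0.0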
Example #2
def kmeans_analysis(G):
    block = nx.get_node_attributes(G,'block').values()  
    
    xA, xL = get_embedding(G,2)
    
    cA,kmA,_ = k_means(xA,2)
    cB,kmL,_ = k_means(xL,2)
    
#    plt.subplot(221); plt.scatter(xA[:,0],xA[:,1],c=block)
#    plt.subplot(222); plt.scatter(xA[:,0],xA[:,1],c=kmA)
#    plt.subplot(223); plt.scatter(xL[:,0],xL[:,1],c=block)
#    plt.subplot(224); plt.scatter(xL[:,0],xL[:,1],c=kmL)

    ax = plt.subplot(121); plt.scatter(xA[:,0],xA[:,1],c=block,marker='x')
    ax.set_aspect('equal','datalim')
    lim = plt.axis()
    a = cA[0,:]-cA[1,:]
    a = np.array([1, -a[0]/a[1]])
    b = np.mean(cA,axis=0)
    x = np.array([b+a,b-a])
    plt.plot(x[:,0],x[:,1],'k--',linewidth=1)
    plt.axis(lim)
    
    ax = plt.subplot(122); plt.scatter(xL[:,0],xL[:,1],c=block,marker='x')
    ax.set_aspect('equal','datalim')
    lim = plt.axis()
    a = cB[0,:]-cB[1,:]
    a = np.array([1, -a[0]/a[1]])
    b = np.mean(cB,axis=0)
    x = np.array([b+a,b-a])
    plt.plot(x[:,0],x[:,1],'k--',linewidth=1)
    plt.axis(lim)

    compare_results(block,kmA,kmL)
    
    _,kmA,_ = k_means(xA,5)
    _,kmL,_ = k_means(xL,5)
    
    print "ALL FIVE"
    num_diff = vn.num_diff_w_perms(block, kmA)
    ari = adjusted_rand_score(block,kmA)
    print "Adjacency: num error="+repr(num_diff)+" ari="+repr(ari)
    
    num_diff = vn.num_diff_w_perms(block, kmL)
    ari = adjusted_rand_score(block,kmL)
    print "Laplacian: num error="+repr(num_diff)+" ari="+repr(ari)
Example #3
def getCenteroidsByGapStats(dataToCluster, maxCluster):
    # plot data
    xminData = np.min(dataToCluster[:,0])
    xmaxData = np.max(dataToCluster[:,0])
    yminData = np.min(dataToCluster[:,1])
    ymaxData = np.max(dataToCluster[:,1])
    numOfClusterRuns = maxCluster - 1
    sumSqMetricSave = np.zeros((1, numOfClusterRuns))
    wksMetricSave = np.zeros((1, numOfClusterRuns))
    wkbsMetricSave = np.zeros((1, numOfClusterRuns))
    kMetricSave = np.zeros((1, numOfClusterRuns), dtype=np.int32)
    skMetricSave = np.zeros((1, numOfClusterRuns))
    centeroidSave = []
    labelSave = []
    for clusterRun in  xrange(1, maxCluster):
        centroids, labels, inertia = k_means(dataToCluster, n_clusters = clusterRun)
        centeroidSave.append(centroids)
        labelSave.append(labels)
        kMetricSave[0, clusterRun-1] = clusterRun
        # calculate gap statistics for selecting the number of clusters
        tempVar = calculateWk(centroids, labels, dataToCluster)
        sumSqMetricSave[0, clusterRun-1] = tempVar
        wksMetricSave[0, clusterRun-1] = np.log(tempVar)
        # ref data set 
        bRef = 10
        BWkbs = np.zeros((1, bRef))
        for iRun in xrange(bRef):
            refData = np.zeros_like(dataToCluster) 
            for dataRun in xrange(dataToCluster.shape[0]):
                refData[dataRun,:] = np.array([np.random.uniform(xminData ,xmaxData), np.random.uniform(yminData, ymaxData)])
            centroidsRef, labelsRef, inertiaRef = k_means(refData, n_clusters = clusterRun)
            BWkbs[0, iRun] = np.log(calculateWk(centroidsRef, labelsRef, refData))
        wkbsMetricSave[0, clusterRun-1] = np.sum(BWkbs)/float(bRef)
        skMetricSave[0, clusterRun-1] = np.sqrt(np.sum((BWkbs - wkbsMetricSave[0, clusterRun-1])**2)/float(bRef))
    skMetricSave = skMetricSave*np.sqrt(1 + 1/float(bRef))
    # gap statistics
    gap = (wkbsMetricSave - wksMetricSave)
    gap= gap.reshape(1, -1)
    finalMetric = np.zeros((1, numOfClusterRuns))
    for iRun in xrange(1, maxCluster-1):
        # gap(k) - (gap(k+1) - s(k+1))
        finalMetric[0, iRun-1] = gap[0, iRun-1] - (gap[0, iRun] - skMetricSave[0, iRun])
    indeNonZero = np.where(finalMetric > 0)[1]
    selectIndex = np.min(indeNonZero)
    # final clustering pics
    selectCenteroids =  np.array(centeroidSave[selectIndex])
    selectLabels = np.array(labelSave[selectIndex])
    return selectCenteroids
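calculateWk is not shown in this example. Under the gap-statistic definition of Tibshirani, Walther and Hastie it is the pooled within-cluster dispersion W_k, so a plausible sketch consistent with the calls above (a hypothetical helper, not the original) is:

import numpy as np

def calculateWk(centroids, labels, data):
    # assumed: pooled within-cluster sum of squared distances to each
    # cluster's centroid (Tibshirani's W_k, up to a normalization factor)
    wk = 0.0
    for k in range(centroids.shape[0]):
        members = data[labels == k]
        if members.size:
            wk += np.sum((members - centroids[k]) ** 2)
    return wk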
Example #4
def test_faith():
    """The 1st part of project3"""
    data_1 = xlrd.open_workbook(r'G:\pyproj\EE511Proj3\oldfaithful.xlsx')    # change this path to the file's location
    table = data_1.sheet_by_name(u'Sheet1')
    x = table.col_values(0)
    y = table.col_values(1)
    c = []
    for i in range(0,len(x)):
        c.append(table.row_values(i))
    print(c)
    [centroid,label,inertia] = cluster.k_means(c,2)
    print(centroid)
    print(label)
    print(inertia)
    for a in range(0,len(label)):
        plt.scatter(x[a],y[a],c = 'b')
    plt.xlabel('eruptions')
    plt.ylabel('waiting')
    plt.title('Raw Data')
    plt.show()
    for j in range(0,len(label)):
        if label[j] == 1:
            plt.scatter(x[j],y[j],marker = '.',c = 'r')
        elif label[j] == 0:
            plt.scatter(x[j],y[j],marker = '*',c = 'b')
    plt.xlabel('eruptions')
    plt.ylabel('waiting')
    plt.title('Clustering of Data')
    plt.show()
    plt.clf()
Example #5
def check_cluster(cluster):
    n = len(cluster)
    if n < 2:
        return True, []

    # Run k_means on two centers
    children, labels, _ = k_means(cluster, 2)

    # Let v = c1 - c2 be a d-dimensional vector that connects the two centers. This is the direction that k-means
    # believes to be important for clustering.
    v = children[1]-children[0]

    # Then project X onto v: x'_i = <x_i, v> / ||v||^2. X' is a 1-dimensional
    # representation of the data projected onto v.
    x_prime = [np.dot(point, v) for point in cluster]

    # Transform X' so that it has mean 0 and variance 1.
    x_prime = zscore(x_prime)

    # Let z_i = F(x'_(i)). If A2*(Z) is in the range of non-critical values at
    # confidence level alpha, then accept H0, keep the original center, and
    # discard {c1, c2}. Otherwise, reject H0 and keep {c1, c2} in place of the
    # original center.
    a2, critical, sig = anderson(x_prime)
    a2 *= (1+4.0/n-25.0/(n**2))

    return a2 < critical[0], children
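check_cluster appears to implement the split test of G-means (Hamerly & Elkan): project onto the line between the two k-means children, then Anderson-Darling-test the projection for normality. A hypothetical recursive driver showing how it could be used (the g_means name and the reassignment rule are illustrative, not from the original project):

import numpy as np

def g_means(data, max_depth=10):
    # keep one center if the projected points look Gaussian along the
    # split direction; otherwise recurse on the two k-means children
    if len(data) == 0:
        return []
    accept, children = check_cluster(data)
    if accept or max_depth == 0:
        return [np.mean(data, axis=0)]
    d0 = np.linalg.norm(data - children[0], axis=1)
    d1 = np.linalg.norm(data - children[1], axis=1)
    return (g_means(data[d0 <= d1], max_depth - 1) +
            g_means(data[d0 > d1], max_depth - 1))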
Example #6
def k_means_classifier(image):
        n_clusters = 8

        # blur and take local maxima
        blur_image = gaussian(image, sigma=8)
        blur_image = ndi.maximum_filter(blur_image, size=3)

        # get texture features
        feats = local_binary_pattern(blur_image, P=40, R=5, method="uniform")
        feats_r = feats.reshape(-1, 1)

        # cluster the texture features; MiniBatchKMeans matches the
        # batch_size/.fit usage below (sklearn's k_means function does not)
        km = MiniBatchKMeans(n_clusters=n_clusters, batch_size=500)
        clus = km.fit(feats_r)

        # copy relevant attributes
        labels = clus.labels_
        clusters = clus.cluster_centers_

        # reshape label arrays
        labels = labels.reshape(blur_image.shape[0], blur_image.shape[1])

        # segment shadow
        img = blur_image.ravel()
        shadow_seg = img.copy()
        for i in range(0, n_clusters):
            # set up array of pixel indices matching cluster
            mask = np.nonzero(labels.ravel() == i)[0]
            if len(mask) > 0:
                thresh = threshold_otsu(img[mask])
                shadow_seg[mask] = shadow_seg[mask] < thresh
        shadow_seg = shadow_seg.reshape(*image.shape)

        return shadow_seg
Example #7
def cpie_mech_dy(turn):
    km = k_means(np.array([turn]).T, 2, random_state=1234)[0].flatten()
    km = np.sort(np.append(km, km.mean()))
    mech = np.zeros_like(turn)
    mech[((turn > km[0]) & (turn <= km[1]))] = 1
    mech[(turn > km[2])] = 1
    return mech
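This binarizes a 1-D series: the values are clustered into two levels, and the thresholds are the sorted centers plus their midpoint. A quick usage sketch on synthetic bimodal data (illustrative only):

import numpy as np

rng = np.random.RandomState(0)
turn = np.concatenate([rng.normal(0.0, 0.05, 200),
                       rng.normal(1.0, 0.05, 200)])
mech = cpie_mech_dy(turn)  # 0/1 array of the same length as turn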
Example #8
def feature_clust(f_pool, f_train, n_clust, method='unsupervised-spectral'):
    N_pool = len(f_pool)
    data_f_pool = list(f_pool)
    data_f_train = list(f_train)

    data_f_pool.extend(data_f_train)

    if method == 'unsupervised-ds-svm':
        labels = ds_svm_clustering(data_f_pool,
                                   n_clust=n_clust,
                                   eta=4,
                                   ds_ratio=0.25,
                                   plot=False,
                                   metric='euclidean')
    elif method == 'unsupervised-spectral':
        spectral = cl.SpectralClustering(n_clusters=n_clust,
                                         eigen_solver='arpack',
                                         affinity="nearest_neighbors",
                                         n_jobs=6)
        spectral.fit(data_f_pool)
        labels = spectral.labels_
    elif method == 'unsupervised-kmeans':
        clusters = cl.k_means(data_f_pool, n_clust)  #Kmeans Clustering
        labels = clusters[1]
    elif method == 'unsupervised-kmedoids':
        _, labels = k_medoids_selection(data_f_pool, n_clust)
    else:
        raise ValueError('Invalid clustering method!')

    clust_pool = labels[0:N_pool]
    clust_train = labels[N_pool:]

    return clust_pool, clust_train
Example #9
def regularized_spectral_clustering(adj_matrix, tau, n_clusters, algo='scan'):
    """
    :param adj_matrix: adjacency matrix of the graph, where [m][n] > 0 iff there is an edge, holding its weight
    :param tau: regularization constant for the Laplacian
    :param n_clusters: number of clusters to partition into
    :param algo: the cluster-separation algorithm, either 'kmeans++' or 'scan'
    :return: labels, number of clustering iterations needed, size of the smallest cluster found, execution time
    """
    start = timer()
    regularized_laplacian = regularized_laplacian_matrix(adj_matrix, tau)
    eigen_values, eigen_vectors = eigen_solver(regularized_laplacian, n_clusters=n_clusters)
    if algo == 'kmeans++':
        _, labels, _, num_iterations = k_means(eigen_vectors,
                                               n_clusters=n_clusters,
                                               return_n_iter=True)
    else:
        if n_clusters == 2:  # cluster based on sign
            second_eigen_vector_index = np.argsort(eigen_values)[1]
            second_eigen_vector = eigen_vectors.T[second_eigen_vector_index]
            labels = [0 if val <= 0 else 1 for val in second_eigen_vector]  # use only the second eigenvector
            num_iterations = 1
        else:  # bisecting it into k-ways, use all eigenvectors
            labels = discretize(eigen_vectors)
            num_iterations = 20  # assume the worst case of 20 restarts
    end = timer()
    execution_time = end - start
    smallest_cluster_size = min(np.sum(labels), abs(np.sum(labels) - len(labels)))
    return labels, num_iterations, smallest_cluster_size, execution_time
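regularized_laplacian_matrix is defined elsewhere in that project. In regularized spectral clustering it is usually the normalized Laplacian built from tau-regularized degrees, L_tau = I - D_tau^{-1/2} A D_tau^{-1/2} with D_tau = D + tau*I; a dense-matrix sketch under that assumption (sparse handling omitted):

import numpy as np

def regularized_laplacian_matrix(adj_matrix, tau):
    # assumed definition: normalized Laplacian with degrees regularized by tau
    A = np.asarray(adj_matrix, dtype=float)
    d_inv_sqrt = 1.0 / np.sqrt(A.sum(axis=1) + tau)
    return np.eye(A.shape[0]) - d_inv_sqrt[:, None] * A * d_inv_sqrt[None, :]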
Example #10
def test_k_means_function():
    # test calling the k_means function directly
    # catch output
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        cluster_centers, labels, inertia = k_means(X,
                                                   n_clusters=n_clusters,
                                                   sample_weight=None,
                                                   verbose=True)
    finally:
        sys.stdout = old_stdout
    centers = cluster_centers
    assert centers.shape == (n_clusters, n_features)

    assert np.unique(labels).shape[0] == n_clusters

    # check that the label assignment is perfect (up to a permutation)
    assert v_measure_score(true_labels, labels) == 1.0
    assert inertia > 0.0

    # check warning when centers are passed
    assert_warns(RuntimeWarning,
                 k_means,
                 X,
                 n_clusters=n_clusters,
                 sample_weight=None,
                 init=centers)
Example #11
def spectral_clustering(road_map=None, a=None, use_ncut=False, num_clusters=2):
    print("Building adjacency matrix")
    if a is None:
        a = build_adjacency_matrix(road_map)
    
    print("Computing laplacian")
    l = build_laplacian(a, normalize=use_ncut)
    
    print("Spectral embedding")
    #e_vals, e_vects = eigsh(l, k=num_clusters, which='SM', tol=0.01, sigma=2.01)
    X = np.random.rand(l.shape[0], num_clusters+1)
    e_vals, e_vects = lobpcg(l, X, tol=1e-15,
                                            largest=False, maxiter=2000)

    embedded_data = e_vects[:,1:]

    print e_vals

    print("Clustering")
    centroid, label, inertia = k_means(embedded_data, num_clusters)
    
    for i in xrange(len(label)):
        road_map.nodes[i].region_id = label[i]
Example #12
def clusters(n = 100):
    from sklearn.cluster import k_means
    import numpy as np
    import matplotlib.pyplot as plt
    from mpl_toolkits.basemap import Basemap
    from matplotlib.patches import Polygon
    from matplotlib.collections import PatchCollection
    from matplotlib.patches import PathPatch
    from mapFuncts import USbasemap
    from matplotlib import cm, colors, colorbar
    fig = plt.figure()
    ax = fig.add_subplot(111)
    fig, ax, m = USbasemap(fig = fig, ax = ax)
    alldivDF, _, _, _, _ = seasonal_setup(season = 'MAM')
    centroid, label, inertia = k_means(alldivDF.T, n_clusters = n, init = 'k-means++')
    names = alldivDF.columns
    codes = reverseStates(importStates())
    cmap = cm.Accent
    norm = colors.Normalize(vmin = 0, vmax = n, clip = True)
    mapper = cm.ScalarMappable(norm = norm, cmap = cmap)
    for cluster in range(n):
        idx = label==cluster
        patches = []
        color = mapper.to_rgba(cluster)
        for name in names[idx]:
            divcode = codes[name[:-3]]+name[-2:]
            print name, divcode
            for info, shape in zip(m.divs_info, m.divs):
                if info['CLIMDIV']==int(divcode):
                    patches.append(Polygon(np.array(shape),True))
        ax.add_collection(PatchCollection(patches, facecolor = color,  edgecolor='k', linewidths=1., zorder=2))
    return fig, ax
Example #13
    def fit(
        self,
        data: List[Iterator[float]],
        find_n: bool = False
    ) -> Dict[str, Union[List[int], Union[float, None]]]:
        """Cluster the input data into n clusters.

        Args:
            data: A list of vectors.
            find_n: If True, don't use self.n_cluster but find n using
                elbow analysis instead
        Return:
            A list of integers as class labels. The order of the list
            corresponds to the order of the input data.
        """
        if find_n:
            self.n_clusters = 5  # self._get_n()
        if self.clus_type == 'kmeans':
            # the KMeans estimator; sklearn's k_means function has no .fit()
            self.cluster = KMeans(n_clusters=self.n_clusters)
        elif self.clus_type == 'sphericalkmeans':
            self.cluster = SphericalKMeans(n_clusters=self.n_clusters)
        elif self.clus_type == 'agglomerative':
            self.cluster = AgglomerativeClustering(n_clusters=self.n_clusters,
                                                   affinity=self.affinity,
                                                   linkage=self.linkage)

        self.cluster.fit(data)
        self._calc_density()

        return {'labels': self.cluster.labels_, 'density': self.compactness}
Example #14
def sweep_clusters(args):
    data = joblib.load(args.model_file)
    projected = data[PROJECTION_KEY]

    print "Model type", data[MODEL_TYPE_KEY]

    if not os.path.exists(args.figures_dir):
        os.makedirs(args.figures_dir)

    inertia_values = []
    for k in args.n_clusters:
        print "Clustering with %s states" % k
        _, _, inertia = k_means(projected[:, args.dimensions],
                                k,
                                n_jobs=-2)
        inertia_values.append(inertia)

    plt.plot(args.n_clusters,
             inertia_values,
             "k.-")
    plt.xlabel("Number of Clusters", fontsize=16)
    plt.ylabel("Inertia", fontsize=16)

    fig_flname = os.path.join(args.figures_dir,
                              "cluster_inertia")
    for dim in args.dimensions:
        fig_flname += "_%s" % dim
    fig_flname += ".png"

    plt.savefig(fig_flname,
                dpi=300)
Example #15
def spectral_clustering(affinity,
                        n_clusters=8,
                        n_components=None,
                        eigen_solver=None,
                        random_state=None,
                        n_init=10,
                        eigen_tol=0.0,
                        assign_labels='kmeans'):

    if assign_labels not in ('kmeans', 'discretize',
                             'AgglomerativeClustering'):
        raise ValueError("The 'assign_labels' parameter should be "
                         "'kmeans', 'discretize' or 'AgglomerativeClustering',"
                         " but '%s' was given" % assign_labels)

    random_state = check_random_state(random_state)
    n_components = n_clusters if n_components is None else n_components
    maps = spectral_embedding(affinity,
                              n_components=n_components,
                              eigen_solver=eigen_solver,
                              random_state=random_state,
                              eigen_tol=eigen_tol,
                              drop_first=False)
    if assign_labels == 'kmeans':
        _, labels, _ = k_means(maps, n_clusters)
    else:
        labels = discretize(maps, random_state=random_state)
    return labels
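A usage sketch for this variant, building a dense affinity matrix with an RBF kernel (the snippet itself assumes spectral_embedding, discretize and check_random_state are imported from scikit-learn):

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.metrics.pairwise import rbf_kernel

X, _ = make_blobs(n_samples=200, centers=3, random_state=0)
affinity = rbf_kernel(X, gamma=1.0)
labels = spectral_clustering(affinity, n_clusters=3, random_state=0)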
Example #16
def clust(X,vectorfile,clust_file,clust_number):

    fid2fname = {}
    for line in open(vectorfile) :
        line = line.strip().split('\t')
        fid2fname.setdefault(int(line[0]), line[1:])

    # Force the solver to be arpack, since amg is numerically
    # unstable on this example
    # labels = spectral_clustering(graph, n_clusters=160, eigen_solver='arpack')

    a, labels,inertia = cluster.k_means(X,n_clusters=clust_number)

    print ("inertia=",inertia)
    C=np.column_stack((X,labels))

    easy_data=open(clust_file, 'w')
    for pnt1 in C :
        strx=""
        for charest in pnt1:
            strx+=str(int(charest))+"\t"
        print >> easy_data, strx
    easy_data.close()

    return inertia
Example #17
def kmeans(data, depth):
    original = data
    data = data[data.PRES < depth]
    if data[data.DATA_MODE != "D"].shape[0] > 0 and data[data.DATA_MODE ==
                                                         "D"].shape[0] > 0:
        # Get mid-range of DMQC and RTQC data
        rm_d = (data[data.DATA_MODE == "D"].PSAL.max() +
                data[data.DATA_MODE == "D"].PSAL.min()) / 2
        rm_r = (data[data.DATA_MODE != "D"].PSAL.max() +
                data[data.DATA_MODE != "D"].PSAL.min()) / 2
        # K means algorithm
        init = array([[rm_d], [rm_r]])
        centroid, labels, loss = k_means(data[["PSAL"]],
                                         2,
                                         init=init,
                                         n_init=1)
    else:
        return (False, original, [], [])

    data.insert(data.shape[1], "LABEL", labels)
    # Get data from the second group
    grouped = data[data.LABEL == 1].groupby(["PLATFORM_NUMBER", "CYCLE_NUMBER"
                                             ]).size().reset_index()
    platform_numbers = grouped.PLATFORM_NUMBER.unique().tolist()
    temp_data = original[
        (original.PLATFORM_NUMBER.isin(grouped.PLATFORM_NUMBER))
        & (original.CYCLE_NUMBER.isin(grouped.CYCLE_NUMBER))]
    # If the second group has DMQC data, the flag is False
    flag = False if temp_data[temp_data.DATA_MODE ==
                              "D"].shape[0] > 0 else True
    # Get data from the first group
    temp_data = original[~(
        (original.PLATFORM_NUMBER.isin(grouped.PLATFORM_NUMBER)) &
        (original.CYCLE_NUMBER.isin(grouped.CYCLE_NUMBER)))]
    return (flag, original, temp_data, platform_numbers)
Example #18
    def kmeans_cluster(self, data, n_clusters, maximum):
        """
		Calculate K-means and plot
		"""
        # Make clusters
        from sklearn.cluster import k_means
        import numpy as np
        init = np.array([[1.25, 0.28], [4.29, 0.71], [0.99, 1.15],
                         [6.5, 1.011]])
        centroids, labels, sse = k_means(data,
                                         n_clusters=n_clusters,
                                         init=init,
                                         n_init=100)
        # Plot
        from matplotlib import pyplot as plt
        plt.figure(figsize=(10, 7))
        # Define color map
        #cmap = 'jet'
        cmap = self.color_map('kmeans')
        # De-normalize before plot
        plt.scatter(data[:, 0] * maximum, data[:, 1], c=labels, cmap=cmap)
        plt.title('Clusters - k-means')
        plt.xlabel('var0')
        plt.ylabel('var1')
        plt.savefig('clusters_kmeans.png')
        plt.close()
        return ()
Example #19
def draw_kmeans(item):

    print 'dataset:', items[item]
    train_data = tools.data_pre(item)
    [centroid, label, inertia] = cluster.k_means(train_data, cluster_k)

    #sava data into the csv files
    root_path = root_dir + 'data\kmeans' + os.sep + items[item] + os.sep
    print 'result path:', root_path
    if not os.path.isdir(root_path):
        os.makedirs(root_path)
        time.sleep(2)

    label_path = root_path + items[item] + '_label.csv'
    count_path = root_path + items[item] + '_count.csv'
    cent_path = root_path + items[item] + '_cent.csv'

    np.savetxt(label_path, label, delimiter=',')
    pd.value_counts(label).T.to_csv(count_path)  # value_counts() tallies the counts of non-null values
    df = pd.DataFrame(centroid)  # T transposes; to_csv() writes the data to a csv file
    df.to_csv(cent_path, float_format='%.5f')

    index = 1
    for i in centroid:
        plt.figure(index)
        title_name = items[item] + '_kmeans_' + str(index)  # chart title
        plt.title(title_name)
        pd.Series(i).plot()  # plot the centroid curve
        path = root_dir + '\data\kmeans\img' + os.sep + items[item] + '_kmeans_' + str(
            index) + '.png'
        plt.savefig(path)  # save the figure
        index += 1
Example #20
def sklearn_spectral_clustering(adj_matrix, n_clusters):
    """
    :param adj_matrix: adjacency matrix of the graph, where [m][n] > 0 iff there is an edge, holding its weight
    :param n_clusters: number of clusters to partition into
    :return: labels, number of clustering iterations needed, size of the smallest cluster found, execution time
    """
    start = timer()
    connectivity = kneighbors_graph(adj_matrix, n_neighbors=10,
                                    include_self=True)
    affinity_matrix_ = 0.5 * (connectivity + connectivity.T)

    eigen_vectors = spectral_embedding(affinity_matrix_,
                                       n_components=n_clusters,
                                       eigen_solver='arpack',
                                       eigen_tol=0.0,
                                       norm_laplacian=True,
                                       drop_first=False)

    _, labels, _, num_iterations = k_means(eigen_vectors,
                                           n_clusters=n_clusters,
                                           return_n_iter=True)

    end = timer()
    execution_time = end - start
    smallest_cluster_size = min(np.sum(labels), abs(np.sum(labels) - labels.size))
    return labels, num_iterations, smallest_cluster_size, execution_time
Example #21
def fun(train, test):
    granular_balls = GBList(train, train)  # Build granular balls
    granular_balls.init_granular_balls()  # Initialize granular balls, divide granular balls according to purity
    ball_list = granular_balls.granular_balls
    Ball_list = funtion(ball_list)  # Continue to divide the granular balls with overlapping boundaries
    while True:
        init_center = []
        Ball_num1 = len(Ball_list)  # Count the number of granular balls
        for i in range(len(Ball_list)):
            init_center.append(Ball_list[i].center)

        ClusterLists = k_means(X=train[:, 1:-1], init=np.array(init_center), n_clusters=len(Ball_list))
        data_label = ClusterLists[1]
        ball_list = []
        for i in set(data_label):
            Cluster_data = train[data_label == i, :]
            ball_list.append(GranularBall(Cluster_data))
        Ball_list = funtion(ball_list)
        Ball_num2 = len(Ball_list)  # Number of granular balls after statistical division
        if Ball_num1 == Ball_num2:  # Stop if the number of granular balls no longer changes
            break
    # plot_gb(Ball_list)  # Visualize two-dimensional granular balls (using data.csv data set)
    ball_num = len(Ball_list)  # Number of granular balls generated
    time1 = time.time()
    count_num = nearest_knn(Ball_list, test)  # Test nearest neighbor accuracy
    time2 = time.time()
    test_time = time2 - time1  # Statistical classification time

    return count_num, ball_num, test_time
Example #22
def testGeneratedData():
    X, y = make_blobs(n_samples=1000, centers=np.array([[-2.5, 0], [2.5, 0], [0, 2.5], [0, -2.5]]))

    n_clusters = np.unique(y).shape[0]
    n_instances = X.shape[0]

    # Delete some seeds:
    yMod = np.copy(y)
    for i in range(n_instances):
        if np.random.rand() > 0.5:
            yMod[i] = -1 # Means no class

    sm = ConstrainedKMeans(n_clusters=n_clusters, max_iter=2000, verbose=1)
    sm.fit(X, yMod)
    predictedLabels = sm.predict(X)
    trueLabels = y
    adjustedRandScore = metrics.adjusted_rand_score(trueLabels, predictedLabels)
    print('Constrained KMeans adjusted rand score: %s' % (adjustedRandScore))

    (centers, predictedLabels, inertia, best_n_iter) = k_means(X, n_clusters, n_init=1, return_n_iter=True)
    print('Number of iterations: %s' % (best_n_iter))
    # Find the correct predictions, the permutations that maximizes the accuracy:
    # predictedLabels = kmeans.labels_
    trueLabels = y
    
    adjustedRandScore = metrics.adjusted_rand_score(trueLabels, predictedLabels)
    print('KMeans adjusted rand score: %s' % (adjustedRandScore))

    tColour = tuple([0, 1, 0])
    plt.scatter(X[y == predictedLabels, 0], X[y == predictedLabels, 1], c=tColour, alpha=0.5)
    tColour = tuple([1, 0, 0])
    plt.scatter(X[y != predictedLabels, 0], X[y != predictedLabels, 1], c=tColour, alpha=0.5)
    plt.show()
Example #23
def KMeansCluster(matrix):
    """
    Performs the K-Means cluster given a matrix of data
    @param[in]: matrix, List of List(s)
    """

    # Possibly need to scale the data first
    data = scale(matrix)

    # Approximate the number of clusters using c = root(n/2)
    # num_clusters = int(sqrt(len(matrix) / 2))
    num_clusters = 5
    number_init = 10 # Default
    number_iter = 300
    num_cpus = 2

    print "==================="
    print "Training KMeans with (num_clusters, num_init, num_iters, num_cpus)"
    print num_clusters, number_init, number_iter, num_cpus

    # estimator = KMeans(init='k-means++', n_clusters = num_clusters, n_init = number_init)
    # estimator.fit(data)
    # clusters = k_means(data, n_clusters = num_clusters, max_iter=number_iter, n_init = number_iter, 
    #     init='k-means++', n_jobs = num_cpus)
    clusters = k_means(data, n_clusters = num_clusters, max_iter=number_iter, n_init = number_init, n_jobs = num_cpus)


    return clusters
Example #24
def kmeans_estimate(x):
    labels = k_means(x, 2)[1]
    # print labels
    cluster = [[], []]
    for i in xrange(len(labels)):
        cluster[labels[i]].append(x[i])
    cluster[0] = np.array(cluster[0])
    cluster[1] = np.array(cluster[1])
    pi = len(cluster[0]) * 1.0 / len(labels)
    pis = [pi, 1 - pi]
    means = []
    sigmas = []
    for i in xrange(2):
        curr = cluster[i]
        mean = np.average(curr, axis=0)
        means.append(mean)

        sigma = np.zeros((2, 2))
        for j in xrange(len(curr)):
            y = np.reshape(curr[j] - mean, (2, 1))
            sigma += np.dot(y, y.T)
        sigma /= len(curr)
        sigmas.append(sigma)

    pis = np.array(pis)
    means = np.array(means)
    sigmas = np.array(sigmas)
    print means
    return pis, means, sigmas
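kmeans_estimate computes a standard k-means initialization for a two-component 2-D Gaussian mixture: mixing weights, component means and covariances, typically fed into EM. A usage sketch on synthetic data (note the function above is Python 2 code, using xrange and print statements):

import numpy as np

rng = np.random.RandomState(0)
x = np.vstack([rng.multivariate_normal([0, 0], np.eye(2), 150),
               rng.multivariate_normal([3, 3], np.eye(2), 150)])
pis, means, sigmas = kmeans_estimate(x)  # shapes (2,), (2, 2), (2, 2, 2)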
Example #25
def clusterValidity(X, y):
    # Maximum number of clusters:
    K = 10

    # Allocate variables:
    Rand = np.zeros((K, ))
    Jaccard = np.zeros((K, ))
    NMI = np.zeros((K, ))

    for k in range(K):
        # run K-means clustering:
        #cls = Pycluster.kcluster(X,k+1)[0]
        centroids, cls, inertia = k_means(X, k + 1)
        # compute cluster validities:
        Rand[k], Jaccard[k], NMI[k] = clusterval(y, cls)

    # Plot results:

    figure(1)
    title('Cluster validity')
    plot(np.arange(K) + 1, Rand)
    plot(np.arange(K) + 1, Jaccard)
    plot(np.arange(K) + 1, NMI)
    ylim(-2, 1.1)
    legend(['Rand', 'Jaccard', 'NMI'], loc=4)
    show()
Example #26
def main():
    import matplotlib.pyplot as plt
    from sklearn.datasets.samples_generator import make_blobs
    n_centers = 3
    X, y = make_blobs(n_samples=1000, centers=n_centers, n_features=2,
                    cluster_std=0.7, random_state=0)

    # Run this K-Means
    import kmeans
    t0 = time.time()
    y_pred, centers, obj_val_seq = kmeans.kmeans(X, n_centers)
    t1 = time.time()
    print("Final obj val: {}".format(obj_val_seq[-1]))
    print("Time taken (this implementation): {}".format(t1 - t0))

    # Run scikit-learn's K-Means
    from sklearn.cluster import k_means
    t0 = time.time()
    centers, y_pred, obj_val = k_means(X, n_centers, random_state=0)
    t1 = time.time()
    print("Final obj val: {}".format(obj_val))
    print("Time taken (Scikit, 1 job): {}".format(t1 - t0))

    # Plot change in objective value over iteration
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(obj_val_seq, 'b-', marker='*')
    fig.suptitle("Change in K-means objective value across iterations")
    ax.set_xlabel("Iteration")
    ax.set_ylabel("Objective value")
    fig.show()

    # Plot data
    from itertools import cycle
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    fig = plt.figure(figsize=plt.figaspect(0.5))  # Make twice as wide to accommodate both plots
    ax = fig.add_subplot(121)
    ax.set_title("Data with true labels and final centers")
    for k, color in zip(range(n_centers), colors):
        ax.plot(X[y==k, 0], X[y==k, 1], color + '.')

    initial_centers = kmeans.init_centers(X, n_centers, 2) # This is valid because we always use the same random seed.
    # Plot initial centers
    for x in initial_centers:
        ax.plot(x[0], x[1], "mo", markeredgecolor="k", markersize=8)

    # Plot final centers
    for x in centers:
        ax.plot(x[0], x[1], "co", markeredgecolor="k", markersize=8)

    # Plot assignments
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    ax = fig.add_subplot(122)
    ax.set_title("Data with final assignments")
    for k, color in zip(range(n_centers), colors):
        ax.plot(X[y_pred==k, 0], X[y_pred==k, 1], color + '.')

    fig.tight_layout()
    fig.gca()
    fig.show()
Example #27
    def DrawSsePlot(self, title, data, random_state):
        chart_folder = self.dir_folder + title + ".png"
        chart_folder_re = self.upload_folder + title + ".png"
        sse = []  # elbow method: SSE per k
        lunkuo = []  # silhouette scores per k
        start, end = 3, 15
        for i in range(start, end):
            km = k_means(data, n_clusters=i, random_state=random_state)
            sse.append(km[2])
            lunkuo.append(silhouette_score(data, km[1], metric='euclidean'))
        fig, ax1 = plt.subplots(figsize=(10, 7))
        ax2 = ax1.twinx()
        lns1 = ax1.plot(range(start, end), sse, 'o-', c='g', label='zhou-bu')
        lns2 = ax2.plot(range(start, end),
                        lunkuo,
                        'o-',
                        c='r',
                        label='lun-kuo')
        new_ticks = np.linspace(start, end, end - start + 1)
        plt.xticks(new_ticks)
        lns = lns1 + lns2
        labs = [l.get_label() for l in lns]
        ax1.legend(lns, labs, loc=0)
        ax1.set_xlabel('K')
        ax1.set_ylabel('SSE')
        ax2.set_ylabel('LUN-KUO-INDEX')

        plt.savefig(chart_folder)
        plt.savefig(chart_folder_re)
        plt.cla()
        plt.clf()
        return chart_folder
Example #28
def test_k_means_function():
    # test calling the k_means function directly
    # catch output
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        cluster_centers, labels, inertia = k_means(X, n_clusters=n_clusters,
                                                   sample_weight=None,
                                                   verbose=True)
    finally:
        sys.stdout = old_stdout
    centers = cluster_centers
    assert_equal(centers.shape, (n_clusters, n_features))

    assert_equal(np.unique(labels).shape[0], n_clusters)

    # check that the label assignment is perfect (up to a permutation)
    assert_equal(v_measure_score(true_labels, labels), 1.0)
    assert_greater(inertia, 0.0)

    # check warning when centers are passed
    assert_warns(RuntimeWarning, k_means, X, n_clusters=n_clusters,
                 sample_weight=None, init=centers)

    # too many clusters desired
    assert_raises(ValueError, k_means, X, n_clusters=X.shape[0] + 1,
                  sample_weight=None)

    # kmeans for algorithm='elkan' raises TypeError on sparse matrix
    assert_raise_message(TypeError, "algorithm='elkan' not supported for "
                         "sparse input X", k_means, X=X_csr, n_clusters=2,
                         sample_weight=None, algorithm="elkan")
Example #29
def load_semantic_map():
    target_mask_img = img_to_array(
        load_img(target_mask_path, target_size=(img_nrows, img_ncols)))
    style_mask_img = img_to_array(
        load_img(style_mask_path, target_size=(img_nrows, img_ncols)))

    if K.image_dim_ordering() == 'th':
        mask_vecs = np.vstack([
            style_mask_img.reshape((3, -1)).T,
            target_mask_img.reshape((3, -1)).T
        ])
    else:
        mask_vecs = np.vstack([
            style_mask_img.reshape((-1, 3)),
            target_mask_img.reshape((-1, 3))
        ])

    #_, labels = vq.kmeans2(mask_vecs, nb_labels, missing='raise')
    _, labels, _ = k_means(mask_vecs.astype("float64"), nb_labels)

    style_mask_label = labels[:img_nrows * img_ncols].reshape(
        (img_nrows, img_ncols))
    target_mask_label = labels[img_nrows * img_ncols:].reshape(
        (img_nrows, img_ncols))
    style_mask = np.stack([style_mask_label == r for r in range(nb_labels)],
                          axis=0)
    target_mask = np.stack([target_mask_label == r for r in range(nb_labels)],
                           axis=0)
    return np.expand_dims(style_mask, axis=0), np.expand_dims(target_mask,
                                                              axis=0)
Example #30
    def generate_cluster(self):
        for i in range(self.size):
            cod = self.get_coordinate()
            self.G.add_node(i, pos=cod)

        for i in range(len(self.ls)):
            self.dic[i] = []
            for j in range(len(self.ls)):
                val = self.get_dist(self.ls[i], self.ls[j])
                self.dic[i].append(round(val))

        self.cluster = DBSCAN(eps=4, min_samples=1).fit_predict(np.array(self.ls))
        n_clus = max(self.cluster) + 1
        temp = k_means(np.array(self.ls), n_clusters=n_clus)

        self.centroid = temp[0]
        self.cluster = temp[1]

        for i in range(len(self.centroid)):
            for j in range(2):
                self.centroid[i][j] = int(round(self.centroid[i][j]))

        for i in range(max(self.cluster) + 1):
            self.final[i] = []
            for j in range(len(self.cluster)):
                if (i == self.cluster[j]):
                    self.final[i].append(j)

        for i in range(len(self.final)):
            self.cluster_head.append(self.final[i][self.getClusterHead(self.final[i], self.centroid[i])])

        return self.G
Example #31
    def getNPointsInRegion(self,
                           NPoints,
                           n=89,
                           s=-60,
                           e=180,
                           w=-180,
                           fidelity=1):
        '''

        :param NPoints: Number of coordinates you want to find
        :param n: Northern Latitude
        :param s: Southern Latitude
        :param e: Eastern Longitude
        :param w: Western Longitude
        :param fidelity: granularity of the coordinate grid returned (smaller means more points)
        :return: Pandas DataFrame of the points found to be land,
                Pandas DataFrame the equidistant coordinates within this region
        '''
        globe_m = self.getPointsBetween(n, s, e, w, fidelity)
        land = np.squeeze(np.asarray(globe_m[:, 2] == 1))
        land_m = globe_m[land]

        myCL = cluster.k_means(land_m, NPoints)
        centroids = myCL[0]

        return pd.DataFrame(land_m).iloc[:, 0:2], pd.DataFrame(
            centroids).iloc[:, 0:2]
Example #32
def test_k_means_function():
    # test calling the k_means function directly
    # catch output
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        cluster_centers, labels, inertia = k_means(X, n_clusters=n_clusters,
                                                   verbose=True)
    finally:
        sys.stdout = old_stdout
    centers = cluster_centers
    assert_equal(centers.shape, (n_clusters, n_features))

    assert_equal(np.unique(labels).shape[0], n_clusters)

    # check that the label assignment is perfect (up to a permutation)
    assert_equal(v_measure_score(true_labels, labels), 1.0)
    assert_greater(inertia, 0.0)

    # check warning when centers are passed
    assert_warns(RuntimeWarning, k_means, X, n_clusters=n_clusters,
                 init=centers)

    # too many clusters desired
    assert_raises(ValueError, k_means, X, n_clusters=X.shape[0] + 1)
Example #33
def smythEmissionDistribution(pair):
	"""
	Given a pair (S: list of sequences, target_m: int), get the emission
	distribution for Smyth's "default" HMM. target_m is an upper bound on the
	number of states -- if we can only have m' distinct observation values, then
	the distribution for a m' state HMM is returned.

	@param pair: A tuple of the form (S: list of sequences, target_m: int)
	@return: The corresponding emission distribution encoded as a list
		of (mu, stddev) pairs
	"""
	S, target_m = pair
	merged, distinct = prepareSeqs(S)
	m_prime = min(target_m, len(distinct))
	centroids, labels, inertia = k_means(merged, m_prime, init='k-means++')
	clusters = partition(merged, labels)
	B = []
	has_zero = False
	for cluster in clusters:
		assert len(cluster) > 0
		mu = mean(cluster)
		stddev = std(cluster)
		B.append((mu, stddev))
		if stddev < 0.001:
			has_zero = True
	return (B, labels, has_zero)
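prepareSeqs and partition are project helpers that are not shown. Plausible sketches consistent with how they are used here, assuming 1-D observations (both are assumptions, not the originals):

import numpy as np

def prepareSeqs(S):
    # assumed: concatenate the sequences into a column vector of
    # observations and collect the set of distinct values
    merged = np.concatenate([np.asarray(s, dtype=float) for s in S]).reshape(-1, 1)
    return merged, set(merged.ravel())

def partition(merged, labels):
    # assumed: group the observations by their k-means cluster label
    labels = np.asarray(labels)
    return [merged[labels == k].ravel() for k in sorted(set(labels))]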
Example #34
    def get_jackknife_randoms(N_jack,
                              catalog_data,
                              generate_randoms,
                              ra='ra',
                              dec='dec'):
        """
        Computes the jackknife regions and random catalogs for each region 
        Parameters
        ----------
        N_jack : number of regions
        catalog_data : input catalog
        generate_randoms: function to generate randoms (eg self.generate_processed_randoms)

        Returns
        -------
        jack_labels: array of regions in catalog data
        randoms: dict of randoms labeled by region
        """
        #cluster
        nn = np.stack((catalog_data[ra], catalog_data[dec]), axis=1)
        _, jack_labels, _ = k_means(n_clusters=N_jack, random_state=0, X=nn)

        randoms = {}
        for nj in range(N_jack):
            catalog_data_jk = dict(
                zip(catalog_data.keys(),
                    [v[(jack_labels != nj)] for v in catalog_data.values()]))
            rand_cat, rr = generate_randoms(
                catalog_data_jk)  #get randoms for this footprint
            randoms[str(nj)] = {'ran': rand_cat, 'rr': rr}

        return jack_labels, randoms
Example #35
def mosquito_init(sample_space, dim_count, num_particles, num_swarms=5):

    if num_swarms > num_particles:
        num_swarms = num_particles

    low_bound, high_bound = sample_space

    mosquitos = np.random.uniform(low_bound,
                                  high_bound,
                                  size=(num_particles, dim_count))

    #leaders, swarms = kmeans(mosquitos, num_centroids=num_swarms)
    #print swarms
    #print

    res = k_means(mosquitos, num_swarms)
    swarm_idx = res[1]
    swarms = [[] for _ in xrange(num_swarms)]
    for idx in xrange(len(swarm_idx)):
        swarms[swarm_idx[idx]].append(mosquitos[idx])

    # print swarms
    # exit(0)
    starvation = [0.0 for _ in xrange(num_swarms)]
    return [(np.inf, None)
            for _ in xrange(num_swarms)], swarms, starvation, num_particles
Example #36
def determine_k(item):
    print ''
    nums = 10  # number of runs for averaging the silhouette score
    max_k = 10
    min_k = 2
    result = [0] * (max_k - min_k)
    data = data_pre(item)
    for j in range(nums):
        scs = []
        for i in range(min_k, max_k):
            [centroid, label, inertia] = cluster.k_means(data, i)
            sc = silhouette_score(data, label, metric='euclidean')  # mean silhouette score
            scs.append(sc)
            result[i - min_k] += sc

    for i in range(len(scs)):
        result[i] = result[i] / nums
    temp = pd.DataFrame(result)
    temp.to_csv('data/Silhouette' + str(item) + '.csv',
                index=False)  # index=False omits the row numbers

    plt.plot(result, 'rx-')
    plt.title('scs' + str(nums))
    plt.xlabel('the numbers of clusters')
    plt.ylabel('Silhouette Coefficient')
    plt.savefig('img/Silhouette Coefficient.png')
    #plt.show()
    plt.close()
Example #37
def example_4():
	"""
	compare to the scikit-learn implementation of k-means
	"""

	import sklearn.cluster as skc
	import time
	
	ndata = 50000
	dimension = 10
	ncentroids = 1000
	data = npr.randn(ndata, dimension).astype(np.float64)

	centroids0 = data[0:ncentroids, :]

	t0 = time.time()
	kmeans.get_clustering(X = data, init = centroids0, n_clusters = ncentroids, algorithm = 'auto', verbose = 1, n_threads = 1)
	t1 = time.time()

	sklearner = skc.k_means(X = data, n_clusters = ncentroids, max_iter = 1000, n_init = 1, init = centroids0, precompute_distances = False, verbose = True, n_jobs = 1, return_n_iter = True, tol = 0.0)
	t2 = time.time()	
	
	kmeans_time = t1 - t0
	sklearner_time = t2 - t1
	
	print "sklearn : ", sklearner_time, " s"
	print "this kmeans: ",  kmeans_time, " s"
Example #38
def cluster_samples(args):
    workdir = args.workdir

    analysis_dir = os.path.join(workdir, "analysis")
    if not os.path.exists(analysis_dir):
        os.makedirs(analysis_dir)

    project_summary = deserialize(os.path.join(workdir,
                                               PROJECT_SUMMARY_FLNAME))

    model_fl = os.path.join(workdir, "models", "pca.pkl")
    model = joblib.load(model_fl)
    projected = model[PROJECTION_KEY]
    components = map(lambda idx: idx - 1, args.components)
    selected = projected[:, components]

    features = read_features(workdir)

    _, labels, inertia = k_means(selected, args.n_clusters, n_jobs=-2)

    fig_flname = os.path.join(analysis_dir,
                              "clusters_%s.tsv" % args.n_clusters)

    clusters = defaultdict(list)
    for name, cluster in zip(features.sample_labels, labels):
        clusters[cluster].append(name)

    with open(fig_flname, "w") as fl:
        for cluster, samples in clusters.iteritems():
            fl.write(str(cluster))
            fl.write(",")
            fl.write(",".join(samples))
            fl.write("\n")
Example #39
def sweep_clusters(args):
    workdir = args.workdir

    figures_dir = os.path.join(workdir, "figures")
    if not os.path.exists(figures_dir):
        os.makedirs(figures_dir)

    project_summary = deserialize(os.path.join(workdir,
                                               PROJECT_SUMMARY_FLNAME))

    model_fl = os.path.join(workdir, "models", "pca.pkl")
    model = joblib.load(model_fl)
    projected = model[PROJECTION_KEY]
    components = map(lambda idx: idx - 1, args.components)
    selected = projected[:, components]

    features = read_features(workdir)

    inertia_values = []
    for k in args.n_clusters:
        print "Clustering with %s clusters" % k
        _, _, inertia = k_means(selected, k, n_jobs=-2)
        inertia_values.append(inertia)

    plt.plot(args.n_clusters, inertia_values, "k.-")
    plt.xlabel("Number of Clusters", fontsize=16)
    plt.ylabel("Inertia", fontsize=16)

    fig_flname = os.path.join(figures_dir, "cluster_inertia")
    for dim in args.components:
        fig_flname += "_%s" % dim
    fig_flname += ".png"

    plt.savefig(fig_flname, dpi=300)
Example #40
    def cluster_GSOM(self, gsom_map, n_clusters=2):
        """
        Parameters
        ----------
        gsom_map : growing self organizing map
            2D array of weight vectors in SOM.
        n_clusters : number of clusters.

        Returns
        -------
        gsom_list : list
            list of the gsom nodes
        centroid : list
            cluster centroids.
        labels : list
            cluster label w.r.t. gsom node data-point as in gsom_list
        """

        gsom_list = self._gsom_to_array(gsom_map)

        clf = k_means(gsom_list, n_clusters=n_clusters)

        centroids = clf[0]
        labels = clf[1]

        return gsom_list, centroids, labels
Example #41
def test_k_means_function():
    # test calling the k_means function directly
    # catch output
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        cluster_centers, labels, inertia = k_means(X, n_clusters=n_clusters,
                                                   sample_weight=None,
                                                   verbose=True)
    finally:
        sys.stdout = old_stdout
    centers = cluster_centers
    assert_equal(centers.shape, (n_clusters, n_features))

    assert_equal(np.unique(labels).shape[0], n_clusters)

    # check that the label assignment is perfect (up to a permutation)
    assert_equal(v_measure_score(true_labels, labels), 1.0)
    assert_greater(inertia, 0.0)

    # check warning when centers are passed
    assert_warns(RuntimeWarning, k_means, X, n_clusters=n_clusters,
                 sample_weight=None, init=centers)

    # too many clusters desired
    assert_raises(ValueError, k_means, X, n_clusters=X.shape[0] + 1,
                  sample_weight=None)

    # kmeans for algorithm='elkan' raises TypeError on sparse matrix
    assert_raise_message(TypeError, "algorithm='elkan' not supported for "
                         "sparse input X", k_means, X=X_csr, n_clusters=2,
                         sample_weight=None, algorithm="elkan")
Example #42
    def action_execute_button_clicked(self):
        # open the input image
        input_img = gdal.Open(self.input_file_path.text())
        img_rows = input_img.RasterYSize
        img_cols = input_img.RasterXSize
        img_bands = input_img.RasterCount
        img_geotrans = input_img.GetGeoTransform()
        img_proj = input_img.GetProjection()

        # reshape the image bands into the sample matrix k_means expects
        input_features = []
        for i in range(1, img_bands + 1):
            band_img = input_img.GetRasterBand(i).ReadAsArray(
                0, 0, img_cols, img_rows)
            input_features.append(band_img.reshape(-1))
        input_features = np.array(input_features).T

        # run the k_means algorithm
        kmeans_result = k_means(input_features,
                                int(self.cluster_num.currentText()),
                                max_iter=int(self.iter_num.currentText()))

        # replace each sample's gray value with its cluster's display color
        cluster_centers, clustered_points, _ = kmeans_result
        output_feature = []
        for index, item in enumerate(clustered_points):
            while item > len(self.color_list) - 1:
                self.color_list.append(list(np.random.randint(256, size=3)))
            output_feature.append(self.color_list[item])
        output_feature = np.array(output_feature).T
        output_feature = np.array(
            list(map(lambda x: x.reshape((img_rows, img_cols)),
                     output_feature)))

        # write out the clustered image
        driver = gdal.GetDriverByName("GTiff")
        output_img = driver.Create(self.output_file_path.text(), img_cols,
                                   img_rows, 3, gdal.GDT_Byte)
        output_img.SetGeoTransform(img_geotrans)
        output_img.SetProjection(img_proj)
        for i in range(1, 4):
            output_img.GetRasterBand(i).WriteArray(output_feature[i - 1])
        del output_img

        layer_legends = []  # legend entries
        for i in range(len(cluster_centers)):
            layer_legends.append({
                'name':
                'Cluster' + str(i + 1),
                'color':
                QColor(self.color_list[i][0], self.color_list[i][1],
                       self.color_list[i][2])
            })

        if (QMessageBox.question(self, "Message", "Clustering finished. Add the result as a layer?",
                                 QMessageBox.Yes | QMessageBox.No,
                                 QMessageBox.Yes) == QMessageBox.Yes):
            self.add_layer_signal.emit(self.output_file_path.text(),
                                       layer_legends)
Example #43
def testRealData():
    sDirname = os.path.dirname(os.path.abspath(__file__))
    dfAspen = pd.read_csv(
        os.path.join(sDirname, '..', 'datasets', 'aspen.csv'), ';')
    dfAspen = dfAspen.dropna()
    dfAspen = dfAspen.reindex(np.random.permutation(dfAspen.index))

    asCities = dfAspen['address_city'].unique()
    X = dfAspen[['location_coordinates_0',
                 'location_coordinates_1']].as_matrix()
    Y = dfAspen['address_city'].as_matrix()
    Y = np.array([asCities.tolist().index(sCity) for sCity in Y
                  ])  # Convert array of strings to sequential integers
    # Convert coordinates to x,y grid using the Equirectangular projection
    X = X * np.pi / 180
    fMeanLatitude = X.mean(0)[0]
    X[:, 0] = X[:, 0] * np.cos(fMeanLatitude)
    # First x, then y:
    Xaux = np.copy(X)
    X[:, 1] = Xaux[:, 0]
    X[:, 0] = Xaux[:, 1]

    n_clusters = len(asCities)

    # K-means:
    (centers, PredictedLabels, inertia,
     best_n_iter) = k_means(X, n_clusters, n_init=1, return_n_iter=True)
    adjustedRandScore = metrics.adjusted_rand_score(Y, PredictedLabels)
    print('KMeans adjusted rand score: %s' % (adjustedRandScore))
    print('Number of iterations: %s' % (best_n_iter))

    # Seeded k-means with all seeds:
    # Drop some seeds:
    fRatio = 0
    dMapping = {}
    maxSeed = -1
    SomeSeeds = np.repeat(-1, len(Y))
    for i in range(len(Y)):
        if np.random.rand() > fRatio:
            continue
        iCity = Y[i]
        if iCity not in dMapping:
            maxSeed += 1
            dMapping[iCity] = maxSeed
        SomeSeeds[i] = dMapping[iCity]

    sm = SeededKMeans(n_clusters=n_clusters, max_iter=2000, verbose=1)
    sm.fit(X, SomeSeeds)
    PredictedLabels = sm.predict(X)
    adjustedRandScore = metrics.adjusted_rand_score(Y, PredictedLabels)
    print('Seeded KMeans adjusted rand score: %s' % (adjustedRandScore))

    for predictedLabel in np.unique(PredictedLabels):
        plt.scatter(X[PredictedLabels == predictedLabel, 0],
                    X[PredictedLabels == predictedLabel, 1],
                    color=(np.random.rand(), np.random.rand(),
                           np.random.rand()),
                    alpha=1)
    plt.show()
Example #44
def get_point_centroids(indata,K,D):
    mean = numpy.zeros((indata.shape[1],D))
    for n in xrange(0,(indata.shape[1])):
        for i in xrange(0,(indata.shape[2])):
            for j in xrange(0,D):
                mean[n][j] = mean[n][j] + indata[j][n][i]
        mean[n] = mean[n]/(indata.shape[2])
    (centroids, x, y) = k_means(mean, K)  # random order; change n_jobs to speed up
    return centroids
Example #45
	def twoClassTrain(self,data1,data2):
		#y is a N * 1 matrix
		data = data1 + data2
		y = [[1 if d[0] == data1[0][0] else -1] for d in data]
		X = [d[1:] for d in data]
		centroid, label, inertia = cluster.k_means(X,self.k)
		phi = self.genTransferMatrix(X,centroid)
		w = numpy.dot(numpy.linalg.pinv(phi),y)
		return w, centroid
Example #46
    def learn(self, Xtrain, ytrain):
        """ Learns using the traindata """
        Xless = Xtrain[:,self.features]
        lam = 100

        self.centers = cluster.k_means(Xless, 10)[0]
        Xless = np.dot(Xless, self.centers.T)

        self.weights = np.dot(np.dot(np.linalg.inv(np.dot(Xless.T,Xless) + lam*np.identity(Xless.shape[1])), Xless.T),ytrain)
Example #47
def kmeans(xs, k):
    assert xs.ndim == 2
    try:
        from sklearn.cluster import k_means
        _, labels, _ = k_means(xs.astype("float64"), k)
    except ImportError:
        from scipy.cluster.vq import kmeans2
        _, labels = kmeans2(xs, k, missing='raise')
    return labels
Example #48
 def initial_means(self,X,n_clusters):
     mean_KMeans_initial, label, inertia = k_means(X, n_clusters, random_state = 1)
     
     KernelKMeans_model = KernelKMeans(n_clusters=n_clusters, random_state=1,
                          kernel="rbf", gamma=None, coef0=1,
                          verbose=0)        
     KernelKMeans_model.fit(X)
     mean_KernelKMeans_initial = self.pre_image(X, KernelKMeans_model.labels_, KernelKMeans_model.gamma, n_clusters, 100)
     return mean_KMeans_initial , mean_KernelKMeans_initial 
Example #49
    def learn(self, Xtrain, ytrain):
        """ Learns using the traindata """
        Xless = Xtrain[:,self.features]
        lam = 100 # set lambda for regularizer coefficient

        self.centers = cluster.k_means(Xless, 10)[0]
        Xless = (1 + np.dot(Xless, self.centers.T))**self.d


        self.weights = np.dot(np.dot(np.linalg.inv(np.dot(Xless.T,Xless) + lam*np.identity(Xless.shape[1])), Xless.T),ytrain)
Example #50
def gap(data, refs=None, nrefs=20, ks=range(1,11)):
    """
    I: NumPy array, reference matrix, number of reference boxes, number of clusters to test
    O: Gaps NumPy array, Ks input list
    
    Give the list of k-values for which you want to compute the statistic in ks.
    Implements the Gap Statistic of Tibshirani, Walther and Hastie.
    """
    shape = data.shape
    
    if refs is None:
        tops = data.max(axis=0)
        bottoms = data.min(axis=0)
        dists = scipy.matrix(scipy.diag(tops - bottoms))
        rands = scipy.random.random_sample(size=(shape[0], shape[1], nrefs))
        for i in range(nrefs):
            rands[:, :, i] = rands[:, :, i] * dists + bottoms
    else:
        rands = refs
 
    gaps = scipy.zeros((len(ks),))
    
    for (i,k) in enumerate(ks):
        k_means_args_dict['n_clusters'] = k
        # KMeans estimator; sklearn's k_means function has no .fit()
        kmeans = KMeans(**k_means_args_dict)
        kmeans.fit(data)
        (cluster_centers, point_labels) = kmeans.cluster_centers_, kmeans.labels_

        disp = sum([dst(data[current_row_index, :], cluster_centers[point_labels[current_row_index],:]) for current_row_index in range(shape[0])])

        refdisps = scipy.zeros((rands.shape[2],))

        for j in range(rands.shape[2]):

            kmeans = KMeans(**k_means_args_dict)
            kmeans.fit(rands[:, : ,j])
            (cluster_centers, point_labels) = kmeans.cluster_centers_, kmeans.labels_
            refdisps[j] = sum([dst(rands[current_row_index,:,j], cluster_centers[point_labels[current_row_index],:]) for current_row_index in range(shape[0])])
        
        #let k be the index of the array 'gaps'
        gaps[i] = scipy.mean(scipy.log(refdisps)) - scipy.log(disp)
    
    return ks, gaps
Example #51
def cluster_spatial_data(X, n_parcels, xyz=None, shape=None, mask=None,
                         method='ward', verbose=False):
    """Cluster the data using Ward's algorithm

    Parameters
    ==========
    X: array of shape(n_voxels, n_subjects)
       the functional data, across subjects
    n_parcels: int, the desired number of parcels
    xyz: array of shape (n_voxels, 3), optional
         positions of the voxels in grid coordinates
    shape: tuple, optional
           the domain shape (assuming a grid structure);
           alternative specification of positions
    mask: arbitrary array of arbitrary dimension, optional
          alternative specification of positions
    method: string, one of ['ward', 'spectral', 'kmeans'], optional
            clustering method
    verbose: bool, optional
             verbosity mode

    Returns
    =======
    label: array of shape(n_voxels): the resulting cluster assignment

    Note
    ====
    One of xyz, shape or mask needs to be provided
    """
    from sklearn.cluster import spectral_clustering, k_means
    if mask is not None:
        connectivity = grid_to_graph(*shape, mask=mask)
    elif shape is not None:
        connectivity = grid_to_graph(*shape)
    elif xyz is not None:
        from sklearn.neighbors import kneighbors_graph
        n_neighbors = 2 * xyz.shape[1]
        connectivity = kneighbors_graph(xyz, n_neighbors=n_neighbors)
    else:
        raise ValueError('One of mask, shape or xyz has to be provided')

    if n_parcels == 1:
        return np.zeros(X.shape[0])
    if method == 'ward':
        connectivity = connectivity.tocsr()
        ward = Ward(n_clusters=n_parcels, connectivity=connectivity).fit(X)
        label = ward.labels_
    elif method == 'spectral':
        i, j = connectivity.nonzero()
        sigma = np.sum((X[i] - X[j]) ** 2, 1).mean()
        connectivity.data = np.exp(- np.sum((X[i] - X[j]) ** 2, 1) /
                                      (2 * sigma))
        label = spectral_clustering(connectivity, n_clusters=n_parcels)
    elif method == 'kmeans':
        _, label, _ = k_means(X, n_parcels)
    else:
        raise ValueError('Unknown method for parcellation')
    return label
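A minimal usage sketch on a synthetic grid, assuming the module-level imports this snippet relies on (numpy as np, grid_to_graph, Ward) are in place:

import numpy as np

# 100 voxels on a 10x10 grid, 5 subjects; ask for 4 contiguous parcels
X = np.random.rand(100, 5)
label = cluster_spatial_data(X, n_parcels=4, shape=(10, 10), method='ward')
print(np.unique(label))                  # array([0, 1, 2, 3])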
Beispiel #52
0
def cluster_index_2(X):
    global_mean = np.mean(X, axis=0)

    # total sum of squared distances of each sample from the global mean
    sum_squared_distances = (((X - global_mean) ** 2).sum(axis=1)).sum()

    centroids, labels, inertia = k_means(X, 2)

    # ratio of within-cluster dispersion (k=2) to total dispersion;
    # values well below 1 indicate a strong two-cluster structure
    ci = inertia / sum_squared_distances

    return ci, labels
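A quick illustration of how the index behaves (the data is illustrative): two well-separated blobs give a small ratio, while structureless data stays much closer to 1.

import numpy as np

rng = np.random.RandomState(0)
blobs = np.vstack([rng.normal(0, 1, (100, 2)), rng.normal(10, 1, (100, 2))])
noise = rng.rand(200, 2)

print(cluster_index_2(blobs)[0])   # small: most variance is between clusters
print(cluster_index_2(noise)[0])   # noticeably larger: weak 2-cluster structure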
Beispiel #53
0
def KMEANS(data, k):

    if data.shape[0] < 20000:
        centroids, cluster_IDs, _ = k_means(data, k, init = 'k-means++', precompute_distances = 'auto', n_init = 20, max_iter = 200)
    else:
        # fall back to mini-batch k-means on large data sets; batch_size
        # must be an integer, hence the floor division
        mbkm = MiniBatchKMeans(k, init = 'k-means++', max_iter = 100, batch_size = data.shape[0] // k, n_init = 20)
        mbkm.fit(data)

        centroids = mbkm.cluster_centers_
        cluster_IDs = mbkm.labels_

    return centroids, cluster_IDs
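Usage is identical on both sides of the size threshold (the data here is illustrative):

import numpy as np

data = np.random.rand(1000, 8)           # small enough for exact k-means
centroids, cluster_IDs = KMEANS(data, 5)
print(centroids.shape)                   # (5, 8)
print(np.unique(cluster_IDs))            # [0 1 2 3 4]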
Beispiel #54
0
def smythEmissionDistribution(pair):
	"""
	Given a pair (S: list of sequences, target_m: int), get the emission
	distribution for Smyth's "default" HMM. target_m is an upper bound on the
	number of states -- if we can only have m' distinct observation values, then
	the distribution for a m' state HMM is returned.

	@param pair: A tuple of the form (S: list of sequences, target_m: int)
	@return:  (B, labels, has_zero), where:
	   * S', obs = concat(S), set(S)
	   * m' = min(target_m, len(obs))
	   * [C_0,...,C_{m'-1}] = result of clustering S' with k-means.
	   * labels: tells which cluster each item in merged goes into; i.e.,
	       labels[i] = j, where S'[i] belongs to cluster C_j.
	   * B[i] = (mean(C_i), stddev(C_i)).
	   * has_zero = True if there is i such that B[i][1] ~= 0.0.
	"""
	S, target_m = pair
	# merged list of 1d vectors, set of distinct observation values
	merged, distinct = prepareSeqs(S)

	# m_prime is min of either target_m or the number of distinct obs values 
	m_prime = min(target_m, len(distinct))

	# k-means partitions merged into m_prime clusters [C_0,...,C_{m'-1}].
	# centroids = [c_0,...,c_{m'-1}]: cluster centers; i.e., c_i is the center
	#   of C_i.
	# labels: tells which cluster each item in merged goes into; i.e.,
	#   labels[i] = j, where merged[i] belongs to cluster C_j.
	# inertia: sum of distances of samples to closest cluster center
	#   inertia = sum_{i=0}^{m'-1}(sum_{x in C_i} dist(x, c_i)).
	centroids, labels, inertia = k_means(merged, m_prime, init='k-means++')

	# takes labels and arranges merged into 
	# a list of lists, each of which contains the series from one cluster
	# clusters = [C_0,..,C_{m'-1}]
	clusters = partition(merged, labels)

	# Compute (B, labels, has_zero), where
	#   B[i] = (mean(C_i), stddev(C_i)).
	#   has_zero = True if there is i such that B[i][1] ~= 0.0.
	B = []
	has_zero = False
	for cluster in clusters:
		assert len(cluster) > 0
		mu = mean(cluster)
		stddev = std(cluster)
		B.append((mu, stddev))
		if stddev < 0.001:
			has_zero = True

	return (B, labels, has_zero)
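`partition` is defined elsewhere in this code base; a plausible minimal sketch of the behavior the comments describe (grouping the items of merged by cluster label) would be:

def partition(merged, labels):
	# hypothetical helper: clusters[j] collects every merged[i] with labels[i] == j
	clusters = [[] for _ in range(max(labels) + 1)]
	for item, j in zip(merged, labels):
		clusters[j].append(item)
	return clusters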
Beispiel #55
0
    def test_k_means(self):
        iris = datasets.load_iris()
        df = pdml.ModelFrame(iris)

        result = df.cluster.k_means(3, random_state=self.random_state)
        expected = cluster.k_means(iris.data, 3, random_state=self.random_state)

        self.assertEqual(len(result), 3)
        self.assert_numpy_array_almost_equal(result[0], expected[0])

        self.assertTrue(isinstance(result[1], pdml.ModelSeries))
        self.assert_index_equal(result[1].index, df.index)
        self.assert_numpy_array_equal(result[1].values, expected[1])

        self.assertAlmostEqual(result[2], expected[2])
Beispiel #56
0
def kcluster(content, num_cluster = 6, num_key = 15, single = False, num_top = -1):
  # num_key is the number of keywords extracted from each cluster;
  # we then take the union of the keys extracted from all clusters

  one_hot_tokens, weight_array, mapping_back, postprocess_sentences, C = get_rakeweight_data(content)

  # in case we have fewer vectors than the requested cluster count
  num_cluster = min(num_cluster, len(weight_array))
  token_weights = defaultdict(float)
  keyword_weights = defaultdict(float)
  k_clusters = cluster.k_means(weight_array, num_cluster)[0]
  union_array = []
  keywords = []
  num_key = min(num_key, len(one_hot_tokens))
  for i,vec in enumerate(k_clusters):
    tmp = sorted(range(len(vec)), key=lambda idx: vec[idx])[-num_key:]
    union_array = list(set(tmp) | set(union_array))
    for ind in tmp:
      token = one_hot_tokens[ind]
      degree = sum(C[token].values())
      freq = C[token][token]
      # currently degree. update to different weight scheme if needed
      token_weights[token] += float(degree)
  for ind in union_array:
    token = one_hot_tokens[ind]
    keyword = mapping_back[token]
    keywords.append(keyword.encode('ascii'))
    # for all tokens that map to keyword
    keyword_weights[keyword] += token_weights[token]

  keywords = list(set(keywords))
  if single:
    keywords = set(keywords)
    if (num_top < 0):
      num_top = len(keywords)
    return random.sample(keywords, min(len(keywords), num_top))
  # get keyphrases
  keyphrases,keyphrase_freq = get_keyphrases(keywords, postprocess_sentences)
  # keyphrases_weights = sum keyword_weights[word] / total_words
  # for all words in keywords
  keyphrases_weights = get_keyphrase_weights(keyphrases, keyword_weights, keyphrase_freq)
  keyword_weights.update(keyphrases_weights)
  if num_top < 0:
    num_top = len(keyword_weights) // 3
  top_keywords = sorted(keyword_weights, key=keyword_weights.get, reverse=True)[:min(num_top, len(keyword_weights))]
#  for keyword in top_keywords:
#    print(keyword + ' '*(40-len(keyword)) + str(keyword_weights[keyword]))
  return top_keywords
Beispiel #57
0
def call_k_means(data, n_clusters):
    """
    k_means(X, n_clusters, init='k-means++', precompute_distances=True, n_init=10,
     max_iter=300, verbose=False, tol=0.0001, random_state=None, copy_x=True, n_jobs=1)

    Returns:
    centroid
    label
    inertia
    """
    k_clusters, k_labels, k_dis = cluster.k_means(data, n_clusters)
    print "Centroid coordinates:\n", k_clusters
    print "Cluster labels:\n", k_labels
    print "Inertia:\n", k_dis

    return k_labels
Beispiel #58
0
def kcluster(mapping_back, num_cluster, weight_array, one_hot_tokens, num_key = 3):
  # num_key is the number of keywords extracted from each cluster;
  # we then take the union of the keys extracted from all clusters

  # in case we have fewer vectors than the requested cluster count
  num_cluster = min(num_cluster, len(weight_array))
  k_clusters = cluster.k_means(weight_array, num_cluster)[0]
  union_array = []
  num_key = min(num_key, len(one_hot_tokens))
  for i,vec in enumerate(k_clusters):
    tmp = sorted(range(len(vec)), key=lambda idx: vec[idx])[-num_key:]
    union_array = list(set(tmp) | set(union_array))
  res = []
  for ind in union_array:
    res.append(mapping_back[one_hot_tokens[ind]])
  return res
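A toy invocation of this slimmed-down variant (all inputs are illustrative stand-ins for the RAKE-weighted data the full pipeline produces):

import numpy as np
from sklearn import cluster

tokens = ['alpha', 'beta', 'gamma', 'delta']
mapping = dict((t, t.upper()) for t in tokens)   # hypothetical token -> keyword map
weights = np.random.rand(12, len(tokens))        # 12 "sentences" x 4 token weights

print(kcluster(mapping, 3, weights, tokens, num_key=2))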
Beispiel #59
0
    def learn(self, Xtrain, ytrain):
        """ Learns using the traindata """
        Xless = Xtrain[:,self.features]
        lam = 1000 # set regularizer coeff

        num = Xless.shape[0]
        XKer = np.zeros((num, 10))
        
        self.centers = cluster.k_means(Xless, 10)[0]

        # transform data: a Gaussian RBF feature for each center,
        # exp(-||x - c||^2 / (2 s^2))
        for i in range(num):
            for j in range(10):
                XKer[i, j] = np.exp(-(np.linalg.norm(Xless[i, ] - self.centers[j])**2)/(2*self.s**2))

        # closed-form ridge regression on the RBF features
        self.weights = np.dot(np.dot(np.linalg.inv(np.dot(XKer.T,XKer) + lam*np.identity(XKer.shape[1])), XKer.T),ytrain)
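As in Beispiel #49, the companion predict step is not shown; a minimal sketch under the assumption that test inputs pass through the same RBF feature map:

    def predict(self, Xtest):
        # hypothetical companion to learn(): same RBF map, then linear weights
        Xless = Xtest[:, self.features]
        sq_dists = np.sum((Xless[:, None, :] - self.centers[None, :, :]) ** 2, axis=2)
        Phi = np.exp(-sq_dists / (2 * self.s ** 2))
        return np.dot(Phi, self.weights)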