Example #1
def plot_best_Z(motifs, best_Z):
    """Plot the best Z for each motif in each sequence.
    """
    import numpy
    import pylab
    import scipy.cluster.hierarchy as hier
    import scipy.cluster.vq as vq
    fig = pylab.gcf()

    # Cluster (hierarchical) Y axis
    Y = hier.centroid(best_Z)
    axdendro = fig.add_axes([0.01, 0.02, 0.18, 0.96])
    axdendro.set_xticks([])
    axdendro.set_frame_on(False)
    dendro = hier.dendrogram(Y, labels=motifs, orientation='right')
    best_Z_permuted = best_Z[dendro['leaves'], :]

    # K-means cluster X axis (num_seq_clusters is an external helper that
    # picks the number of sequence clusters)
    xcentroid, xlabel = vq.kmeans2(
        best_Z.T, k=num_seq_clusters(best_Z.shape[1]))
    best_Z_permuted = best_Z_permuted[:, numpy.argsort(xlabel)]

    # Plot matrix
    axmatrix = fig.add_axes([0.4, 0.02, 0.5, 0.96])
    im = axmatrix.matshow(best_Z_permuted, aspect='auto', origin='lower')
    axmatrix.set_xticks([])
    axmatrix.set_yticks([])

    # Plot colorbar
    axcolor = fig.add_axes([0.91, 0.02, 0.02, 0.96])
    pylab.colorbar(im, cax=axcolor)
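A minimal usage sketch for plot_best_Z above. The num_seq_clusters helper is not shown in this snippet, so a hypothetical stand-in is stubbed here:

import numpy
import pylab

def num_seq_clusters(n_seqs):
    # hypothetical stand-in for the project's helper
    return max(2, n_seqs // 10)

best_Z = numpy.random.rand(12, 50)  # 12 motifs x 50 sequences
motifs = ['motif-%d' % m for m in range(12)]
plot_best_Z(motifs, best_Z)
pylab.show()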
Example #2
def plot_collinearity(motifs, best_Z):
    """Plot the cooccurrences of motifs.
    """
    import numpy
    import pylab
    import scipy.cluster.hierarchy as hier
    # from scipy.stats import pearsonr
    M = len(motifs)
    cooccurrences = numpy.ones((M, M))
    for m1 in range(M):
        for m2 in range(M):
            # both = sum(numpy.logical_and(m1seqs, m2seqs))
            # cooccurrences[m1,m2] = both/float(sum(m2seqs))
            cooccurrences[m1, m2] = \
                numpy.sqrt(sum(best_Z[m1] * best_Z[m2])) \
                / numpy.linalg.norm(best_Z[m2])
            # rho, _ = pearsonr(best_Z[m1], best_Z[m2])
            # cooccurrences[m1, m2] = rho
    Y = hier.centroid(cooccurrences)
    # with t=-1 every motif becomes its own flat cluster, so the ids form a
    # permutation used to reorder rows and columns consistently
    index = hier.fcluster(Y, -1) - 1
    cooccurrences = cooccurrences[index, :]
    cooccurrences = cooccurrences[:, index]
    pylab.pcolor(cooccurrences)
    pylab.colorbar()
    ax = pylab.gca()
    ax.set_xticks([])
    # ax.set_xticks(.5 + numpy.arange(M))
    # ax.set_xticklabels(motifs)
    ax.set_yticks(.5 + numpy.arange(M))
    ax.set_yticklabels(numpy.asarray(motifs)[index])
    ax.set_xlim((0, M))
    ax.set_ylim((0, M))
    for line in ax.yaxis.get_ticklines():
        line.set_markersize(0)
    pylab.gcf().subplots_adjust(left=.27, bottom=.02, top=.98, right=.99)
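The similarity used above, computed for a single pair in isolation (a toy sketch; rows of best_Z are per-motif scores across sequences):

import numpy
z1 = numpy.array([1., 0., 1., 0.])
z2 = numpy.array([1., 1., 0., 0.])
sim = numpy.sqrt(numpy.sum(z1 * z2)) / numpy.linalg.norm(z2)
print(sim)  # sqrt(1) / sqrt(2), roughly 0.707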
Example #3
    def getCentroids(self):
        # assumes module-level imports:
        # from scipy.cluster.hierarchy import fclusterdata, centroid
        centroids = {}
        for i in range(int(self.utils.initial_cluster_size),
                       int(self.utils.max_cluster_size) + 1):
            if self.memory[str(i)]['arrayMeas'] is not None:
                self.my_logger.debug("GETCENTROIDS state " + str(i) +
                                     " measurements : " +
                                     str(self.memory[str(i)]['arrayMeas']))
                if len(self.memory[str(i)]['arrayMeas']) > 1:
                    # earlier experiments, kept for reference:
                    # Y = pdist(self.memory[str(i)]['arrayMeas'], 'seuclidean')
                    # Z = linkage(Y, 'single')  # single, complete, average, weighted, median, centroid, ward
                    # T = fcluster(Z, t=1.0, criterion='distance')
                    Y = self.memory[str(i)]['arrayMeas']
                    T = fclusterdata(self.memory[str(i)]['arrayMeas'],
                                     t=15.0,
                                     criterion='distance',
                                     metric='euclidean',
                                     method='single')
                    self.my_logger.debug("GETCENTROIDS state " + str(i) +
                                         " clusters: " + str(T))
                    Z = centroid(Y)
                    self.my_logger.debug("GETCENTROIDS state " + str(i) +
                                         " centroid func: " + str(Z))
                    # note: this branch only logs; centroids[str(i)] is not filled in
                else:
                    centroids[str(i)] = {}
                    centroids[str(i)]['throughput'] = self.memory[str(
                        i)]['arrayMeas'][0][0]
                    centroids[str(i)]['latency'] = self.memory[str(
                        i)]['arrayMeas'][0][1]

        self.my_logger.debug("GETCENTROIDS centroids: " + str(centroids))
        return centroids
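For reference, fclusterdata in isolation on toy data, with the same parameters as above (a sketch, not part of the original class):

import numpy
from scipy.cluster.hierarchy import fclusterdata

pts = numpy.array([[0., 0.], [1., 1.], [40., 40.]])
labels = fclusterdata(pts, t=15.0, criterion='distance',
                      metric='euclidean', method='single')
print(labels)  # e.g. [1 1 2]: the two nearby points merge below the threshold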
Example #4
def hier_cluster_and_permute(matrix):
    import scipy.cluster.hierarchy as hier
    from scipy.spatial.distance import pdist

    D = pdist(matrix)  # upper triangle of distance matrix as vector
    Y = hier.centroid(D)  # cluster with centroid linkage
    leaves = hier.leaves_list(Y)  # row order induced by the dendrogram

    # return permuted matrix and linkage
    return matrix[leaves], Y
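A quick usage sketch for hier_cluster_and_permute (random data stands in for a real matrix):

import numpy
permuted, linkage_matrix = hier_cluster_and_permute(numpy.random.rand(10, 4))
print(permuted.shape)  # (10, 4): same data, rows reordered to follow the dendrogram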
Example #5
def test_clustering_is_same_as_scipy():
    ''' Basic clustering returns the same as scipy  '''
    features = random_feat_matrix(600, 705)

    scipy_ward = sci_hie.ward(features)
    our_ward = our_hie.clustering(features)

    scipy_centroid = sci_hie.centroid(features)
    our_centroid = our_hie.clustering(features, method='centroid')

    numpy.testing.assert_almost_equal(scipy_ward, our_ward)
    numpy.testing.assert_almost_equal(scipy_centroid, our_centroid)
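These tests rely on a random_feat_matrix helper that is not shown here; a plausible stand-in (an assumption, not the project's actual helper):

import numpy

def random_feat_matrix(n_obs, n_feats):
    # hypothetical stub: uniform random feature matrix
    return numpy.random.rand(n_obs, n_feats)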
Example #6
def test_clustering_is_same_as_scipy_2():
    ''' Basic clustering returns the same as scipy 2 '''
    cifti = nibabel.load('./logpar/cli/tests/data/test.dconn.nii')
    features = cifti.get_data()[0, 0, 0, 0]

    scipy_ward = sci_hie.ward(features)
    our_ward = our_hie.clustering(features)

    scipy_centroid = sci_hie.centroid(features)
    our_centroid = our_hie.clustering(features, method='centroid')

    numpy.testing.assert_almost_equal(scipy_ward[:, :2], our_ward[:, :2])
    numpy.testing.assert_almost_equal(scipy_centroid[:, :2],
                                      our_centroid[:, :2])
Example #7
def test_all_neighbors_is_same_as_scipy():
    ''' Clustering without constraints returns the same as scipy  '''
    features = random_feat_matrix(200, 100)
    n = features.shape[0]
    all_neighbors = numpy.ones((n, n)) - numpy.eye(n)

    scipy_ward = sci_hie.ward(features)
    our_ward = our_hie.clustering(features, method='ward',
                                  constraints=all_neighbors)

    scipy_centroid = sci_hie.centroid(features)
    our_centroid = our_hie.clustering(features, method='centroid',
                                      constraints=all_neighbors)

    numpy.testing.assert_almost_equal(scipy_ward, our_ward)
    numpy.testing.assert_almost_equal(scipy_centroid, our_centroid)
Example #9
plt.figure(figsize=(15, 10))
h.dendrogram(result)
plt.show()

flat_single = h.fcluster(result, 1394, criterion='distance')

adjusted_rand_score(y.flatten(), flat_single)

adjusted_mutual_info_score(y.flatten(), flat_single)
"""### **Centroid**

Per ogni cluster viene calcolato un *centroide* che rappresenta la media. I cluster vengono uniti in base a i centroidi più simili tra loro. Tali cluster vengono uniti a due a due.
"""

result = h.centroid(X)

plt.figure(figsize=(15, 10))
h.dendrogram(result)
plt.show()

flat_centroid = h.fcluster(result, 1394, criterion='distance')

adjusted_rand_score(y.flatten(), flat_centroid)

adjusted_mutual_info_score(y.flatten(), flat_centroid)
"""## Conclusioni

In conclusione si è visto che per questo dataset K-Means ha prodotto dei cluster molto più raffinati rispetto agli algoritmi gerarchici. Vantaggio per l'algoritmo K-Means è che si conosceva a priori il numero di cluster che dovevano essere creati.

L'implementazione dell'algoritmo K-Means non è prestante rispetto all'implementazione di *sklearn*, tuttavia producono dei risultati simili.
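Since the hierarchy is already built, it can also be cut into a fixed number of clusters, mirroring K-Means' n_clusters (a sketch reusing the h and X from above; t=3 is an arbitrary choice):

flat_k = h.fcluster(h.centroid(X), t=3, criterion='maxclust')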
Example #10
print(H.shape)
# H lists every merge performed (#samples - 1 rows); each row holds the
# pair of clusters merged, the merge distance, and the #samples in the new cluster
h.dendrogram(H)
pl.show()
# the dendrogram is stretched out by the chaining effect, a typical problem of single link

# complete link
H = h.complete(X)
h.dendrogram(H)
pl.show()

# average link
H = h.average(X)
h.dendrogram(H)
pl.show()

# centroid link
H = h.centroid(X)
h.dendrogram(H)
pl.show()
# there are some inversions because the centroid distance is not monotonic
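# quick check (a sketch, not in the original script): count inversions
# directly in the linkage matrix, i.e. merges whose height is lower than
# the height of the preceding merge (assumes np is numpy, as used below)
print("inversions:", int(np.sum(np.diff(H[:, 2]) < 0)))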

# to obtain flat clusters we must define a distance threshold
H = h.average(X)
C = h.fcluster(H, 1.9,
               criterion='distance')  # threshold chosen by inspecting the dendrogram
# to see the number of clusters:
print("n cluster:", len(np.unique(C)))
print("adj rand index (hierarchical):", metrics.adjusted_rand_score(C, Y))
Example #11
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster, centroid
from scipy.spatial.distance import pdist

if __name__ == '__main__':

    gng_output = np.load('data/gng_output.npy')

    # hierarchical clustering
    Z = centroid(pdist(gng_output, 'correlation'))
    dendrogram(Z, color_threshold=0.0035)
    plt.show()

    # hierarchical clustering (commented-out alternative: average linkage)
    # Z = linkage(gng_output, metric='correlation', method='average')
    # dendrogram(Z, color_threshold=0.0035)
    # plt.show()

    cl = fcluster(Z, 0.0035, criterion='distance')

    np.save('data/node_labels', cl)
Example #12
nclouds = int(nclouds_cusize[0])
cloud_bin = np.array(cloud_bin_netcdf[0, 0:nclouds:])

# Adjust data format for later use:
cloud_lon = cloudlon[0, 0:nclouds]
cloud_lat = cloudlat[0, 0:nclouds]
cloud_size = cloud_bin * size[0]
#cloudcentres = np.vstack((cloud_lon,cloud_lat,cloud_size)).T
cloudcentres = np.vstack((cloud_lon, cloud_lat)).T
labels = np.arange(nclouds)

# Compute distances for all pairs based on White et al 2018:
Y = distance.pdist(cloudcentres, haversine)

# Compute linkage matrix:
Z = centroid(Y)
max_d = 8000

# Plot dendrogram:
plt.figure(figsize=(25, 10))
plt.xlabel('Cloud label')
plt.ylabel('Euclidean distance [m]')
dendrogram(
    Z,
    leaf_rotation=90.,
    truncate_mode='lastp',
    p=50,
    #show_leaf_counts=False,
    show_contracted=True,
    color_threshold=max_d,
)
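# A natural follow-up (a sketch, not in the original): flat cloud-cluster labels
# at the same cutoff, assuming fcluster is imported from scipy.cluster.hierarchy
cloud_clusters = fcluster(Z, max_d, criterion='distance')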
Example #13
if __name__ == '__main__':

    fits = iofits.open('data/fits/fpC-001729-r3-0083.fit.gz')
    img = fits[0].data

    with open('data/galaxies', 'rb') as f:
        galaxies = pickle.load(f)

    matrix3 = np.load('data/matrix3.npy').astype(np.float64)

    wcs = WCS(fits[0].header)

    for i in range(len(matrix3)):
        matrix3[i] /= matrix3[i].sum()

    Z = centroid(pdist(matrix3, 'correlation'))
    maxclust = 6
    ct = Z[-(maxclust - 1), 2]  # color threshold: height of the merge that splits the tree into maxclust clusters
    cluster = fcluster(Z, maxclust, criterion='maxclust')
    dendrogram(Z, color_threshold=ct)
    plt.show()

    for cls in range(1, cluster.max() + 1):
        idxs = np.where(cluster == cls)[0]
        for idx in idxs:
            galaxy = galaxies[idx]
            if len(galaxy) == 0:
                continue

            x, y = calc_coord_ave(galaxy)
Example #14
kmeans = KMeans(n_clusters=17).fit(df)
centroids = kmeans.cluster_centers_
print(centroids)

plt.scatter(df['x'], df['y'], c=kmeans.labels_.astype(float), s=50, alpha=0.5)
plt.scatter(centroids[:, 0], centroids[:, 1], c='red', s=50)
plt.show()

# =============================================================================
# BUT ARE 4 GROUPS REALLY ENOUGH?
# DENDROGRAM!!!
# =============================================================================
#https://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html

centr = centroid(df)
labelList = list(range(len(df)))  # one label per observation in df

plt.figure(figsize=(10, 7))
dendrogram(centr,
           orientation='top',
           labels=labelList,
           distance_sort='descending',
           show_leaf_counts=True)
plt.title('Dendrogram using centroid method')
plt.show()

# =============================================================================
# WARNING! There are more clustering techniques
# =============================================================================
Example #15
for i in range(len(numpyA)):
    numpyA[i][1] += 0.1
for i in range(2):
    t1 = time()
    if i == 0:
        y = pdist(numpyA, 'cosine')
        a = max(y)
        b = min(y)
        y = squareform(y)
        # rescale the cosine distances to the [b, 1 + b] range
        for k in range(len(y)):
            for j in range(len(y[0])):
                y[k][j] = ((y[k][j] - b) / (a - b)) + b
    elif i == 1:
        y = pdist(numpyA, 'euclidean')
        y = squareform(y)
    # note: h.centroid treats a 2-D array as raw observations, so the
    # squareform distance matrix is clustered as feature vectors here
    z = h.centroid(y)
    if i == 0:
        print("cosine, hierarchy clustering time = " + str(time() - t1))
    elif i == 1:
        print("euclidean, hierarchy clustering time = " + str(time() - t1))

    if i == 0:
        np.savetxt("/Users/mengqizhou/Desktop/datamining/programing3/data/hierarchy_label/cosine_linkage.csv",
                   z, '%5.2f', delimiter=",")
    elif i == 1:
        np.savetxt("/Users/mengqizhou/Desktop/datamining/programing3/data/hierarchy_label/euclidean_linkage.csv",
                   z, '%5.2f', delimiter=",")
    j = 4
    while j < 129:
        result = h.fcluster(z, z[len(z) - j][2], 'distance')
        if i == 0:
            np.savetxt("/Users/mengqizhou/Desktop/datamining/programing3/data/hierarchy_label/cosine_" + str(j) + ".csv",
                       result, '%i', delimiter=",")
        elif i == 1:
            np.savetxt("/Users/mengqizhou/Desktop/datamining/programing3/data/hierarchy_label/euclidean_" + str(j) + ".csv",
                       result, '%i', delimiter=",")
        j *= 2
Example #16
# There are many hierarchical clustering algorithms. The results can be plotted with matplotlib.
import matplotlib.pyplot as plt

wine_complete = hierarchy.complete(wine)
fig = plt.figure()
dn = hierarchy.dendrogram(wine_complete)
plt.show()

wine_single = hierarchy.single(wine)
fig = plt.figure()
dn = hierarchy.dendrogram(wine_single)
plt.show()

wine_average = hierarchy.average(wine)
fig = plt.figure()
dn = hierarchy.dendrogram(wine_average)
plt.show()

wine_centroid = hierarchy.centroid(wine)
fig = plt.figure()
dn = hierarchy.dendrogram(wine_centroid)
plt.show()

wine_complete = hierarchy.complete(wine)
fig = plt.figure()
dn = hierarchy.dendrogram(
    wine_complete,
    # default MATLAB-style threshold: 0.7 * the maximum merge height
    color_threshold=0.7 * max(wine_complete[:, 2]),
    above_threshold_color='y')
plt.show()