from scipy.cluster.hierarchy import cut_tree
from sklearn.metrics import adjusted_mutual_info_score, silhouette_score


def get_constant_height_labels(clustering, n_clusters=None):
    """
    Use silhouette analysis to select the best height at which to cut a linkage matrix.

    :param clustering: dict with 'linkage', 'distance_df', 'reorder_vec' and 'labels' entries
    :param n_clusters: int, optional. If given, cut the tree at this number of
                       clusters instead of searching for the silhouette-best cut.
    """
    N_variables = len(clustering['reorder_vec'])
    scores = []
    if n_clusters is None:
        for k_clusters in range(2,N_variables//3):
            labels = cut_tree(clustering['linkage'], n_clusters=k_clusters)
            try:
                score = silhouette_score(clustering['distance_df'], 
                                         labels.ravel(), metric='precomputed')
            except ValueError:
                continue
            scores.append((k_clusters,score))
        best_k = max(scores, key=lambda x: x[1])[0]
        labels = cut_tree(clustering['linkage'], n_clusters=best_k)

    else:
        labels = cut_tree(clustering['linkage'], n_clusters=n_clusters)
        score = silhouette_score(clustering['distance_df'],
                                 labels.ravel(), metric='precomputed')
        scores.append((n_clusters, score))
    labels = reorder_labels(labels.flatten(), clustering['linkage'])
    # comparison
    MI = adjusted_mutual_info_score(labels, clustering['labels'])
    return labels, scores, MI
Example #3
    def compute_vi(self, gtmap, num_parts=0):
        """Return variation of information based on provided ground truth maps.

        Args:
            gtmap (dict): body/neuronid => label or type
            num_parts (int): optional specification of number of partitions to use (default=0, find optimal)
        Returns:
            (float64, float64, dataframe, numparts): merge vi, split vi, bodyids and score, and num partitions
        """

        # find optimal match unless one is specified
        from scipy.cluster.hierarchy import cut_tree
       
        partitions = None
        if num_parts > 0:
            partitions = cut_tree(self.cluster, n_clusters=num_parts)
            merge, split, bodyrank = _vi_wrapper(list(partitions[:, 0]), self.labels, gtmap)
            return merge, split, bodyrank, num_parts

        # iterate through all partitionings and find best match
        bestmatch = 99999999999999
        partitions = cut_tree(self.cluster)
        bestres = None
        for colid in range(0, len(partitions[0,:])):
            merge, split, bodyrank = _vi_wrapper(list(partitions[:,colid]), self.labels, gtmap) 
            if (merge + split) < bestmatch:
                bestmatch = (merge+split)
                bestres = (merge, split, bodyrank, len(gtmap)-colid)

        return bestres
def applyHierarchiqueClusteringFromDataset(metadataDataset, distanceMatrixComplete, parameter=5,
                                           typeOfHierarchical=HIERARCHICAL_FIXED_NUMBER_OF_CLUSERS,
                                           method=HIERARCHICAL_COMPLETE_LINKAGE):
    # method options: single; average; complete; weighted; centroid; median; ward
    clusteringResults=[dict(obj) for obj in metadataDataset]
    innerMatrix,mapRowsID,mapColumnsID=getInnerMatrix(distanceMatrixComplete)
    
#     for i in range (len(innerMatrix)) :
#         innerMatrix[i][i]=float(0)
#     
    for index, row in enumerate(innerMatrix):
        for column, val in enumerate(row):
            if not innerMatrix[column][index] == innerMatrix[index][column]:
                if math.isnan(innerMatrix[index][column]):
                    innerMatrix[index][column] = 1.
                # print(innerMatrix[index][column], '-', innerMatrix[column][index])  # rounding error
                else:
                    innerMatrix[index][column] = innerMatrix[column][index]  # rounding error
                    
    distArray = ssd.squareform(innerMatrix)
    
        
    linkageMatrix=linkage(distArray, method)
    
    if (typeOfHierarchical==HIERARCHICAL_FIXED_NUMBER_OF_CLUSERS):
        cutree = hierarchy.cut_tree(linkageMatrix, n_clusters=[parameter, parameter])
    elif (typeOfHierarchical==HIERARCHICAL_SIMPLE):
        cutree = hierarchy.cut_tree(linkageMatrix, height =[parameter, parameter])
    cuttreeclusters = [(k,v[0]) for k,v in enumerate(cutree.tolist())]
    clusters={}
    for value in iter(cuttreeclusters):
        clusteringResults[value[0]]['CLUSTER'] = str(value[1])
        if str(value[1]) not in clusters:
            clusters[str(value[1])] = 0
        clusters[str(value[1])] += 1
    
    
    return clusteringResults,clusters,linkageMatrix
def main():
    digits_embedding = genfromtxt('digits-embedding.csv', delimiter=',')

    np.random.seed(0)
    data = []
    for i in range(10):
        class_i_digits = digits_embedding[digits_embedding[:, 1] == i]
        digits = np.random.randint(0, len(class_i_digits), size=10)
        for digit in digits:
            data.append(class_i_digits[digit])
    data = np.array(data)

    # plt.scatter(data[:,2],data[:,3], c=data[:,1])
    # plt.show()
    '''
    plot dendrograms
    '''
    methods = ['single', 'complete', 'average']
    features = data[:, 2:4]
    k_list = [2, 4, 8, 16, 32]
    # print (features[:20])
    for method in methods:
        Z = linkage(features, method=method)

        max_d = 500
        plot_dendogram(Z, max_d, 10, 'dendogram_' + method)
        wc_ssd_values = []
        sc_values = []

        for k in k_list:

            cluster_indices = cut_tree(Z, k)
            features_labels = np.column_stack((features, cluster_indices))
            centroids = {}
            for cluster_id in range(k):
                cluster_members = features[features_labels[:, 2] == cluster_id]
                centroid = np.average(cluster_members, axis=0)
                centroids[cluster_id] = centroid
            wc_ssd_values.append(
                get_wc_ssd(centroids, features, features_labels[:, 2]))
            sc_values.append(get_SC(features, features_labels[:, 2]))
        # print ("Method", method, "WC-SSD", wc_ssd_values)
        # print ("Method", method, "SC", sc_values)
        plot_graph(k_list, wc_ssd_values, 'k (number of clusters)', 'WC-SSD',
                   ['Sub Sample 100 images, method ' + method],
                   'hierarchical_learning_curve_wc_ssd_' + str(method))
        plot_graph(k_list, sc_values, 'k (number of clusters)', 'SC',
                   ['Sub Sample 100 images, method ' + method],
                   'hierarchical_learning_curve_sc_' + str(method))

    print("We chose K=8 for all 3 methods single, compelete, average")

    for method in methods:
        Z = linkage(features, method=method)
        k = 8
        cluster_indices = cut_tree(Z, k)
        features_labels = np.column_stack((features, cluster_indices))
        nmi = get_NMI(features, features_labels[:, 2], data[:, 1])
        print("For method", method, "NMI:", nmi)
Example #6
 def get_labels(self, what=None, n_clusters=2):
     if what == "row":
         labels = hierarchy.cut_tree(self.row_linkage, n_clusters)
     elif what == "col":
         labels = hierarchy.cut_tree(self.col_linkage, n_clusters)
     else:
         raise ValueError('what must be "row" or "col"')
     return labels
Example #7
def divide_computation(Sigma, max_block):
    """
    Approximates a correlation matrix Sigma as a block-diagonal matrix
    using hierarchical clustering. Roughly follows the R knockoff package.

    Parameters
    ----------
    Sigma : np.ndarray
        ``(p, p)``-shaped covariance matrix of X
    max_block : int
        Maximum size of a block in the block-diagonal approximation.

    Returns
    -------
    blocks : np.ndarray
        ``(p, )``-shaped numpy array where ``blocks[i] == j`` indicates
        that variable ``i`` belongs to block ``j``.
    """

    # Correlation tree. We add noise to deal with highly structured Sigma.
    p = Sigma.shape[0]
    noise = np.random.randn(p, p) * 1e-6
    noise += noise.T
    Sigma = Sigma + noise
    link = dgp.create_correlation_tree(Sigma)

    # Set up binary search
    max_clusters = p
    min_clusters = 1
    prev_max_clusters = p
    prev_min_clusters = 1

    # Binary search to create clusters
    for j in range(100):
        # Create new groups and check maximum size
        n_clusters = int((max_clusters + min_clusters) / 2)
        groups = hierarchy.cut_tree(link, n_clusters).reshape(-1) + 1
        current_max_block = utilities.calc_group_sizes(groups).max()

        # Cache search info and check maximum size
        prev_max_clusters = max_clusters
        prev_min_clusters = min_clusters
        if current_max_block > max_block:
            min_clusters = n_clusters
        else:
            max_clusters = n_clusters
        # Break if nothing has changed between iterations
        if min_clusters == prev_min_clusters and max_clusters == prev_max_clusters:
            if current_max_block > max_block:
                groups = hierarchy.cut_tree(link,
                                            n_clusters + 1).reshape(-1) + 1
            break

    return merge_groups(groups, max_block)
Example #8
def part2(computedTFIDF, showDendograms=False):
  startTime = time.time()
  runningTotalTime=0

  print("Executing code for Part 2...\n")

  print("Creating and cutting single link clusters...")
  singleCluster = single(computedTFIDF.similarityMatrix)
  singleClusterCut = cut_tree(singleCluster, n_clusters=[i for i in range(0, computedTFIDF.docCount-1)])
  singleClusterTime = round(time.time() - startTime, 3)
  runningTotalTime+=singleClusterTime
  print("Time: " + str(singleClusterTime) + " seconds")

  print("Creating list of single link clusters each document is contained in...")
  finalSingleClustering = singleClusterCut[len(singleClusterCut)-1]
  documentClusters=createDocumentCluster(finalSingleClustering, computedTFIDF)
  singleTrackingTime = round(time.time() - startTime - runningTotalTime, 3)
  runningTotalTime+=singleTrackingTime
  print("Time: " + str(singleTrackingTime) + " seconds")

  print("Writing single link clusters to file...")
  writeToFile(documentClusters, 'single.txt')
  singleWritingTime = round(time.time() - startTime - runningTotalTime, 3)
  runningTotalTime+=singleWritingTime
  print("Time: " + str(singleWritingTime) + " seconds")

  print("Creating and cutting complete link clusters...")
  completeCluster = complete(computedTFIDF.similarityMatrix)
  completeClusterCut = cut_tree(completeCluster, n_clusters=[i for i in range(0, computedTFIDF.docCount-1)])
  completeClusterTime = round(time.time() - startTime - runningTotalTime, 3)
  runningTotalTime+=completeClusterTime
  print("Time: " + str(completeClusterTime) + " seconds")

  print("Creating list of complete link clusters each document is contained in...")
  finalCompleteClustering = completeClusterCut[len(completeClusterCut)-1]
  completeDocumentClusters=createDocumentCluster(finalCompleteClustering, computedTFIDF)
  completeTrackingTime = round(time.time() - startTime - runningTotalTime, 3)
  runningTotalTime+=completeTrackingTime
  print("Time: " + str(completeTrackingTime) + " seconds")

  print("Writing complete link clusters to file...")
  writeToFile(completeDocumentClusters, 'complete.txt')
  completeWritingTime = round(time.time() - startTime - runningTotalTime, 3)
  runningTotalTime+=completeWritingTime
  print("Time: " + str(completeWritingTime) + " seconds")

  if showDendograms:
    displayDendogram(singleCluster, 'Single')
    displayDendogram(completeCluster, 'Complete')

  print('\nPart 2 Complete')
  print("Execution Time: " + str(round(time.time() - startTime, 3)) + " seconds\n")
  
  return documentClusters, completeDocumentClusters
Example #9
 def fit(self, model, X, verbose):
     if verbose:
         print('Fitting variable:' + str(self.original_name))
     pdp, names = self._get_partial_dependence(model, X)
     self.pdp = pdp
     self.axes = names
     if pdp.ndim == 1:
         arr = np.reshape(pdp, (len(pdp), 1))
     else:
         arr = pdp
     self.Z = ward(arr)
     if pdp.shape[0] == 3:
         self.clusters = cut_tree(self.Z,
                                  height=self.Z[0, 2] -
                                  sys.float_info.epsilon)
         self.new_names = []
         for cluster in range(len(np.unique(self.clusters))):
             names = []
             for idx, c_val in enumerate(self.clusters):
                 if c_val == cluster:
                     if idx == 0:
                         names.append('base')
                     else:
                         names.append(
                             self.dummy_names[idx -
                                              1][len(self.original_name) +
                                                 1:])
             self.new_names.append(self.original_name + '_' +
                                   "_".join(names))
     elif pdp.shape[0] > 3:
         kneed = KneeLocator(range(self.Z.shape[0]),
                             self.Z[:, 2],
                             direction='increasing',
                             curve='convex')
         if kneed.knee is not None:
             self.clusters = cut_tree(self.Z,
                                      height=self.Z[kneed.knee + 1, 2] -
                                      sys.float_info.epsilon)
             self.new_names = []
             for cluster in range(len(np.unique(self.clusters))):
                 names = []
                 for idx, c_val in enumerate(self.clusters):
                     if c_val == cluster:
                         if idx == 0:
                             names.append('base')
                         else:
                             names.append(self.dummy_names[idx - 1]
                                          [len(self.original_name) + 1:])
                 self.new_names.append(self.original_name + '_' +
                                       "_".join(names))
     return self
Example #10
def fms_compare(XX, YY, npoints, plot_title, plot_save):
    #Clustering
    ZXc = hierarchy.linkage(XX, method=clustering_method)
    ZYc = hierarchy.linkage(YY, method=clustering_method)

    #Cut dendrogram to obtain labelling for each k value
    #Warning: using hierarchy.cut_tree, but this function has a known bug!
    fms_dict = {}
    mean_dict = {}
    mean_dict[npoints]=0
    varbound_dict = {}
    varbound_dict[npoints]=0
    for i in range(1,npoints+1):
        ZXc_cut = [l for sublist in hierarchy.cut_tree(ZXc, i) for l in sublist]
        ZYc_cut = [l for sublist in hierarchy.cut_tree(ZYc, i) for l in sublist] 

        #Compute FM scores
        score = fms(ZXc_cut, ZYc_cut)
        fms_dict[i] = score

        #Compute moments for plotting and analysis
        c = contingency_matrix(ZXc_cut, ZYc_cut, sparse=True)
        tk = np.dot(c.data, c.data) - npoints
        pk = np.sum(np.asarray(c.sum(axis=0)).ravel() ** 2) - npoints
        qk = np.sum(np.asarray(c.sum(axis=1)).ravel() ** 2) - npoints
        pk2 = np.sum(np.asarray(c.sum(axis=0)).ravel() ** 3) - 3*(np.sum(np.asarray(c.sum(axis=0)).ravel() ** 2)) + 2*(np.sum(np.asarray(c.sum(axis=0)).ravel())) 
        qk2 = np.sum(np.asarray(c.sum(axis=1)).ravel() ** 3) - 3*(np.sum(np.asarray(c.sum(axis=1)).ravel() ** 2)) + 2*(np.sum(np.asarray(c.sum(axis=1)).ravel()))  
        if i < npoints:
            mean = (np.sqrt(pk*qk)) / (npoints*(npoints-1))
            mean_dict[i] = mean
            variance = (
                (2 / (npoints * (npoints - 1)))
                + ((4 * pk2 * qk2) / (npoints * (npoints - 1) * (npoints - 2) * pk * qk))
                + (((pk - 2 - ((4 * pk2) / pk)) * (qk - 2 - ((4 * qk2) / qk)))
                   / (npoints * (npoints - 1) * (npoints - 2) * (npoints - 3)))
                - ((pk * qk) / ((npoints ** 2) * ((npoints - 1) ** 2)))
            )
            varbound_dict[i] = 2* (variance**0.5)

    #Plot Bk and variance bounds
    lists = sorted(fms_dict.items())
    x, z = zip(*lists)
    upper = [mean_dict[i]+varbound_dict[i] for i in x]
    lower = [mean_dict[i]-varbound_dict[i] for i in x]
    means = [mean_dict[i] for i in x]

    #plt.plot(x,z)
    plt.scatter(x,z)
    plt.plot(x,upper)
    plt.plot(x, means)
    plt.plot(x,lower)
    plt.title(plot_title)
    plt.xlabel('# clusters')
    plt.ylabel('B_k')
    plt.savefig(path_fm_plot+ plot_save+'.jpg')
    plt.clf()
Example #11
def gapStatistic(original, data, Z, nref=3, maxClusters=15):
    gaps = np.zeros((len(range(1, maxClusters)), ))
    # gapdf = pd.DataFrame({'clusterCount':[], 'gap':[]})
    wkbs = np.zeros(len(range(1, maxClusters)))

    # minMaxArr = bounding_box(data)
    for gap_index, k in enumerate(range(1, maxClusters)):
        print('\ncalculating reference disp. K = ', k)

        # For n references, generate random sample
        # and get clustering result at each level k
        # Holder for reference dispersion results
        bWkbs = np.zeros(nref)
        for i in range(nref):
            # print('i', i)
            # Create new random reference set
            # randomReference = np.dot(np.random.random_sample(size=data.shape), maxx)
            # randomReference = generateRandomVecs(data.shape, minMaxArr)
            randomReference = generateRandomVecsBySampling(
                original, data.shape)

            # cluster to it
            refZ = hierarchy.linkage(randomReference, 'average')
            refClusterLabels = hierarchy.cut_tree(refZ, k)
            refCenters, refClusters = clusterCenters(randomReference,
                                                     refClusterLabels, k)
            bWkbs[i] = np.log(Wk(refCenters, refClusters))

        # average Wk for the reference sample groups
        wkbs[gap_index] = sum(bWkbs) / nref

        # cluster to original data and create dispersion
        print('\ncalculating original disp.')
        clusterLabels = hierarchy.cut_tree(Z, k)
        # print('clusterLabels', np.unique(clusterLabels))
        centers, clusters = clusterCenters(data, clusterLabels, k)

        # Calculate gap statistic
        gap = wkbs[gap_index] - np.log(Wk(centers, clusters))

        # Assign this loop's gap statistic to gaps
        gaps[gap_index] = gap

        # gapdf = gapdf.append({'clusterCount':k, 'gap':gap}, ignore_index=True)

    optK = gaps.argmax() + 1

    return optK
Example #12
def worker_cut_tree(nClu, Z=None):
    print(nClu, end=' ')
    if nClu <= 1:
        res = None
    else:
        res = sphier.cut_tree(Z, n_clusters=nClu)
    return res
Example #13
def filter_points(gdf, min_dist=0, remove="first"):
    """Filter points in geodataframe using a minimum distance buffer.

    Args
    ----
    gdf : Geopandas GeoDataFrame
        Containing point geometries.

    min_dist : int or float, optional (default=0)
        Minimum distance by which to filter out closely spaced points.

    remove : str, optional (default='first')
        Optionally choose to remove 'first' occurrences or 'last'
        occurrences.

    Returns
    -------
    gdf : Geopandas GeoDataFrame
        GeoDataFrame with closely spaced points removed.
    """
    xy = gdf.geometry.bounds.iloc[:, 0:2]

    Z = linkage(xy, "complete")
    tree_thres = cut_tree(Z, height=min_dist)
    gdf["tree_thres"] = tree_thres

    if remove == "first":
        gdf = gdf.groupby(by="tree_thres").first()

    elif remove == "last":
        gdf = gdf.groupby(by="tree_thres").last()

    return gdf
Example #14
def doHierachicalClustering(matrixAudioDataTransformed, threshold=0.995):
    global distanceMatrix

    from scipy.cluster import hierarchy as h
    from scipy.spatial import distance as dist

    distanceFunction = 'cosine'  #canberra, cityblock, braycurtis, euclidean
    linkageType = 'average'  #single, complete, weighted, average

    print("Distance function:", distanceFunction)
    print("Linkage type:", linkageType)

    tic = time.perf_counter()

    distanceMatrix = dist.pdist(matrixAudioDataTransformed, distanceFunction)
    clusters = h.linkage(distanceMatrix, linkageType)
    c, d = h.cophenet(clusters, distanceMatrix)  # cophenetic correlation coefficient

    toc = time.perf_counter()

    print("Cophenet factor:", c)
    print("time:", toc - tic)

    # THRESHOLD = 0.995
    #THRESHOLD = 0.92
    cutTree = h.cut_tree(clusters, height=threshold)

    return cutTree
def process_hierarchy(inf, h, method):
    df = pd.read_csv(inf, header=0, index_col=0)
    df = df.fillna(0)
    strains = df.index
    df = 1 - (df / 100)
    df_v = ssd.squareform(
        df, force='tovector',
        checks=False)  # flatten matrix to condensed distance vector
    if method == 'single':
        li = sch.single(df_v)
    elif method == 'complete':
        li = sch.complete(df_v)
    elif method == 'average':
        li = sch.average(df_v)
    elif method == 'weighted':
        li = sch.weighted(df_v)
    else:
        print('\nERROR: Please enter a valid clustering method\n')
        sys.exit()
    hclus = cut_tree(
        li, height=h
    )  # using the height (percent ID as decimal, for example), cluster OFUs from dendrogram
    hclus = pd.DataFrame(hclus, index=strains)
    hclus.iloc[:, 0] += 1  # cut_tree defaults to the first 'cluster' being named "0"; this just bumps all IDs +1
    return hclus
Example #16
def aggregate_panels(panels: List[PanelProfile]) -> List[PanelGroupProfile]:
    """
    The function takes a list of PanelProfile objects and groups them into a list of PanelGroupProfile objects.
    :param panels: a list of PanelProfile objects
    :return: a list of PanelGroupProfile objects
    """
    vertices = list()
    for p in range(len(panels)):
        panel = panels[p]
        points = panel.points_rc
        x, y = points[0][1], points[0][0]
        w, h = points[1][1] - x, points[1][0] - y
        vertices.extend([[i, j, p] for i in range(x, x + w, 5)
                         for j in range(y, y + h, 5)])
        vertices.extend([[x, y + h, p], [x + w, y, p], [x + w, y + h, p]])
    linkage_matrix = linkage(np.array(vertices)[:, :2],
                             method="single",
                             metric="chebyshev")
    ctree = cut_tree(linkage_matrix, height=[10])
    cluster = [x[0] for x in ctree]

    panel_groups = list()
    for group in range(max(cluster) + 1):
        vertices_group = [
            vertices[i] for i in range(len(vertices)) if cluster[i] == group
        ]
        contour = cv2.convexHull(np.array([x[:2] for x in vertices_group]))
        panel_index = set([x[2] for x in vertices_group])
        panel_group = PanelGroupProfile(contour=contour)
        for i in panel_index:
            panel_group.add_panel(panels[i])
        panel_groups.append(panel_group)
    return panel_groups
    def classbyUPGMA(self,obstimes,trainingtimes,obsnodes):
        self._classorder=["C1"]
        self._classscore["C1"] = score()

        for u, v in itertools.combinations(obsnodes.keys(), 2):  # to predict new pairs of nodes and initialize the classes
            link = frozenset([u, v])
            self._classUnion.addPair(link)
            if link not in obstimes:
                self._classscore["C1"].addPair(link)
        learningperiods=dict()
        for link in obstimes:
            if link in trainingtimes:
                learningperiods[link] = obstimes[link]+trainingtimes[link]
            else:
                learningperiods[link] = obstimes[link]
        Y,self._label=classes.Makedistmatx(learningperiods,self._VandPparameter,0,0)
        self._linkage=hierarchy.average(Y)
        cutree = hierarchy.cut_tree(self._linkage,self._nbcluster)


        for i in range(self._nbcluster):
            self._classscore["C"+str(i+2)] = score()
            self._classorder.append("C"+str(i+2))

        for i in range(len(self._label)):
            u=self._label[i]
            self._classscore["C"+str(cutree[i][0]+2)].addPair(u)
Example #18
def problemTwo(X, cities, method, metric, height, plot):
    # Setting distance_threshold=0 ensures the full tree is computed.
    model = AgglomerativeClustering(linkage="single",
                                    affinity="euclidean",
                                    n_clusters=None,
                                    distance_threshold=0)
    model = model.fit(X)

    plt.rcParams['font.size'] = 12

    plt.figure(figsize=[8, 6])
    plt.title('Hierarchical Clustering Dendrogram (%s)' % method)
    # Mark the first few levels of the dendrogram
    Z = linkage(X, method=method, metric=metric)
    dendrogram(Z, labels=cities, orientation="right")

    # plot_dendrogram(model, cities)
    plt.xlabel("Height")

    if plot:
        plt.show()
        sns.clustermap(X, method=method, metric=metric, figsize=[8, 6])
        plt.show()

    label = cut_tree(Z, height=height)
    label = label.reshape(label.size, )

    PCA(X=X, label=label, cities=cities, method=method, height=height)
Example #19
def createCut(a, w, r, s, n):
    name = ','.join(map(str, [a, w, r, s, n]))

    with open(codebasesPath + codebaseName +
              "/analyser/similarityMatrix.json") as f:
        similarityMatrix = json.load(f)

    matrix = similarityMatrix["matrix"]
    for i in range(len(matrix)):
        for j in range(len(matrix)):
            matrix[i][j] = matrix[i][j][0] * a / 100 + \
                            matrix[i][j][1] * w / 100 + \
                            matrix[i][j][2] * r / 100 + \
                            matrix[i][j][3] * s / 100

    matrix = np.array(matrix)

    hierarc = hierarchy.linkage(y=matrix, method=linkageType)

    cut = hierarchy.cut_tree(hierarc, n_clusters=n)

    clusters = {}
    for i in range(len(cut)):
        if str(cut[i][0]) in clusters.keys():
            clusters[str(cut[i][0])] += [entities[i]]
        else:
            clusters[str(cut[i][0])] = [entities[i]]

    clustersJSON = {}
    clustersJSON["clusters"] = clusters

    with open(
            codebasesPath + codebaseName + "/analyser/cuts/" + name + ".json",
            'w') as outfile:
        outfile.write(json.dumps(clustersJSON, indent=4))
def cut_dendrogram(hierarc, linkage_type, height):
    # EMPTY DICTIONARIES FOR REPEATED CUTS
    dict_class_cluster.clear()
    dict_cluster_controller_access.clear()
    dict_ctrl_class_percentage.clear()

    cut = hierarchy.cut_tree(hierarc, height=height)

    list_unique_classes = sorted(list(get_all_controller_classes()))

    for i in range(0, len(list_unique_classes)):
        dict_class_cluster.setdefault(cut[i][0],
                                      []).append(list_unique_classes[i])

    ###################################################
    # Statistics generation
    f = open('statistics_2_' + linkage_type + '.txt', 'w')
    print("-----Clusters from Dendrogram-----", file=f)

    for key, value in dict_class_cluster.items():
        print(key, value, file=f)
    f.close()
    # Calculate percentages
    calculate_cluster_controller_access(linkage_type)
    calculate_controller_percentage_classes(linkage_type)

    return dict_cluster_controller_access
    def extract_cell_name_from_clusters(self, n_clusters=5):
        Z = self.Z

        labels = list(self.df.columns)

        clusters = cut_tree(Z, n_clusters=n_clusters)  # shape (n_cells, 1)

        grouped = {}
        for i, cluster in enumerate(clusters):
            if str(cluster[0]) not in grouped:
                grouped[str(cluster[0])] = []

            grouped[str(cluster[0])].append(labels[i])

        grouped_inv = {}
        for (k, v) in grouped.items():
            for cell_name in v:
                grouped_inv[cell_name] = k

        df = pd.DataFrame(data=map(
            lambda cell_cluster: [cell_cluster[0], cell_cluster[1]],
            grouped_inv.items(),
        ))
        df.columns = ["cell_name", "cluster"]
        return df
Example #22
def clustering(pt, kmin,kmax,distance,linkagem):
    """
    Description:
        Computes the clustering for the pathways matrix and returns a dataframe with the groups with k clusters from kmin to kmax.

    Inputs:
        pt      : Pathway scores.
        kmin    : Min number of groups in the clustering.
        kmax    : Max number of groups in the clustering.
        distance: Similarity metric for the clustering. Default is binomial distance but any distance from scipy.spatial.distance can be used.
        linkagem: Linkage method. Default is complete.

    Outputs:
        df      : Cluster assignment dataframe.
    """
    tinit = time.time()
    if(distance=="binomial"):
        Y = pdist(pt.transpose(),binomial_dist) # Computes the distance matrix. pt is transposed because the pdist function takes rows as input and what we want to cluster are the samples.
    else:
        Y = pdist(pt.transpose(),distance)
    Z = linkage(Y,linkagem) #Linkage of the clusters using the distance matrix and the complete method.
    np.savetxt("dist_matrix.csv",squareform(Y),delimiter=",")#Saves the distance matrix. Note that the output of pdist is a condensed matrix!!

    #Building the output dataframe.
    df = pd.DataFrame()
    for k in range(kmin,kmax+1):
        R = cut_tree(Z,k) #Cuts the tree at k groups.
        u = [item for sublist in R for item in sublist]
        df["X"+str(k)] =u
        df.index = pt.columns
    tfin = time.time()
    print("Clustering runtime: ",tfin-tinit)
    return df.transpose()
Example #23
def generate_groups(linkage_matrix: np.array, sample_ids: list or np.array,
                    n_groups: int):
    """
    Given the output of SimilarityMatrix (that is the linkage matrix and ordered list of sample
    IDs) and a desired number of groups, return a Pandas DataFrame of sample IDs and assigned group ID, generated by
    cutting the linkage matrix in such a way that the desired number of groups are generated.
    Parameters
    ----------
    linkage_matrix: np.array
        Linkage matrix generated from EvaluateBatchEffects.similarity_matrix (using SciPy.cluster.hierarchy.linkage)
    sample_ids: list or np.array
        Ordered list of sample IDs generated from EvaluateBatchEffects.similarity_matrix
    n_groups: int
        Desired number of groups
    Returns
    -------
    Pandas.DataFrame
    """
    groups = pd.DataFrame({
        'sample_id':
        sample_ids,
        'group':
        list(
            map(lambda x: x + 1,
                hierarchy.cut_tree(linkage_matrix, n_groups).flatten()))
    })
    groups = groups.sort_values('group')
    return groups
Example #24
 def get_optimal_clustering(self):
     X = range(20, 40)
     c = cut_tree(self.Z, height=X)
     Y = np.apply_along_axis(self.unique_ele_count, axis=0, arr=c)
     optimal_cluster_count = Counter(Y).most_common(1)[0][0]
     idx = max(np.where(Y == optimal_cluster_count)[0])
     return c[:, idx]
Example #25
    def get_partitions(self, num_parts, return_max=False):
        """Returns cluster partitions for specified number of clusters.

        Args:
            num_parts (int): number of cluster partitions
            return_max (boolean): if true return the maximum distance between body ids in one cluster
        Returns:
            (dict, dataframe): {cluster id: [body1, body2,...]}, "bodyid", "cluster id"
            optional (dict, dataframe, float, tuple): includes maximum distance between bodies in a cluster
            and those body ids
        """
        from scipy.cluster.hierarchy import cut_tree
        partitions = cut_tree(self.cluster, n_clusters=num_parts)
        res = {}

        labels = list(partitions[:,0])
        mapping = pd.DataFrame(list(zip(self.labels, labels)), columns=["bodyid", "type"])
        for idx, label in enumerate(labels):
            if label not in res:
                res[label] = []
            res[label].append(self.labels[idx])
        
        if return_max:
            max_dist, max_pair = self._get_max_dist(res)
            return (res, mapping, max_dist, max_pair)
        return (res, mapping)
Example #26
def filter_points(gdf, min_dist=0, remove='first'):
    """
    Filter points in geodataframe using a minimum distance buffer

    Args
    ----
    gdf : Geopandas GeoDataFrame
        Containing point geometries

    min_dist : int or float, optional (default=0)
        Minimum distance by which to filter out closely spaced points

    remove : str, optional (default='first')
        Optionally choose to remove 'first' occurrences or 'last' occurrences

    Returns
    -------
    xy : 2d array-like
        Numpy array filtered coordinates
    """

    from scipy.cluster.hierarchy import dendrogram, linkage, cut_tree

    xy = gdf.geometry.bounds.iloc[:, 0:2]
    Z = linkage(xy, 'complete')
    tree_thres = cut_tree(Z, height=min_dist)
    gdf['tree_thres'] = tree_thres
Example #27
def clustering(pt, kmin, kmax, distance, linkagem):
    """Computes the clustering for the pathways matrix and returns a dataframe with the groups with k clusters from kmin to kmax."""
    tinit = time.time()
    if (distance == "binomial"):
        Y = pdist(
            pt.transpose(), binomial_dist
        )  # Computes the distance matrix. pt is transposed because the pdist function takes rows as input and what we want to cluster are the samples.
    else:
        Y = pdist(pt.transpose(), distance)
    Z = linkage(
        Y, linkagem
    )  #Linkage of the clusters using the distance matrix and the complete method.
    np.savetxt(
        "dist_matrix.csv", squareform(Y), delimiter=","
    )  #Saves the distance matrix. Note that the output of pdist is a condensed matrix!!

    #Building the output dataframe.
    df = pd.DataFrame()
    for k in range(kmin, kmax + 1):
        R = cut_tree(Z, k)  #Cuts the tree at k groups.
        u = [item for sublist in R for item in sublist]
        df["X" + str(k)] = u
        df.index = pt.columns
    tfin = time.time()
    print("Clustering runtime: ", tfin - tinit)
    return df.transpose()
Example #28
    def get_partitions_dist_constraint(self, dist):
        """Returns cluster partitions for the specified distance constraint.

        Args:
            dist (float): maximum distance to allow between bodies in a cluster
        Returns:
            (dict, dataframe, float, tuple): {cluster id: [body1, body2,...]}, "bodyid", "cluster id",
            distance between farthest bodies, farthest bodies in the same cluster
        """
        from scipy.cluster.hierarchy import cut_tree

        previous_result = None
        partitions = cut_tree(self.cluster)
        for colid in range(0, len(partitions[0,:])):
            labels = list(partitions[:,colid])
            mapping = pd.DataFrame(list(zip(self.labels, labels)), columns=["bodyid", "type"])
            res = {}

            for idx, label in enumerate(labels):
                if label not in res:
                    res[label] = []
                res[label].append(self.labels[idx])
        
            max_dist, max_pair = self._get_max_dist(res)
            if max_dist > dist:
                break
            previous_result = (res, mapping, max_dist, max_pair) 

        return previous_result
Example #29
def get_cluster_assignments_hac(linkage_matrix: np.ndarray, height: int):
    """
    Assigns clusters by cutting the HAC dendrogram at the specified height.

    Returns list
    """
    return [x[0] for x in cut_tree(linkage_matrix, height=height)]
Example #30
def cluster_hierarchy(data, method, axis, metric='euclidean', n_clusters=None):
    """
    data :pandas.DataFrame
        Rectangular data
    method :str, 'single', 'centroid', 'median', 'ward'
    axis : int, optional
        Which axis to use to calculate linkage. 0 is rows, 1 is columns.
    metric : "eulidean"
    n_cluster: int, optional
        return the cut tree.
    no_plot: bool, optional
        When True, the final rendering is not performed. This is
        useful if only the data structures computed for the rendering
        are needed or if matplotlib is not available.
    """
    data = data.copy()
    if axis == 1:
        data = data.T
    array = data.values
    _linkage = make_linkage(array, method, metric)

    if n_clusters is not None:
        cut_result = cut_tree(_linkage, n_clusters=n_clusters)
        df_cut = pd.DataFrame(cut_result.flatten())
        label = df_cut.iloc[:, 0].sort_values(ascending=True,
                                              inplace=False).index.values
        return data.index.values[label]
    _result = dendrogram(_linkage, no_plot=True)
    _reordered_index = data.index.values[_result['leaves']]
    return _reordered_index
Example #31
def divide_clusters(global_distmat,
                    original_cluster,
                    max_cluster_size,
                    sub_clusters,
                    min_cluster_count=2):
    local_to_global_map = {
        ind: global_ind
        for ind, global_ind in enumerate(original_cluster)
    }

    local_distmat = global_distmat[original_cluster, :][:, original_cluster]
    local_disttriu = mat2triu(local_distmat)
    local_tree = linkage(local_disttriu, method='complete')
    local_heights = local_tree[:, 2]
    local_cluster_by_cutoff = cut_tree(
        local_tree, height=local_heights[-(min_cluster_count - 1)])
    num_clusters = np.max(local_cluster_by_cutoff) + 1

    for cluster_ind in range(num_clusters):
        sub_cluster = np.where(local_cluster_by_cutoff == cluster_ind)[0]
        sub_cluster_size = len(sub_cluster)

        print('sub_cluster_size = {}'.format(sub_cluster_size))

        sub_cluster_global_ind = np.array(
            [local_to_global_map[x] for x in sub_cluster])

        if sub_cluster_size > max_cluster_size:
            divide_clusters(global_distmat, sub_cluster_global_ind,
                            max_cluster_size, sub_clusters)
        else:
            sub_clusters.append(sub_cluster_global_ind)
Example #32
def make_hier_clusters(df, n):
    '''
    Given dataframe of schools and attributes, use hierarchical clustering to
    find most similar schools.
    '''
    vectors = df.iloc[:, 2:].values
    vectors = scale(vectors)
    dist_matrix = squareform(pdist(vectors, metric='cosine'))
    link_matrix = linkage(dist_matrix, method='average')
    tree = cut_tree(link_matrix, n)
    return link_matrix, tree
Example #33
import scipy.spatial.distance as dist
import scipy.cluster.hierarchy as spc


def consensus_clustering(consensus, n_components=5):
    """
    :param consensus: cells x cells consensus matrix
    :param n_components: number of clusters
    :return: cells x 1 labels
    """
    print('SC3 agglomerative hierarchical clustering.')
    # condensed distance matrix
    cdm = dist.pdist(consensus)
    # hierarchical clustering (SC3: complete agglomeration + cutree)
    hclust = spc.complete(cdm)
    cutree = spc.cut_tree(hclust, n_clusters=n_components)
    labels = cutree.reshape(consensus.shape[0])
    # Below is the hclust code for the older version, fyi
    # hclust = spc.linkage(cdm)
    # labels = spc.fcluster(hclust, n_components, criterion='maxclust')
    return labels, dist.squareform(cdm)
Example #34
def test_cut_tree():
    np.random.seed(23)
    nobs = 50
    X = np.random.randn(nobs, 4)
    Z = scipy.cluster.hierarchy.ward(X)
    cutree = cut_tree(Z)

    assert_equal(cutree[:, 0], np.arange(nobs))
    assert_equal(cutree[:, -1], np.zeros(nobs))
    assert_equal(cutree.max(0), np.arange(nobs - 1, -1, -1))

    assert_equal(cutree[:, [-5]], cut_tree(Z, n_clusters=5))
    assert_equal(cutree[:, [-5, -10]], cut_tree(Z, n_clusters=[5, 10]))
    assert_equal(cutree[:, [-10, -5]], cut_tree(Z, n_clusters=[10, 5]))

    nodes = _order_cluster_tree(Z)
    heights = np.array([node.dist for node in nodes])

    assert_equal(cutree[:, np.searchsorted(heights, [5])], cut_tree(Z, height=5))
    assert_equal(cutree[:, np.searchsorted(heights, [5, 10])], cut_tree(Z, height=[5, 10]))
    assert_equal(cutree[:, np.searchsorted(heights, [10, 5])], cut_tree(Z, height=[10, 5]))

# Compute distance metrics on standardized data
# This will likely generate an error on most machines
# d = dist(new_data)

# Take a 10% sample
customers_sample = customers.iloc[::10, :]
new_data_sample  = new_data.iloc[::10, :]

# Compute distance metrics on standardized data
#d = pdist(new_data_sample)
#not needed for ward

# Perform hierarchical clustering on distance metrics
c = linkage(new_data_sample, method='ward')

# Plot the dendrogram
dendrogram(c, get_leaves=True, labels=None)

# Cut at 9 segments
members = pd.DataFrame(cut_tree(c, n_clusters = 9), index=new_data_sample.index, columns=['ClusterNumber'])

# Show 30 first customers, frequency table
members.iloc[0:30]
members.ClusterNumber.value_counts(sort=False)

# Show profile of each segment
customers_sample_new = customers_sample.set_index(customers_sample.customer_id).iloc[:,1:4]
customers_sample_new.groupby(members.ClusterNumber).mean()