def cluster_data(self, row_distance="euclidean", row_linkage="single", axis="row", column_distance="euclidean", column_linkage="ward"):
    """Performs clustering according to the given parameters.

    @datatype - numeric/binary
    @row_distance/column_distance - see the DISTANCES variable
    @row_linkage/column_linkage - see the LINKAGES variable
    @axis - row/both
    """
    print("Clustering rows:", row_distance, row_linkage)
    self.clustering_axis = axis
    row_linkage = str(row_linkage)

    if row_linkage in RAW_LINKAGES:
        self.clustering = fastcluster.linkage(self.data, method=row_linkage, metric=row_distance)
    else:
        self.distance_vector = fastcluster.pdist(self.data, row_distance)

        if self.datatype == "numeric" and row_distance not in DISTANCES[self.datatype]:
            raise Exception("".join(["When clustering numeric data you must choose from these distance measures: ", ", ".join(DISTANCES[self.datatype])]))
        elif (self.datatype == "binary" or self.datatype == "nominal") and row_distance not in DISTANCES[self.datatype]:
            raise Exception("".join(["When clustering binary or nominal data you must choose from these distance measures: ", ", ".join(DISTANCES[self.datatype])]))

        self.clustering = fastcluster.linkage(self.distance_vector, method=str(row_linkage))

    if self.missing_value is not False:
        self.data = self.__return_missing_values__(self.data, self.missing_values_indexes)

    self.column_clustering = []
    if axis == "both" and len(self.data[0]) > 2:
        print("Clustering columns:", column_distance, column_linkage)
        self.__cluster_columns__(column_distance, column_linkage)

    if self.write_original or self.datatype == "nominal":
        self.data = self.original_data
def cluster_data(self, data_type="numeric", row_distance="euclidean", row_linkage="single", axis="row", column_distance="euclidean", column_linkage="ward"):
    """Performs clustering according to the given parameters.

    @data_type - numeric/binary
    @row_distance/column_distance - see the DISTANCES variable
    @row_linkage/column_linkage - see the LINKAGES variable
    @axis - row/both
    """
    print("Clustering rows:", row_distance, row_linkage)
    self.data_type = data_type
    self.clustering_axis = axis
    row_linkage = str(row_linkage)

    if row_linkage in RAW_LINKAGES:
        self.clustering = fastcluster.linkage(self.data, method=row_linkage, metric=row_distance)
    else:
        self.distance_vector = fastcluster.pdist(self.data, row_distance)

        if data_type in DISTANCES and row_distance not in DISTANCES[data_type]:
            raise Exception("".join(["When clustering ", data_type, " data you must choose from these distance measures: ", ", ".join(DISTANCES[data_type])]))
        elif data_type not in DISTANCES:
            raise Exception("".join(["You can choose only from data types: ", ", ".join(DISTANCES.keys())]))

        self.clustering = fastcluster.linkage(self.distance_vector, method=str(row_linkage))

    self.column_clustering = []
    if axis == "both" and len(self.data[0]) > 2:
        print("Clustering columns:", column_distance, column_linkage)
        self.__cluster_columns__(column_distance, column_linkage)

    if self.write_original:
        self.data = self.original_data
    return
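# The two cluster_data() variants above rely on module-level lookup tables
# (DISTANCES, RAW_LINKAGES, LINKAGES) that are not included in these snippets.
# A minimal sketch of what such tables could look like; the names and values
# below are illustrative placeholders, not the project's actual definitions.
DISTANCES_EXAMPLE = {
    "numeric": ["euclidean", "sqeuclidean", "cityblock", "cosine", "correlation"],
    "binary": ["hamming", "jaccard", "dice", "russellrao"],
    "nominal": ["hamming"],
}
# "Raw" linkages would be the ones handed straight to fastcluster.linkage()
# together with a metric, instead of a precomputed pdist vector.
RAW_LINKAGES_EXAMPLE = ["ward", "centroid", "median"]
LINKAGES_EXAMPLE = ["single", "complete", "average", "weighted"] + RAW_LINKAGES_EXAMPLE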
def hierarchical(self, nclusters, linkage_method, noise=False):
    if noise:
        matrix = self.distance_matrix.add_noise()
    else:
        matrix = self.distance_matrix

    linkmat = linkage(squareform(matrix), linkage_method)
    linkmat_size = len(linkmat)

    if nclusters <= 1:
        br_top = linkmat[linkmat_size - nclusters][2]
    else:
        br_top = linkmat[linkmat_size - nclusters + 1][2]

    if nclusters >= len(linkmat):
        br_bottom = 0
    else:
        br_bottom = linkmat[linkmat_size - nclusters][2]

    threshold = 0.5 * (br_top + br_bottom)
    T = fcluster(linkmat, threshold, criterion='distance')
    return Partition(T)
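# hierarchical() above cuts the tree halfway between the merge that first
# produces nclusters groups and the next merge (which would reduce it to
# nclusters - 1). A self-contained sketch of that idea on toy data, checked
# against fcluster's 'maxclust' criterion (Partition and self.distance_matrix
# belong to the surrounding class and are not used here).
import numpy as np
from scipy.cluster.hierarchy import fcluster, linkage
from scipy.spatial.distance import pdist

rng = np.random.RandomState(0)
points = np.vstack([rng.normal(loc, 0.1, size=(10, 2)) for loc in (0.0, 5.0, 10.0)])
Z = linkage(pdist(points), 'average')

k = 3
heights = Z[:, 2]
# Cutting between the merge that leaves k groups and the following merge
# yields exactly k flat clusters (for k > 1).
threshold = 0.5 * (heights[-k] + heights[-(k - 1)])
by_threshold = fcluster(Z, threshold, criterion='distance')
by_maxclust = fcluster(Z, k, criterion='maxclust')
assert len(set(by_threshold)) == len(set(by_maxclust)) == k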
def saveLinkage(distanceMatrix):
    # link = linkage(distanceMatrix, 'ward')
    link = fastcluster.linkage(distanceMatrix, method='ward')  # D-distance matrix
    afile = open(structfolder + 'wardlinkage.pkl', 'wb')
    pickle.dump(link, afile)
    afile.close()
    return link
def test_basic_clustering(self): data = [ [1.0, 2.0], [2.0, 1.0], [2.1, 1.1], [2, 1.1], [1.0, 2.1], ] data = np.array(data) dist = fastcluster.pdist(data) result = fastcluster.linkage(dist).tolist() assert_that(int(result[0][0])).is_equal_to(0) assert_that(int(result[0][1])).is_equal_to(4) assert_that(result[0][2]).is_close_to(0.1, 0.00001) assert_that(int(result[0][3])).is_equal_to(2) assert_that(int(result[1][0])).is_equal_to(1) assert_that(int(result[1][1])).is_equal_to(3) assert_that(result[1][2]).is_close_to(0.1, 0.00001) assert_that(int(result[1][3])).is_equal_to(2) assert_that(int(result[2][0])).is_equal_to(2) assert_that(int(result[2][1])).is_equal_to(6) assert_that(result[2][2]).is_close_to(0.1, 0.00001) assert_that(int(result[2][3])).is_equal_to(3) assert_that(int(result[3][0])).is_equal_to(5) assert_that(int(result[3][1])).is_equal_to(7) assert_that(result[3][2]).is_close_to(1.34536, 0.00001) assert_that(int(result[3][3])).is_equal_to(5)
def run_entity_model(cdev, cprc): print '____________________________________________________' print 'running entity model' hdev, hprc, hmapping, entcorp, er = process_entities(cdev, cprc) print 'removed', len(cdev)- len(hdev), 'documents', len(hdev), 'left' voc = build_voc(entcorp, 2) ent_vectorizer = CountVectorizer(vocabulary = voc) E = ent_vectorizer.fit_transform(hdev) Eclean, emapping = filter_rare(E, 0) E_dense = np.matrix(Eclean).astype('float') E_scaled = preprocessing.scale(E_dense) E_normalized = preprocessing.normalize(E_scaled, norm='l2') EMatrix = pairwise_distances(E_normalized, metric='cosine') EL = fastcluster.linkage(EMatrix, method='average') flat_eclust = hierarchy.fcluster(EL, 0.5, 'distance') ec = organize_clusters(flat_eclust, th = 3) ecf = [] for cl in ec: ecf.append([hmapping[emapping[t]] for t in cl]) print 'detected', len(ecf), 'entity clusters' return ecf, voc
def run_ngram_model(cdev, cprc): print '____________________________________________________' print 'running n-gram model' wcorp = [] for i in cprc: wcorp.append(' '.join(cprc[i]['words'])) vectorizer = CountVectorizer(analyzer='word', binary=True, min_df=max(int(len(wcorp)*0.0005), 5), ngram_range=(2,3)) X = vectorizer.fit_transform(wcorp) Xclean, mapping = filter_rare(X) Xdense = np.matrix(Xclean).astype('float') X_scaled = preprocessing.scale(Xdense) X_normalized = preprocessing.normalize(X_scaled, norm='l2') textMatrix = pairwise_distances(X_normalized, metric='cosine') L = fastcluster.linkage(textMatrix, method='average') flat_textclust = hierarchy.fcluster(L, 0.5, 'distance') ttc = organize_clusters(flat_textclust) ncf = [] for cl in ttc: ncf.append([mapping[t] for t in cl]) print 'detected', len(ncf), 'n-gram clusters' return ncf
def cluster(self): # We cluster for each argument independently! retval = ClusterResult() curOffset = 0 argNum = 0 for symbolsForArg in self.contentProvider.getSourceAPISymbols(): D = self._calculateDistanceMatrix(symbolsForArg) curOffset = len(retval.clusterIdToDatapoint.keys()) if len(symbolsForArg) == 0: argNum += 1 continue if len(symbolsForArg) == 1: retval.register(curOffset, symbolsForArg[0], argNum) argNum += 1 continue Z = linkage(D, method=self.linkageMethod) clustering = fcluster(Z, self.maxDistInCluster, criterion = 'distance') retval.registerSet(symbolsForArg, clustering, curOffset, argNum) argNum += 1 return retval
def hclust(self): link_file = self.datafile + '.link.npy' if os.path.isfile(link_file) and os.path.getmtime(link_file) >= os.path.getmtime(self.datafile): self.link_matrix = np.load(link_file) else: blast_score = self.normalized.as_matrix() self.link_matrix = fastcluster.linkage(blast_score, method='average', metric='correlation', preserve_input=False) del blast_score np.save(link_file, self.link_matrix) self.gene_num = self.normalized.shape[0] self.node_num = self.gene_num + self.link_matrix.shape[0] self.parent_tree = np.array(np.arange(self.node_num)) self.leaf_num = np.array([1] * self.gene_num + [0] * (self.node_num - self.gene_num)) for i in range(self.link_matrix.shape[0]): assert(self.parent_tree[self.link_matrix[i, 0]] == int(self.link_matrix[i, 0])) assert(self.parent_tree[self.link_matrix[i, 1]] == int(self.link_matrix[i, 1])) assert(self.leaf_num[self.gene_num + i] == 0) self.parent_tree[self.link_matrix[i, 0]] = self.gene_num + i self.parent_tree[self.link_matrix[i, 1]] = self.gene_num + i self.leaf_num[i + self.gene_num] = self.leaf_num[self.link_matrix[i, 0]] + \ self.leaf_num[self.link_matrix[i, 1]]
def training_predict(X, K):
    """
    Get unique masks and cluster indices on the training set.

    Parameters
    ----------
    X : (N, F) ndarray of boolean

    Returns
    -------
    umasks : (UK, F) ndarray of bool
    cluster_ind : (N,) ndarray of int
        Each cluster ind is [0, K'), with K' <= K, or [0, UK)
        if K == -1 or K >= UK.
    """
    umasks = tc.mask_distribution.get_unique_masks(X)
    UK = umasks.shape[0]

    if K < 0 or K >= UK:
        cluster_ind = np.zeros(X.shape[0], dtype=int)
        for i in range(1, UK):
            cluster_ind[(X == umasks[i]).all(1)] = i
    else:
        Z = fastcluster.linkage(X, method="single", metric="hamming")
        cluster_ind = fcluster(Z, K, criterion="maxclust") - 1

    return umasks, cluster_ind
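# A stand-alone sketch of the clustering branch above: grouping boolean mask
# rows into at most K clusters with single linkage on Hamming distance. The
# tc.mask_distribution helpers used above belong to the surrounding project
# and are not reproduced here.
import numpy as np
import fastcluster
from scipy.cluster.hierarchy import fcluster

masks = np.array([[1, 0, 0],
                  [1, 0, 1],
                  [0, 1, 1],
                  [0, 1, 1],
                  [1, 0, 0]], dtype=bool)
K = 2
Z = fastcluster.linkage(masks, method="single", metric="hamming")
cluster_ind = fcluster(Z, K, criterion="maxclust") - 1   # zero-based labels, as above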
def writeClusters(results):
    threshold = 0.9

    results = numpy.fromiter(results, dtype=[('pairs', 'i8', 2), ('score', 'f4', 1,)])
    i_to_id, condensed_distances, N = condensedDistance(results)
    linkages = fastcluster.linkage(condensed_distances, method='ward')
    partition = hcluster.fcluster(linkages, threshold, criterion='inconsistent')

    clusters = {}
    for (i, cluster_id) in enumerate(partition):
        clusters.setdefault(cluster_id, []).append(i_to_id[i])

    i = 0
    for cluster in clusters.values():
        images = []
        for index in cluster:
            image_name = all_images[index]
            image_path = os.path.join(imagedir, image_name)
            cluster_path = 'clustered_images/{0}'.format(str(i))
            # There must be a better way to do this
            try:
                os.mkdir(cluster_path)
            except OSError:
                for f in os.listdir(cluster_path):
                    try:
                        os.remove(os.path.join(cluster_path, f))
                    except OSError:
                        pass
            print('writing %s' % image_name)
            with open(image_path, 'rb') as inp:
                with open(os.path.join('clustered_images', str(i), image_name), 'wb') as outp:
                    outp.write(inp.read())
        i += 1
def test_all():
    D2 = D.copy()
    for method in ['single', 'complete', 'average', 'weighted', 'ward', 'centroid', 'median']:
        Z2 = fc.linkage(D, method)
        if np.any(D2 != D):
            raise AssertionError('Input array was corrupted.')
        test(Z2, method)
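# Related to the corruption check above: fastcluster.linkage() accepts a
# preserve_input flag (several snippets here pass preserve_input=False).
# When it is False the condensed distance array may be overwritten in place
# to save memory, so keep a copy if the distances are still needed.
# A minimal illustration:
import numpy as np
import fastcluster

D_demo = np.random.rand(10 * 9 // 2)            # condensed distances for 10 points
D_backup = D_demo.copy()

fastcluster.linkage(D_demo, method='average', preserve_input=True)
assert np.array_equal(D_demo, D_backup)          # input left untouched

fastcluster.linkage(D_demo, method='average', preserve_input=False)
# D_demo can no longer be relied upon here; use D_backup instead.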
def hist_per_stagione(start=1992, end=2012): stagione=(all_labels > start) & (all_labels < end) dist_selected=dist[ix_(stagione,stagione)] Z=linkage(squareform(dist_selected),method='complete') n=choose_p(Z) c=fcluster(Z,n,criterion='maxclust')-1 label_anni=all_labels[stagione] #order by first appearance! first_appearance=[] for i in range(0,n): first_appearance.append(min(label_anni[c==i])) order1=[index for key,index in sorted(zip(first_appearance,range(0,n)))] order2=[index for key,index in sorted(zip(order1,range(0,n)))] order=array(order2) c=order[c] #draw scatter plot scatter(label_anni,c,s=100,c=c) #grid(b=True,axis='y') yticks(range(0,n+1)) xlim((min(label_anni)-0.5,max(label_anni)+0.5)) ax=gca() for i in range(1993,2011+1): ax.add_line(Line2D([i+7./12,i+7./12],[0,n+1],linestyle='--')) show()
def hierarchical_clust(d, cluster_method='average'):
    if VERBOSE:
        print 'Doing hierarchical clustering using fastcluster!'
    # some might say this function is redundant
    # d should be a distance vector
    Z = fastcluster.linkage(d, method=cluster_method)
    return Z
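# Possible usage of the wrapper above (it expects a condensed distance vector
# and assumes the module-level VERBOSE flag it checks is defined):
import numpy as np
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import fcluster

points = np.random.rand(50, 4)
d = pdist(points)                                   # condensed distance vector
Z = hierarchical_clust(d, cluster_method='average')
labels = fcluster(Z, t=5, criterion='maxclust')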
def clusterAndDendogrgam(Metrix): out = fastcluster.linkage(Metrix, method="single", metric="euclidean", preserve_input=True) plt.plot() dend = augmented_dendrogram( out, p=30, truncate_mode=None, color_threshold=None, get_leaves=True, orientation="top", labels=None, count_sort=False, distance_sort=False, show_leaf_counts=True, no_plot=False, no_labels=False, color_list=None, leaf_font_size=None, leaf_rotation=None, leaf_label_func=None, no_leaves=False, show_contracted=False, link_color_func=None, ) plt.show() return out
def clusterHeatmap(df, title, row_label_map, col_label_map, colormap=my_cmap, cluster_rows=False, cluster_columns=False, cluster_data=None, row_dendrogram=False, column_dendrogram=False, width=30, height=20, vmin=-3, vmax=3, distmethod="correlation", colorbar=True, colorbar_shrink=0.2, label_values=False): cm = pylab.get_cmap(colormap) cm.set_bad("0.9") # do clustering if cluster_data is None: cluster_data = df # cluster the same data that we are plotting matplotlib.rcParams['figure.figsize'] = [width, height] # pylab.figsize(20, 10) pylab.title(title) # pylab.text(0,-5,str(datetime.date.today())) # ylabels = [genesym[geneid] for geneid in pt.axes[0][Z['leaves']]] # xlabels = pt.axes[1][cZ['leaves']] orderedVal = df if cluster_rows: distances = scipy.cluster.hierarchy.distance.pdist(cluster_data.values, distmethod) rowY = fastcluster.linkage(distances) rowZ = scipy.cluster.hierarchy.dendrogram(rowY, orientation='right', no_plot=True) orderedVal = df.reindex(index=df.axes[0][rowZ['leaves']]) if cluster_columns: coldist = scipy.cluster.hierarchy.distance.pdist(df.values.transpose(), distmethod) cY = scipy.cluster.hierarchy.linkage(coldist) cZ = scipy.cluster.hierarchy.dendrogram(cY, no_plot=True) orderedVal = orderedVal.reindex(columns=df.axes[1][cZ['leaves']]) # row labels if row_label_map is not None: pylab.yticks(range(0, len(orderedVal.index)), [row_label_map[i] for i in orderedVal.index]) else: pylab.yticks(range(0, len(orderedVal.index)), orderedVal.index) pylab.xticks(range(0, len(orderedVal.columns)), orderedVal.columns, rotation=90) if col_label_map is not None: pylab.xticks(range(0, len(orderedVal.columns)), [col_label_map[i] for i in orderedVal.columns]) if label_values: cmatrix = orderedVal.as_matrix() for x in range(cmatrix.shape[0]): for y in range(cmatrix.shape[1]): if cmatrix[x, y] >= 0: pylab.text(y, x, "%.1f" % cmatrix[x,y], horizontalalignment='center', verticalalignment='center') #orderedVal = orderedVal[:,] pylab.tick_params(direction="out") pylab.imshow(orderedVal, interpolation="nearest", cmap=cm, aspect='auto', norm=None, vmin=vmin, vmax=vmax) if colorbar: pylab.colorbar(shrink=colorbar_shrink)
def test_fastcluster_other_method(self):
    import fastcluster

    kws = self.default_kws.copy()
    kws["method"] = "average"
    linkage = fastcluster.linkage(self.x_norm.T, method="average", metric="euclidean")

    p = mat._DendrogramPlotter(self.x_norm, **kws)

    npt.assert_array_equal(p.linkage, linkage)
def heatmap_cor( x, vec, minval, maxval ): # Compute and plots heatmap & dendrogram. norm,corr,dist=analyse.all_corr(vec) print 'statrting to cluster...' fig = plt.figure(figsize=(8,8)) ax1 = fig.add_axes([0.09,0.1,0.2,0.6]) z=fastcluster.linkage(dist, method='complete') #z=fastcluster.linkage(norm,metric='euclidean', method='ward') print 'clustering done, drawing the dendogram' Z1 = sch.dendrogram(z, labels=x,orientation='right') del norm del dist plt.yticks(fontsize=8) #ax1.set_yticks([]) ticks = ax1.get_xticks() #/ max(ax1.get_xticks()) ticks=map(float,ticks) ticks = ['%.2f' % (a/2.) for a in ticks] ax1.set_xticklabels(ticks) # Plot distance matrix. axmatrix = fig.add_axes([0.4,0.1,0.5,0.6]) axmatrix.set_xticks([]) axmatrix.set_yticks([]) axmatrix.xaxis.tick_top() axmatrix.set_frame_on(False) idx1 = Z1['leaves'] idx2 = Z1['leaves'] xx=[] for i in idx1: xx.append(x[int(i)]) D = corr[idx1,:] D = D[:,idx2] print 'heatmap' im = axmatrix.pcolor(D, cmap=plt.cm.RdYlBu,edgecolor='k',) plt.xticks(fontsize=5) plt.yticks([]) xx=[] for i in idx1: xx.append(x[int(i)]) #plt.yticks(np.arange(len(x)),xx,fontsize = 12) plt.xticks(np.arange(len(x)),xx) plt.xticks(rotation=90) plt.xticks(fontsize=8) axcolor = fig.add_axes([0.91,0.1,0.02,0.6]) plt.colorbar(im, cax=axcolor) fig.show() return xx
def __cluster_columns__(self, column_distance, column_linkage):
    columns = list(zip(*self.data))
    self.column_clustering = fastcluster.linkage(columns, method=column_linkage, metric=column_distance)
    self.data_order = hcluster.leaves_list(self.column_clustering)
    self.data = self.__reorder_data__(self.data, self.data_order)
    self.original_data = self.__reorder_data__(self.original_data, self.data_order)
    if self.header:
        self.header = self.__reorder_data__([self.header], self.data_order)[0]
    return
def test(): n = np.random.randint(2,100) # Part 1: distance matrix input N = n*(n-1)//2 D = np.random.rand(N) # Insert a single NaN value pos = np.random.randint(N) D[pos] = np.nan for method in ['single', 'complete', 'average', 'weighted', 'ward', 'centroid', 'median']: try: fastcluster.linkage(D, method=method) raise AssertionError('fastcluster did not detect a NaN value!') except FloatingPointError: pass # Next: the original array does not contain a NaN, but a NaN occurs # as an updated distance. for method in ['average', 'weighted', 'ward', 'centroid', 'median']: try: fastcluster.linkage([np.inf,-np.inf,-np.inf], method=method) raise AssertionError('fastcluster did not detect a NaN value!') except FloatingPointError: pass # Part 2: vector input dim = np.random.randint(2,13) X = np.random.rand(n,dim) pos = (np.random.randint(n), np.random.randint(dim)) # Insert a single NaN coordinate X[pos] = np.nan for method in ['single', 'ward', 'centroid', 'median']: try: fastcluster.linkage_vector(X, method=method) raise AssertionError('fastcluster did not detect a NaN value!') except FloatingPointError: pass return True
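# The test above relies on fastcluster raising FloatingPointError when a NaN
# appears in the input or as an updated distance. A small guard sketch for
# callers that prefer to fail early with their own message:
import numpy as np
import fastcluster

X = np.random.rand(20, 3)
X[3, 1] = np.nan                                 # simulate a corrupted observation

if not np.all(np.isfinite(X)):
    raise ValueError("input contains NaN or infinite values; clean it before clustering")
Z = fastcluster.linkage_vector(X, method='ward')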
def __init__(self, metric, trajectories, method='single', precomputed_values=None):
    """Initialize a hierarchical clusterer using the supplied distance metric and method.

    Method should be one of the fastcluster linkage methods, namely 'single',
    'complete', 'average', 'weighted', 'centroid', 'median', or 'ward'.

    Parameters
    ----------
    metric : msmbuilder.metrics.AbstractDistanceMetric
        A metric capable of handling `ptraj`
    trajectories : list of Trajectory
        Data to cluster
    method : {'single', 'complete', 'average', 'weighted', 'centroid', 'median', 'ward'}
    precomputed_values :
        Used internally to implement load_from_disk()

    Notes
    -----
    This is implemented with the fastcluster library, which can be downloaded
    from CRAN http://cran.r-project.org/web/packages/fastcluster/
    """
    if precomputed_values is not None:
        precomputed_z_matrix, traj_lengths = precomputed_values
        if isinstance(precomputed_z_matrix, np.ndarray) and precomputed_z_matrix.shape[1] == 4:
            self.Z = precomputed_z_matrix
            self.traj_lengths = traj_lengths
            return
        else:
            raise Exception('Something is wrong')

    if not isinstance(metric, metrics.AbstractDistanceMetric):
        raise TypeError('%s is not an abstract distance metric' % metric)
    if method not in self.allowable_methods:
        raise ValueError("%s not in %s" % (method, str(self.allowable_methods)))
    if isinstance(trajectories, md.Trajectory):
        trajectories = [trajectories]
    elif isinstance(trajectories, types.GeneratorType):
        trajectories = list(trajectories)

    self.traj_lengths = np.array([len(t) for t in trajectories])

    # self.ptrajs = [self.metric.prepare_trajectory(traj) for traj in self.trajectories]
    logger.info('Preparing...')
    flat_trajectory = concatenate_trajectories(trajectories)
    pflat_trajectory = metric.prepare_trajectory(flat_trajectory)

    logger.info('Getting all to all pairwise distance matrix...')
    dmat = metric.all_pairwise(pflat_trajectory)
    logger.info('Done with all2all')

    self.Z = fastcluster.linkage(dmat, method=method, preserve_input=False)
    logger.info('Got Z matrix')
def cluster(data_vecs, method='average', metric='cosine', save=True):
    print "Calculating the linkage matrix, metric = {0}, method = {1}".format(metric, method)
    links = fc.linkage(data_vecs, metric=metric, method=method)
    # if save:
    #     print "Saving the model to: results/" + filename + "/linkage"
    #     file = open('results/' + filename + '_linkage', 'wb')
    #     pickle.dump(links, file)
    #     file.close()
    return links
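# The helper above hands the raw observation matrix straight to fastcluster,
# which computes the pairwise cosine distances itself. An equivalent two-step
# form with an explicitly precomputed condensed vector (the commented-out
# pickling above is project-specific and omitted):
import numpy as np
import fastcluster
from scipy.spatial.distance import pdist

data_vecs = np.random.rand(100, 20)
links_direct = fastcluster.linkage(data_vecs, metric='cosine', method='average')
links_precomputed = fastcluster.linkage(pdist(data_vecs, metric='cosine'), method='average')
# Both should describe the same tree, up to floating-point and tie-breaking differences.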
def test_fastcluster_non_euclidean(self):
    import fastcluster

    kws = self.default_kws.copy()
    kws['metric'] = 'cosine'
    kws['method'] = 'average'
    linkage = fastcluster.linkage(self.x_norm.T, method=kws['method'], metric=kws['metric'])

    p = mat._DendrogramPlotter(self.x_norm, **kws)

    npt.assert_array_equal(p.linkage, linkage)
def cluster(self, cluster_count = None, cluster_radius = 10.0): x = self.x nx = x.shape[0] D=pdist(x) l = fc.linkage(D,'single') l0 = numpy.hstack((x,x, numpy.zeros((nx,1)), numpy.ones((nx,1)))) self._ct = ClusterTree(l0, l) self._ct.find_groups(cluster_radius) self._ct.sort_groups()
def buildClusters(featMatrix): distanceMatrix = pdist(featMatrix[:,:], metric='jaccard') pickleSave(structsfolder + 'distanceMatrix.pkl', distanceMatrix) # ed = euclidean_distances(featMatrix[1:100,:], featMatrix[1:100,:]) linkage = fastcluster.linkage(distanceMatrix, method='ward') # D-distance matrix # fc = fcluster(link, 30, criterion='maxclust') #R = dendrogram(link, color_threshold=0.3, leaf_font_size=6) #pylab.savefig( "/home/rojosewe/Dropbox/MAI90/tesis/images/wordClustering/featMatrix.png" ) distanceMatrix = None pickleSave(structsfolder + 'linkage.pkl', linkage)
def perform_clustering(self, kwargs):
    """
    Performs the hierarchical clustering step and the clustering step.
    If the hierarchical matrix is given, then it just calculates the clusters
    for a given cutoff. If we call the algorithm a second time it will use
    the last matrix.
    """
    """
    Gets a condensed matrix and calculates the clustering. One can use diverse
    methodologies to do this clustering...
    With preserve_input=False the matrix is destroyed while clustering, but it
    saves memory.
    The metric is not needed in this case, as we are giving the function the
    calculated matrix.
    The method is the method used to determine distances when fusing clusters.
    Methods are described in:
    http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html
    """
    try:
        cutoff = kwargs["cutoff"]
    except KeyError:
        cutoff = None

    try:
        hie_mat = kwargs["hie_mat"]
    except KeyError:
        hie_mat = None

    try:
        method = kwargs["method"]
    except KeyError:
        method = 'complete'

    if hie_mat != None:
        self.hie_mat = hie_mat
        # print "[HIERARCHICAL] Matrix provided."
    else:
        if self.hie_mat == None:
            # self.hie_mat = fast_hcluster.linkage(condensed_matrix, method='centroid', metric='euclidean', preserve_input=False)
            # print "[HIERARCHICAL] Calculating Matrix"
            # self.hie_mat = fastclust.linkage(self.condensed_matrix.get_data(), method = method)
            self.hie_mat = hcluster_fast.linkage(self.condensed_matrix.get_data(), method=method)
        # else:
        #     print "[HIERARCHICAL] Matrix was already stored"

    algorithm_details = "Hierarchical with " + method + " method (cutoff = " + str(cutoff) + ")"

    if cutoff != None:
        # Then apply the cutoff, this doesn't work much as expected
        # print "[HIERARCHICAL] getting clustering." + algorithm_details
        group_list = hcluster.fcluster(self.hie_mat, cutoff)
        # print "[HIERARCHICAL] Clustering done." + algorithm_details
        # Then let's generate the clusters
        clusters = gen_clusters_from_class_list(group_list)
        return Clustering(clusters, details=algorithm_details)
    else:
        return None
def complete_linkage(dm):
    """
    Perform complete linkage hierarchical clustering on a distance matrix.

    Args:
        dm (numpy.array): Distance matrix

    Returns:
        (object): fastcluster complete linkage hierarchical clustering object
    """
    return linkage(dm, 'complete')
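# Usage sketch for complete_linkage() above (assuming the bare `linkage` it
# calls is fastcluster's or SciPy's linkage). Both libraries treat a 2-D input
# as observation vectors, so a square distance matrix is condensed with
# squareform first; if the caller already has a condensed vector it can be
# passed as-is.
import numpy as np
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import fcluster

points = np.random.rand(30, 5)
square_dm = squareform(pdist(points))        # full, symmetric distance matrix
Z = complete_linkage(squareform(square_dm))  # condense back before linkage
labels = fcluster(Z, t=4, criterion='maxclust')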
def define_clusts(similarity_matrix, threshold=0.05, max_iter=200, method='ap'): """Define clusters given the similarity matrix and the threshold.""" n, labels = connected_components(similarity_matrix, directed=False) prev_max_clust = 0 print("connected components: %d" % n) clusters = labels.copy() if method == 'dbscan': ap = DBSCAN(metric='precomputed', min_samples=1, eps=.2, n_jobs=-1) if method == 'ap': ap = AffinityPropagation(affinity='precomputed', max_iter=max_iter, preference='median') for i in range(n): idxs = np.where(labels == i)[0] if idxs.shape[0] > 1: sm = similarity_matrix[idxs][:, idxs] sm += sm.T + scipy.sparse.eye(sm.shape[0]) # Hierarchical clustering if method == 'hc': dists = squareform(1 - sm.toarray()) links = fastcluster.linkage(dists, method='ward') try: clusters_ = fcluster(links, threshold, 'distance') except ValueError as err: logging.critical(err) clusters_ = np.zeros(1, dtype=int) # DBSCAN elif method == 'dbscan': db = ap.fit(1. - sm.toarray()) # Number of clusters in labels, ignoring noise if present. clusters_ = db.labels_ # n_clusters_ = len(set(clusters_)) - int(0 in clusters_) # AffinityPropagation # ap = AffinityPropagation(affinity='precomputed') elif method == 'ap': db = ap.fit(sm) clusters_ = db.labels_ else: raise ValueError("clustering method %s unknown" % method) if np.min(clusters_) == 0: clusters_ += 1 clusters_ += prev_max_clust clusters[idxs] = clusters_ prev_max_clust = max(clusters_) else: # connected component contains just 1 element prev_max_clust += 1 clusters[idxs] = prev_max_clust return np.array(extra.flatten(clusters))
def cluster(dupes, threshold=.5, max_components=30000):
    '''
    Takes in a list of duplicate pairs and clusters them into a list of records
    that all refer to the same entity based on a given threshold

    Keyword arguments:
        threshold -- number between 0 and 1 (default is .5). Lowering the number
                     will increase precision, raising it will increase recall
    '''
    threshold = 1 - threshold

    dupe_sub_graphs = connected_components(dupes, max_components)

    clustering = {}
    cluster_id = 0
    for sub_graph in dupe_sub_graphs:
        if len(sub_graph) > 1:
            (i_to_id, condensed_distances) = condensedDistance(sub_graph)
            N = max(i_to_id) + 1

            linkage = fastcluster.linkage(condensed_distances, method='centroid', preserve_input=False)

            partition = hcluster.fcluster(linkage, threshold, criterion='distance')

            clusters = {}
            for (i, sub_cluster_id) in enumerate(partition):
                clusters.setdefault(cluster_id + sub_cluster_id, []).append(i)

            cophenetic_distances = hcluster.cophenet(linkage)

            for cluster_id, items in clusters.iteritems():
                if len(items) > 1:
                    score = clusterConfidence(items, cophenetic_distances, N)
                    clustering[cluster_id] = (tuple(i_to_id[item] for item in items), 1 - score)

            cluster_id += max(partition) + 1
        else:
            ids, score = sub_graph[0]
            clustering[cluster_id] = tuple(ids), score
            cluster_id += 1

    return clustering.values()
def _hclust(self, nclusters, method, noise=False):
    """
    :param nclusters: Number of clusters to return
    :param method: single, complete, average, ward, weighted, centroid or median
        (http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html)
    :param noise: Add Gaussian noise to the distance matrix prior to clustering
        (bool, default=False)
    :return: Partition object describing clustering
    """
    matrix = self.get_dm(noise)
    linkmat = fastcluster.linkage(squareform(matrix), method)
    return _hclust(linkmat, nclusters)
from machinelearning import datasetselection, featureselection import machinelearning.dataclasses as dc import pickle from operator import itemgetter import matplotlib.pyplot as plt import scipy.cluster.hierarchy as hierarchy import scipy.spatial.distance as distance import fastcluster # load distance matrix with open('gene_gene_matrix_euclidean_distance_from_projection.pickle', 'rb') as fr: gene_gene = pickle.load(fr) # prefer ward linkage for euclidean distance or at least this case lnk = fastcluster.linkage(distance.squareform(gene_gene.matrix, checks=False), 'ward') si = hierarchy.leaves_list(lnk).astype('int64') # load projection with open('gene_atb_matrix_2d_dnn_projection.pickle', 'rb') as fr: gene_proj = pickle.load(fr) if ~(gene_proj.rowlabels == gene_gene.rowlabels).all(): raise ValueError('genes not aligned') gene_proj.reorder(si, 0) ordered_genes = gene_proj.rowlabels.copy() del gene_gene, lnk, si # select datasets dataset_info = datasetselection.finddatasets(getalllevels=True) included_datasetabbrevs = { 'clinvar', 'dbgap_cleaned', 'gad', 'gadhighlevel_cleaned', 'gobp', 'gocc',
def diff_exp_clusters(cluster_expression_df, cluster_sizes, file_format): n_clusters = len(cluster_sizes) cluster_sum_umi = np.vstack([ cluster_sizes[c] * cluster_expression_df[f'Cluster {c} mean UMI'].values for c in range(n_clusters) ]) cluster_ssq_umi = np.vstack([ cluster_sizes[c] * (cluster_expression_df[f'Cluster {c} std UMI'].values**2 + cluster_expression_df[f'Cluster {c} mean UMI'].values**2) for c in range(n_clusters) ]) Z = fastcluster.linkage(cluster_sum_umi, method='average', metric='cosine') fig = matplotlib.figure.Figure(figsize=(12, 12)) ax = fig.add_axes([0.1, 0.1, 0.8, 0.8]) scipy.cluster.hierarchy.dendrogram(Z, ax=ax, color_threshold=0, above_threshold_color='grey') ax.set_title('Hierarchical structure of cell-type clusters') ax.set_xlabel('Cluster Label') ax.tick_params(labelleft='off') FigureCanvasAgg(fig).print_figure(file_format.format('dendrogram', 'png')) root, rd = scipy.cluster.hierarchy.to_tree(Z, rd=True) def de(lbl_1, lbl_2, group1, group2): print(f'Comparing {group1} to {group2}') group1_n_cells = sum(cluster_sizes[c] for c in group1) group2_n_cells = sum(cluster_sizes[c] for c in group2) group1_mean = cluster_sum_umi[group1, :].sum(axis=0) / group1_n_cells group2_mean = cluster_sum_umi[group2, :].sum(axis=0) / group2_n_cells mean_diff = group1_mean - group2_mean group1_var = (cluster_ssq_umi[group1, :].sum(axis=0) / group1_n_cells - group1_mean**2) group2_var = (cluster_ssq_umi[group2, :].sum(axis=0) / group2_n_cells - group2_mean**2) pooled_sd = np.sqrt(group1_var / group1_n_cells + group2_var / group2_n_cells) z_scores = np.zeros_like(pooled_sd) nz = pooled_sd > 0 z_scores[nz] = np.nan_to_num(mean_diff[nz] / pooled_sd[nz]) # t-test p_vals = np.clip( (1 - stats.norm.cdf(np.abs(z_scores))) * 2 * z_scores.shape[0], 0, 1) df = pd.DataFrame(OrderedDict([('z', z_scores), ('p', p_vals), ('group1', group1_mean), ('group2', group2_mean)]), index=cluster_expression_df.index) df = df[df['p'] < 0.001] df['diff'] = df['group1'] - df['group2'] df.sort_values('diff', ascending=False, inplace=True) name = f'differential_gene_expression_{lbl_1}_v_{lbl_2}' df.to_csv(file_format.format(name, 'csv')) for i in range(0, 2 * n_clusters - 1): if i >= n_clusters: left_child = rd[i].get_left() left_clusters = (left_child.pre_order(lambda x: x.id)) right_child = rd[i].get_right() right_clusters = (right_child.pre_order(lambda x: x.id)) # don't calculate if it's redundant with a 1-vs-all comp if i == 2 * n_clusters - 2 and (len(left_clusters) == 1 or len(right_clusters) == 1): continue de(left_child.id, right_child.id, left_clusters, right_clusters) if i < 2 * n_clusters - 2: below = rd[i].pre_order(lambda x: x.id) above = [j for j in range(len(cluster_sizes)) if j not in below] # don't calculate redundant comparison if len(above) == 1: continue de(i, 'all', below, above) group_list = [(i, rd[i].pre_order(lambda x: x.id)) for i in range(0, 2 * n_clusters - 1)] group_list[-1] = ('total', group_list[-1][1]) return group_list
def test_adjusted_rand_performance(self): # Arrange n = 100 np.random.seed(seed = 8455624) x = np.random.normal(n, 2, (n, 2)) A = linkage(x, 'centroid') B = linkage(x, 'ward') # Act similarity_times = [] sklearn_times = [] fcluster_times = [] for repitition in range(100): start = perf_counter() metrics = similarity_metrics(A, B) ar_similarity = metrics.adjusted_rand() end = perf_counter() similarity_times.append(end-start) ar_sklearn = [] sklearn_time = 0 fcluster_time = 0 excluded_results = 0 for i in range(n - 1, 1, -1): start = perf_counter() fcluster_a = fcluster(A, i, 'maxclust') fcluster_b = fcluster(B, i, 'maxclust') end = perf_counter() fcluster_time += (end - start) start = perf_counter() ar = adjusted_rand_score(fcluster_a, fcluster_b) end = perf_counter() sklearn_time += (end - start) # fcluster takes maxclust rather than an exact number of clusters # most of the time it will create exactly maxclust, but for the occassions # that it doesn't the results are are not comparable so ignore them if (len(np.unique(fcluster_a)) != i) or (len(np.unique(fcluster_b)) != i): excluded_results += 1 ar_sklearn.append(ar_similarity[len(ar_sklearn)]) else: ar_sklearn.append(ar) sklearn_times.append(sklearn_time) fcluster_times.append(fcluster_time) ar_sklearn = np.array(ar_sklearn) idx = ar_sklearn != np.nan # Assert self.assertEqual(len(ar_sklearn), len(ar_similarity)) assert_almost_equal(ar_similarity, ar_sklearn) self.assertEqual(4, excluded_results) # double-check that we haven't excluded everything print("\nSimilarity average time: ", np.average(similarity_times)) print("\nSklearn average time: ", np.average(sklearn_times)) print("\nFCluster average time: ", np.average(fcluster_times))
plt.title('Image show of the correlation lattice - longitudinal') plt.figure() plt.imshow(dfC[0, 1::2, :]) plt.colorbar() plt.title('Image show of the correlation lattice - lattitudinal') # plt.figure() # plt.imshow(y) # plt.title('Distance matrix') # render_component_single(gfC.d[0, :, :], gfC.lats, gfC.lons, False, None, "Neighbor correlation") print("Clustering ...") plt.figure() Z = fastcluster.linkage(ytri, method = 'single') print("Plotting dendrogram ...") dendrogram(Z, 7, 'level') max_d = np.amax(Z[:,2]) print("Maximum distance is %g" % max_d) my_d = max_d / 2 cont = True while cont: f = fcluster(Z, my_d, 'distance') print f.shape, my_d if np.amax(f) > 30: my_d = (max_d + my_d) * 0.5 elif np.amax(f) < 10: my_d = my_d - (max_d - my_d) / max_d else:
def consensus(self, k, density_threshold_str='0.5', local_neighborhood_size=0.30, show_clustering=False, skip_density_and_return_after_stats=False, close_clustergram_fig=True): merged_spectra = load_df_from_npz(self.paths['merged_spectra'] % k) norm_counts = sc.read(self.paths['normalized_counts']) if skip_density_and_return_after_stats: density_threshold_str = '2' density_threshold_repl = density_threshold_str.replace('.', '_') density_threshold = float(density_threshold_str) n_neighbors = int(local_neighborhood_size * merged_spectra.shape[0] / k) # Rescale topics such to length of 1. l2_spectra = (merged_spectra.T / np.sqrt( (merged_spectra**2).sum(axis=1))).T if not skip_density_and_return_after_stats: # Compute the local density matrix (if not previously cached) topics_dist = None if os.path.isfile(self.paths['local_density_cache'] % k): local_density = load_df_from_npz( self.paths['local_density_cache'] % k) else: # first find the full distance matrix topics_dist = squareform(fast_euclidean(l2_spectra.values)) # partition based on the first n neighbors partitioning_order = np.argpartition(topics_dist, n_neighbors + 1)[:, :n_neighbors + 1] # find the mean over those n_neighbors (excluding self, which has a distance of 0) distance_to_nearest_neighbors = topics_dist[ np.arange(topics_dist.shape[0])[:, None], partitioning_order] local_density = pd.DataFrame( distance_to_nearest_neighbors.sum(1) / (n_neighbors), columns=['local_density'], index=l2_spectra.index) save_df_to_npz(local_density, self.paths['local_density_cache'] % k) del (partitioning_order) del (distance_to_nearest_neighbors) density_filter = local_density.iloc[:, 0] < density_threshold l2_spectra = l2_spectra.loc[density_filter, :] kmeans_model = KMeans(n_clusters=k, n_init=10, random_state=1) kmeans_model.fit(l2_spectra) kmeans_cluster_labels = pd.Series(kmeans_model.labels_ + 1, index=l2_spectra.index) # Find median usage for each gene across cluster median_spectra = l2_spectra.groupby(kmeans_cluster_labels).median() # Normalize median spectra to probability distributions. 
median_spectra = (median_spectra.T / median_spectra.sum(1)).T # Compute the silhouette score stability = silhouette_score(l2_spectra.values, kmeans_cluster_labels, metric='euclidean') # Obtain the reconstructed count matrix by re-fitting the usage matrix and computing the dot product: usage.dot(spectra) refit_nmf_kwargs = yaml.load(open(self.paths['nmf_run_parameters']), Loader=yaml.FullLoader) refit_nmf_kwargs.update( dict(n_components=k, H=median_spectra.values, update_H=False)) _, rf_usages = self._nmf(norm_counts.X, nmf_kwargs=refit_nmf_kwargs) rf_usages = pd.DataFrame(rf_usages, index=norm_counts.obs.index, columns=median_spectra.index) rf_pred_norm_counts = rf_usages.dot(median_spectra) # Compute prediction error as a frobenius norm if sp.issparse(norm_counts.X): prediction_error = ((norm_counts.X.todense() - rf_pred_norm_counts)**2).sum().sum() else: prediction_error = ((norm_counts.X - rf_pred_norm_counts)**2).sum().sum() consensus_stats = pd.DataFrame( [k, density_threshold, stability, prediction_error], index=[ 'k', 'local_density_threshold', 'stability', 'prediction_error' ], columns=['stats']) if skip_density_and_return_after_stats: return consensus_stats save_df_to_npz( median_spectra, self.paths['consensus_spectra'] % (k, density_threshold_repl)) save_df_to_npz( rf_usages, self.paths['consensus_usages'] % (k, density_threshold_repl)) save_df_to_npz( consensus_stats, self.paths['consensus_stats'] % (k, density_threshold_repl)) save_df_to_text( median_spectra, self.paths['consensus_spectra__txt'] % (k, density_threshold_repl)) save_df_to_text( rf_usages, self.paths['consensus_usages__txt'] % (k, density_threshold_repl)) # Compute gene-scores for each GEP by regressing usage on Z-scores of TPM tpm = sc.read(self.paths['tpm']) tpm_stats = load_df_from_npz(self.paths['tpm_stats']) if sp.issparse(tpm.X): norm_tpm = (np.array(tpm.X.todense()) - tpm_stats['__mean'].values) / tpm_stats['__std'].values else: norm_tpm = (tpm.X - tpm_stats['__mean'].values) / tpm_stats['__std'].values usage_coef = fast_ols_all_cols(rf_usages.values, norm_tpm) usage_coef = pd.DataFrame(usage_coef, index=rf_usages.columns, columns=tpm.var.index) save_df_to_npz( usage_coef, self.paths['gene_spectra_score'] % (k, density_threshold_repl)) save_df_to_text( usage_coef, self.paths['gene_spectra_score__txt'] % (k, density_threshold_repl)) # Convert spectra to TPM units, and obtain results for all genes by running last step of NMF # with usages fixed and TPM as the input matrix norm_usages = rf_usages.div(rf_usages.sum(axis=1), axis=0) refit_nmf_kwargs.update(dict(H=norm_usages.T.values, )) _, spectra_tpm = self._nmf(tpm.X.T, nmf_kwargs=refit_nmf_kwargs) spectra_tpm = pd.DataFrame(spectra_tpm.T, index=rf_usages.columns, columns=tpm.var.index) save_df_to_npz( spectra_tpm, self.paths['gene_spectra_tpm'] % (k, density_threshold_repl)) save_df_to_text( spectra_tpm, self.paths['gene_spectra_tpm__txt'] % (k, density_threshold_repl)) if show_clustering: if topics_dist is None: topics_dist = squareform(fast_euclidean(l2_spectra.values)) # (l2_spectra was already filtered using the density filter) else: # (but the previously computed topics_dist was not!) 
topics_dist = topics_dist[ density_filter.values, :][:, density_filter.values] spectra_order = [] for cl in sorted(set(kmeans_cluster_labels)): cl_filter = kmeans_cluster_labels == cl if cl_filter.sum() > 1: cl_dist = squareform(topics_dist[cl_filter, :][:, cl_filter]) cl_dist[ cl_dist < 0] = 0 #Rarely get floating point arithmetic issues cl_link = linkage(cl_dist, 'average') cl_leaves_order = leaves_list(cl_link) spectra_order += list( np.where(cl_filter)[0][cl_leaves_order]) else: ## Corner case where a component only has one element spectra_order += list(np.where(cl_filter)[0]) from matplotlib import gridspec import matplotlib.pyplot as plt width_ratios = [0.5, 9, 0.5, 4, 1] height_ratios = [0.5, 9] fig = plt.figure(figsize=(sum(width_ratios), sum(height_ratios))) gs = gridspec.GridSpec(len(height_ratios), len(width_ratios), fig, 0.01, 0.01, 0.98, 0.98, height_ratios=height_ratios, width_ratios=width_ratios, wspace=0, hspace=0) dist_ax = fig.add_subplot(gs[1, 1], xscale='linear', yscale='linear', xticks=[], yticks=[], xlabel='', ylabel='', frameon=True) D = topics_dist[spectra_order, :][:, spectra_order] dist_im = dist_ax.imshow(D, interpolation='none', cmap='viridis', aspect='auto', rasterized=True) left_ax = fig.add_subplot(gs[1, 0], xscale='linear', yscale='linear', xticks=[], yticks=[], xlabel='', ylabel='', frameon=True) left_ax.imshow(kmeans_cluster_labels.values[spectra_order].reshape( -1, 1), interpolation='none', cmap='Spectral', aspect='auto', rasterized=True) top_ax = fig.add_subplot(gs[0, 1], xscale='linear', yscale='linear', xticks=[], yticks=[], xlabel='', ylabel='', frameon=True) top_ax.imshow(kmeans_cluster_labels.values[spectra_order].reshape( 1, -1), interpolation='none', cmap='Spectral', aspect='auto', rasterized=True) hist_gs = gridspec.GridSpecFromSubplotSpec(3, 1, subplot_spec=gs[1, 3], wspace=0, hspace=0) hist_ax = fig.add_subplot(hist_gs[0, 0], xscale='linear', yscale='linear', xlabel='', ylabel='', frameon=True, title='Local density histogram') hist_ax.hist(local_density.values, bins=np.linspace(0, 1, 50)) hist_ax.yaxis.tick_right() xlim = hist_ax.get_xlim() ylim = hist_ax.get_ylim() if density_threshold < xlim[1]: hist_ax.axvline(density_threshold, linestyle='--', color='k') hist_ax.text(density_threshold + 0.02, ylim[1] * 0.95, 'filtering\nthreshold\n\n', va='top') hist_ax.set_xlim(xlim) hist_ax.set_xlabel( 'Mean distance to k nearest neighbors\n\n%d/%d (%.0f%%) spectra above threshold\nwere removed prior to clustering' % (sum(~density_filter), len(density_filter), 100 * (~density_filter).mean())) fig.savefig(self.paths['clustering_plot'] % (k, density_threshold_repl), dpi=250) if close_clustergram_fig: plt.close(fig)
def generate_heatmap( dataframe: pd.DataFrame, color_dataframe: pd.DataFrame, histo_dataframe: pd.DataFrame, color_map: dict = None, cluster: bool = True, figsize: tuple = (10, 15)) -> plt.Figure: fig = plt.figure(figsize=figsize) if cluster: linkage = fastcluster.linkage(dataframe.T, "complete", metric="correlation", preserve_input=True) dendrogram_row_ratio = 2 else: linkage = None # Make row smaller without dendrogram dendrogram_row_ratio = 0.1 max_rows = 4 if color_map is not None else 3 # The bar plot is actually on the third row # FIXME: Handle long labels (ratios) if color_map is not None: bar_ratio = 0.25 * len(color_map) height_ratios = [dendrogram_row_ratio, 0.25, bar_ratio, 15] else: height_ratios = [dendrogram_row_ratio, 0.25, 15] gs = grid.GridSpec(max_rows, 2, height_ratios=height_ratios, width_ratios=[0.2, 15]) dendro_ax = fig.add_subplot(gs[0, 1], axisbg="white") # Dendrogram plt.setp(dendro_ax.get_yticklabels(), visible=False) pathway_ax = fig.add_subplot(gs[-1, 0]) # Pathway heatmap_ax = fig.add_subplot(gs[-1, 1], sharey=pathway_ax) # Heatmap # Con gridspec e' necessario fare questo in modo che gli assi Y # non siano visibili plt.setp(heatmap_ax.get_yticklabels(), visible=False) if linkage is not None: leaf_ax = fig.add_subplot(gs[1, 1], sharex=dendro_ax) dendro = sch.dendrogram( linkage, ax=dendro_ax, no_labels=False, labels=dataframe.columns, leaf_rotation=90, ) # Reorder dataframe according to the labels in the leaves dataframe = dataframe[dendro["ivl"]] # Leaf node labels # Put labels in the right order! histo_dataframe = histo_dataframe.loc[dendro["ivl"]] # TRICK: Given that printing labels screws layout because they add an # x axis, we generate a specific axis only with the text, iterating on # the locations of the labels of the dendrogram. After the new text is # in place, we remove the labels from the dendrogram. for leafname, leafcoord in zip(dendro["ivl"], dendro_ax.xaxis.get_ticklocs()): leaf_ax.text(leafcoord, 0.99, leafname, rotation=90, horizontalalignment="center") else: set_axis_parameters(heatmap_ax, dataframe, False) leaf_ax = fig.add_subplot(gs[1, 1], sharex=heatmap_ax) dataframe = dataframe.loc[:, histo_dataframe.index] set_axis_parameters(leaf_ax, dataframe, False) for index, leafcoord in enumerate(leaf_ax.xaxis.get_ticklocs()): leaf_ax.text(leafcoord, 0.99, dataframe.columns[index], rotation=90, horizontalalignment="center") clean_axis(leaf_ax) leaf_ax.grid(False) leaf_ax.axis('off') clean_axis(dendro_ax) if color_map is not None: subgrids = len(color_map) gs_inside = grid.GridSpecFromSubplotSpec( subgrids, 1, subplot_spec=gs[2, 1], height_ratios=[1 for item in color_map]) bars = list() for index, group in enumerate(sorted(color_map)): column = color_map[group] bar_ax = fig.add_subplot(gs_inside[index]) clean_axis(bar_ax) create_colorbar(histo_dataframe, bar_ax, False, column, labels=False) bar_ax.text(-0.25, 0.5, group, horizontalalignment="right", verticalalignment="center") bars.append(bar_ax) create_colorbar(color_dataframe, pathway_ax) cmap, norm = create_colormap() dataframe = dataframe.loc[color_dataframe.index] heatmap1 = heatmap_ax.pcolor(dataframe, cmap=cmap, edgecolors="black", alpha=1, norm=norm) set_axis_parameters(heatmap_ax, dataframe, False) cax = fig.add_axes([-0.05, 1.025, 0.15, 0.025]) cbar = fig.colorbar(heatmap1, cax=cax, orientation="horizontal", ticks=range(9)) cbar.solids.set_edgecolor("face") gs.tight_layout(fig) return fig, cax
def test_dendrogram_plotting():
    c.Z = linkage(iris['data'], c.algorithm)
    labels = np.random.rand(1, iris['data'].shape[0])[0]
    c.dendrogram = c._calculate_dendrogram(labels)
    assert len(labels) == len(c.dendrogram['ivl'])
try: boosted_wdfVoc[k] = wdfVoc[k] * boost_entity[k] except: boosted_wdfVoc[k] = wdfVoc[k] print "sorted wdfVoc*boost_entity:" print sorted( ((v,k) for k,v in boosted_wdfVoc.iteritems()), reverse=True) ''' #Hclust: fast hierarchical clustering with fastcluster #X is samples by features #distMatrix is sample by samples distances distMatrix = pairwise_distances(X_normalized, metric='cosine') #cluster tweets print "fastcluster, average, cosine" L = fastcluster.linkage(distMatrix, method='average') #for dt in [0.3, 0.4, 0.5, 0.6, 0.7]: #for dt in [0.5]: dt = 0.5 print "hclust cut threshold:", dt # indL = sch.fcluster(L, dt, 'distance') indL = sch.fcluster(L, dt * distMatrix.max(), 'distance') #print "indL:", indL freqTwCl = Counter(indL) print "n_clusters:", len(freqTwCl) print(freqTwCl) # print "silhoutte: ", metrics.silhouette_score(distMatrix, indL, metric="precomputed") allowSiloutte = False for freqTwClkey, freqTwClCount in freqTwCl.iteritems(): if (freqTwClCount > 1):
def cluster_array_to_k_groups(R, k):
    Z = fastcluster.linkage(R, method='average', metric='euclidean', preserve_input=True)
    import tree
    tr = tree.Tree(Z=Z)
    X = tr.representatives(n_picks=k, l_keep_members=True)
    return X
img_ds.RasterCount), # number of bands gdal_array.GDALTypeCodeToNumericTypeCode( img_ds.GetRasterBand(1).DataType)) # data type code #print img.shape # warning: that assumed that the raster bands were all the same type (should be true) # reshape the image band by band for b in range(img.shape[2]): img[:, :, b] = img_ds.GetRasterBand(b + 1).ReadAsArray() # reshape image again to match expected format for scikit-learn new_shape = (img.shape[0] * img.shape[1], img.shape[2]) X = img[:, :, :img.shape[2]].reshape(new_shape) # use fastcluster.linkage instead of scipy.cluster.hierarchy.linkage print "calculating linkage.." Z = fc.linkage(X, 'average') # https://en.wikipedia.org/wiki/UPGMA print "calculating dendrogram.." fig = plt.figure(figsize=(10, 10)) # 25, 10 plt.title('hierarchical clustering dendrogram') rotate = False plt.ylabel('distance' if (not rotate) else 'index') plt.xlabel('index' if (not rotate) else 'distance') dn = dendrogram( Z, #truncate_mode='lastp', #p = n_clusters, leaf_rotation=0. if rotate else 90., show_contracted=True, orientation='right' if rotate else 'top',
def cluster(self, method='average', metric='euclidean', l_row=True, l_col=True):
    if l_row:
        self.Zr = fastcluster.linkage(self.data, method=method, metric=metric, preserve_input=True)
        #left_dendrogram=clst.dendrogram(Zr, orientation='left')
    if l_col:
        self.Zc = fastcluster.linkage(self.data.T, method=method, metric=metric, preserve_input=True)
def prog_linkage(X, n_cluster):
    hclust = linkage(X, method='single')
    labels = fcluster(hclust, t=n_cluster, criterion="maxclust")
    return labels - 1
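# Quick usage sketch for prog_linkage() above (assumes `linkage` and `fcluster`
# are the SciPy/fastcluster functions imported in that module); the returned
# labels are zero-based:
import numpy as np

X = np.random.rand(40, 3)
labels = prog_linkage(X, n_cluster=4)
assert labels.min() == 0 and labels.max() <= 3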
pickle_fp = os.path.join(PICKLE_DATA, "%s.pkl" % run_name) [pair_indexs, data_dict, traj_list, gene_pair_names ] = convert_data_into_np_array(stage_data_dir_name, INDEX_RANGE, pickle_fp, load=False, OFF_SET=OFF_SET, include_sox_and_t=include_sox_and_t, Filter=False) metric = "directed_hausdorff_plus_pair_wise_euclidean" # pair_wise_euclidean_distance distance_fp = os.path.join(NPY_DATA, "%s_%s.npy" % (run_name, metric)) calc_distance_matrix(distance_fp, traj_list) cm = plt.get_cmap('gist_rainbow') p_dist = np.load(distance_fp) Z = fc.linkage(p_dist, method="ward") distance_threshold = 800 if log_transformed else 10 labels = fcluster(Z, t=distance_threshold, criterion="distance") - 1 [ passed_traj_list, passed_labels, passed_pair_indexs, passed_gene_pair_names ] = filter_cluster(traj_list, labels, pair_indexs, gene_pair_names) CLUSTER_PLOT_CMAP = "gist_rainbow" passed_labels = plot_cluster(passed_traj_list, passed_labels, run_name, FIGURE_FORMAT, color_palette=None, log_transformed=log_transformed, cmap=CLUSTER_PLOT_CMAP)
def uhc_cluster(cosmic_list, ref_sig):
    spectra = [list(ref_sig.values())]  # so ref signature is value 0
    for sig in cosmic_list:
        spectra.append(list(cosmic_list[sig].values()))
    return linkage(spectra, method='ward', metric='cosine')
def hacluster(y):
    """ Wrapper for the Hierarchical Clustering algorithm from fastcluster """
    z = fastcluster.linkage(y, method='single')
    return z
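# hacluster() above simply forwards to fastcluster single linkage; y may be a
# condensed distance vector or an observation matrix, since fastcluster.linkage
# accepts both. Sketch:
import numpy as np
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import fcluster, leaves_list

obs = np.random.rand(25, 6)
z = hacluster(pdist(obs))                  # condensed distances
order = leaves_list(z)                     # dendrogram leaf ordering
flat = fcluster(z, t=3, criterion='maxclust')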
def validation(M, df_encoded, results, Z, method, min_K, max_K, automatic=None, pp=None, gap=None, Tp=None):
    # Relies on module-level names defined elsewhere in this project:
    # indexes, external_indexes, internal_indexes, min_indexes,
    # cluster_indices and clustereval.
    ###########################################################################
    # HOW MANY CLUSTERS?
    # Bootstrap method - sampling without replacement.
    ###########################################################################
    # dictionary to store all computed indexes for each number of clusters k = min_K, ..., max_K - 1
    nn_history = defaultdict(dict)
    trees = defaultdict(dict)
    dicio_statistics = {k: {} for k in range(min_K, max_K)}
    for k in range(min_K, max_K):
        for index in indexes:
            dicio_statistics[k][index] = []
        c_assignments_original = cut_tree(Z, k)
        # list of clusters for the clustering result with the original data
        partition_original = cluster_indices(c_assignments_original, df_encoded.index.tolist())
        trees[k] = partition_original

    # for each bootstrap sample
    for i in range(M):
        # sample 75% of the rows of the original data, without replacement
        idx = np.random.choice(len(df_encoded), int((3 / 4) * len(df_encoded)), replace=False)
        idx = np.sort(idx)
        # get all the possible combinations between the sampled patients
        patient_comb_bootstrap = list(itertools.combinations(df_encoded.loc[idx, 'id_patient'], 2))
        patient_comb_bootstrap = pd.DataFrame(patient_comb_bootstrap, columns=['patient1', 'patient2'])
        # extract the scores of the sampled combinations to be used in hierarchical clustering
        results_bootstrap = pd.merge(results, patient_comb_bootstrap, how='inner', on=['patient1', 'patient2'])
        # hierarchical clustering of the bootstrap sample
        Z_bootstrap = linkage(results_bootstrap['score'], method)
        # for each number of clusters k = min_K, ..., max_K - 1
        for k, partition in trees.items():
            c_assignments_bootstrap = cut_tree(Z_bootstrap, k)
            # list of clusters for the clustering result with the bootstrap sample
            partition_bootstrap = cluster_indices(c_assignments_bootstrap, idx)
            # compute the external cluster indexes between the original and bootstrap partitions
            computed_indexes = clustereval.calculate_external(partition, partition_bootstrap)
            for pos, index in enumerate(external_indexes):
                dicio_statistics[k][index].append(computed_indexes[pos])

    # internal indexes are computed once, on the original partition
    for k, partition in trees.items():
        calc_idx = clustereval.calculate_internal(results[['patient1', 'patient2', 'score']],
                                                  partition, k, trees[max_K - 1])
        for index in internal_indexes:
            dicio_statistics[k][index].append(calc_idx[index])

    ###########################################################################
    # DECISION ON THE NUMBER OF CLUSTERS
    # The chosen number of clusters is the k that collects the most maxima of the
    # average clustering indices. That k should also have a low standard
    # deviation - ideally the minimum over all k's, or at least a value that is
    # still low compared to the others.
    ###########################################################################
    # dataframe that stores the clustering index averages for each k
    col = indexes.copy()
    col.extend(['k', 'k_score_avg'])
    df_avgs = pd.DataFrame(index=range(min_K, max_K), columns=col, dtype='float')
    # dataframe that stores the clustering index standard deviations for each k
    df_stds = pd.DataFrame(index=range(min_K, max_K), columns=col, dtype='float')

    # compute the means and standard deviations
    for k in range(min_K, max_K):
        df_avgs.loc[k, 'k'] = k
        df_stds.loc[k, 'k'] = k
        for index in indexes:
            if index not in internal_indexes:
                df_avgs.loc[k, index] = mean(dicio_statistics[k][index])
                df_stds.loc[k, index] = stdev(dicio_statistics[k][index])
            else:
                df_avgs.loc[k, index] = dicio_statistics[k][index][0]
                df_stds.loc[k, index] = dicio_statistics[k][index][0]
        df_avgs.loc[k, 'k_score_avg'] = 0
        df_stds.loc[k, 'k_score_std'] = 0

    # weights given to each clustering index (currently uniform over all indexes)
    weights = {index: 1 / len(indexes) for index in indexes}

    # find the extreme value of each clustering index, locate the k at which it occurs,
    # and score each k as the sum of the weights of the indexes it wins
    columns = df_avgs.columns
    analyzed_columns = columns[2:-3]
    for column in analyzed_columns:
        if column in min_indexes:
            idx_min = df_avgs[column].idxmin()
            df_avgs.loc[idx_min, 'k_score_avg'] += weights[column]
            continue
        idx_max = df_avgs[column].idxmax()
        df_avgs.loc[idx_max, 'k_score_avg'] += weights[column]

    # final number of clusters, chosen by analysing df_avgs
    final_k = df_avgs['k_score_avg'].idxmax()

    if automatic == 0 or automatic == 1:
        # table of the average external index values
        fig1 = plt.figure(figsize=(10, 5))
        ax = plt.gca()
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)
        ax.axis('tight')
        ax.axis('off')
        colLabels1 = external_indexes.copy()
        colLabels1.append('k')
        cell_text1 = []
        for row in range(len(df_avgs)):
            cell_text1.append(df_avgs.iloc[row, list(range(len(external_indexes))) + [-2]].round(decimals=3))
        plt.title('Average values of eleven external indices \n gap: %.2f, Tp: %.2f, %s link' % (gap, Tp, method))
        the_table = plt.table(cellText=cell_text1, colLabels=colLabels1, loc='center', cellLoc='center')
        fig1.text(0.1, 0.01,
                  "R = Rand, AR = Adjusted Rand, FM = Fowlkes and Mallows, J = Jaccard, AW = Adjusted Wallace, "
                  "VD = Van Dongen, H = Huberts, H' = Huberts Normalized, F = F-Measure, "
                  "VI = Variation of information, MS = Minkowski", fontsize=5)
        pp.savefig(fig1)

        # table of the average internal index values
        fig2 = plt.figure(3, figsize=(10, 5))
        ax = plt.gca()
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)
        ax.axis('tight')
        ax.axis('off')
        colLabels2 = internal_indexes.copy()
        colLabels2.append('k')
        cell_text2 = []
        for row in range(len(df_avgs)):
            cell_text2.append(df_avgs.iloc[row, list(range(len(external_indexes), len(indexes))) + [-2]].round(decimals=3))
        plt.title('Average values of six internal indices \n gap: %.2f, Tp: %.2f, %s link' % (gap, Tp, method))
        plt.table(cellText=cell_text2, colLabels=colLabels2, loc='center', cellLoc='center', fontsize=20)
        pp.savefig(fig2)

        # bar chart of the Adjusted Rand standard deviation per number of clusters
        fig3 = plt.figure(4)
        df_stds.loc[:, 'AR'].plot.bar(figsize=(15, 8), color='forestgreen')
        plt.title('Standard deviation of Adjusted Rand versus number of clusters \n gap: %.2f, Tp: %.2f, %s link'
                  % (gap, Tp, method), fontsize=25)
        plt.xlabel('Number of clusters', labelpad=20, fontsize=15)
        plt.ylabel('Standard deviation', labelpad=10, fontsize=15)
        plt.xticks(size=20)
        plt.yticks(size=20)
        pp.savefig(fig3)

    return [df_avgs, df_stds, final_k]
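# A self-contained sketch of the same bootstrap idea, for readers without the
# project's module-level globals (indexes, clustereval, cluster_indices, ...):
# resample 75% of the rows, re-cluster, and score each candidate k by the mean
# adjusted Rand index against the full-data partition. The helper name
# pick_k_by_bootstrap and the single-index scoring are illustrative only, not
# part of the original code.
import numpy as np
from scipy.cluster.hierarchy import linkage, cut_tree
from scipy.spatial.distance import pdist
from sklearn.metrics import adjusted_rand_score


def pick_k_by_bootstrap(X, min_K=2, max_K=6, M=50, method='ward', seed=None):
    rng = np.random.default_rng(seed)
    Z_full = linkage(pdist(X), method)
    full_labels = {k: cut_tree(Z_full, k).ravel() for k in range(min_K, max_K)}
    scores = {k: [] for k in range(min_K, max_K)}
    for _ in range(M):
        idx = np.sort(rng.choice(len(X), int(0.75 * len(X)), replace=False))
        Z_boot = linkage(pdist(X[idx]), method)
        for k in range(min_K, max_K):
            boot_labels = cut_tree(Z_boot, k).ravel()
            # compare the bootstrap partition with the full partition restricted to the sampled rows
            scores[k].append(adjusted_rand_score(full_labels[k][idx], boot_labels))
    return max(scores, key=lambda k: np.mean(scores[k]))


print(pick_k_by_bootstrap(np.random.default_rng(0).normal(size=(60, 4))))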
def __call__(self, X): return linkage(X, method='weighted')
M[:, 1] = ys
scales = np.exp(np.linspace(np.log(Lmin), np.log(Lmax), Nscales))
del MASS, Observable
print(' Scale Loop \n')
for scale in scales:
    print(' This scale : %f\t \n' % scale)
    l = 1.1 * scale / float(ds.length_unit.in_units('pc'))
    ls = 1.1 * scale
    distance = sch.distance.pdist(M)  # vector of (100 choose 2) pairwise distances
    Link = fastcluster.linkage(distance, method='complete')
    ind = sch.fcluster(Link, l, 'distance')
    xcm = []
    ycm = []
    for j in set(ind):
        temp_mass = mass[ind == j].sum()
        if temp_mass > 10:
            xcm.append(np.average(xs[ind == j], weights=mass[ind == j]))
            ycm.append(np.average(ys[ind == j], weights=mass[ind == j]))
        del temp_mass
    print(' Number of regions :%d\t \n' % len(xcm))
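# Toy, self-contained version of the per-scale clump extraction above
# (complete linkage cut at a length threshold, then mass-weighted centres of
# mass); the random positions and masses are stand-ins for the simulation data.
import numpy as np
import fastcluster
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import pdist

rng = np.random.default_rng(1)
pts = rng.normal(size=(200, 2))           # stand-in for particle positions
mass = rng.uniform(0.1, 5.0, size=200)    # stand-in for particle masses

for scale in (0.5, 1.0, 2.0):
    link = fastcluster.linkage(pdist(pts), method='complete')
    ind = sch.fcluster(link, 1.1 * scale, 'distance')
    # keep only regions above a minimal total mass, as in the snippet above
    centres = [np.average(pts[ind == j], axis=0, weights=mass[ind == j])
               for j in set(ind) if mass[ind == j].sum() > 10]
    print(scale, len(centres))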
def cluster_validation(M, method, k, partition_found, df_encoded, results):
    # dictionary to store all computed indexes for each cluster
    dicio_cluster_validation = {k: {} for k in range(1, k + 1)}
    for k in range(1, k + 1):
        dicio_cluster_validation[k]['jaccard'] = []
        dicio_cluster_validation[k]['dice'] = []
        dicio_cluster_validation[k]['asymmetric'] = []

    # assess cluster stability for K=k, the number of clusters chosen before
    for i in range(M):
        # sampling rows of the original data
        idx = np.random.choice(len(df_encoded), int((3 / 4) * len(df_encoded)), replace=False)
        idx = np.sort(idx)
        # get all the possible combinations between the sampled patients
        patient_comb_bootstrap = list(itertools.combinations(df_encoded.loc[idx, 'id_patient'], 2))
        patient_comb_bootstrap = pd.DataFrame(patient_comb_bootstrap, columns=['patient1', 'patient2'])
        # extract the scores of the sampled combinations to be used in hierarchical clustering
        results_bootstrap = pd.merge(results, patient_comb_bootstrap, how='inner', on=['patient1', 'patient2'])
        # hierarchical clustering of the bootstrap sample
        Z_bootstrap = linkage(results_bootstrap['score'], method)
        c_assignments_bootstrap = cut_tree(Z_bootstrap, k)
        partition_bootstrap = cluster_indices(c_assignments_bootstrap, idx)
        # match each original cluster with its most similar bootstrap cluster
        for k_i in range(1, k + 1):
            aux_jaccard = []
            aux_dice = []
            aux_asymmetric = []
            for i in range(1, k + 1):
                aux = cluster_validation_indexes(partition_found[k_i - 1], partition_bootstrap[i - 1])
                aux_jaccard.append(aux[0])
                aux_dice.append(aux[2])
                aux_asymmetric.append(aux[1])
            dicio_cluster_validation[k_i]['jaccard'].append(max(aux_jaccard))
            dicio_cluster_validation[k_i]['dice'].append(max(aux_dice))
            dicio_cluster_validation[k_i]['asymmetric'].append(max(aux_asymmetric))

    # obtain the median, average and standard deviation of the cluster external indexes for each cluster
    jaccard_cluster_median = []
    dice_median = []
    asymmetric_median = []
    jaccard_cluster_avg = []
    dice_avg = []
    asymmetric_avg = []
    jaccard_cluster_std = []
    dice_std = []
    asymmetric_std = []
    table = []
    cluster_sizes = []
    for k in range(1, k + 1):
        jaccard_cluster_median.append(round(median(dicio_cluster_validation[k]['jaccard']), 3))
        dice_median.append(round(median(dicio_cluster_validation[k]['dice']), 3))
        asymmetric_median.append(round(median(dicio_cluster_validation[k]['asymmetric']), 3))
        jaccard_cluster_avg.append(round(mean(dicio_cluster_validation[k]['jaccard']), 3))
        dice_avg.append(round(mean(dicio_cluster_validation[k]['dice']), 3))
        asymmetric_avg.append(round(mean(dicio_cluster_validation[k]['asymmetric']), 3))
        jaccard_cluster_std.append(round(stdev(dicio_cluster_validation[k]['jaccard']), 3))
        dice_std.append(round(stdev(dicio_cluster_validation[k]['dice']), 3))
        asymmetric_std.append(round(stdev(dicio_cluster_validation[k]['asymmetric']), 3))
        cluster_sizes.append(len(partition_found[k - 1]))
        table.append([str(k) + ' (' + str(len(partition_found[k - 1])) + ')',
                      jaccard_cluster_median[k - 1], dice_median[k - 1], asymmetric_median[k - 1],
                      jaccard_cluster_avg[k - 1], dice_avg[k - 1], asymmetric_avg[k - 1],
                      jaccard_cluster_std[k - 1], dice_std[k - 1], asymmetric_std[k - 1]])

    headers = ['Cluster Number', 'J_median', 'D_median', 'A_median',
               'J_avg', 'D_avg', 'A_avg', 'J_std', 'D_std', 'A_std']
    print(tabulate(table, headers))
    cluster_stability = [jaccard_cluster_median, dice_median, asymmetric_median,
                         jaccard_cluster_avg, dice_avg, asymmetric_avg,
                         jaccard_cluster_std, dice_std, asymmetric_std, cluster_sizes]
    return cluster_stability
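# Hypothetical, self-contained sketch of the per-cluster stability measure used
# above: every original cluster is matched to its most similar bootstrap
# cluster and the maximum Jaccard coefficient is averaged over bootstraps. The
# helper names jaccard and cluster_stability are invented for the example.
import numpy as np


def jaccard(a, b):
    a, b = set(a), set(b)
    return len(a & b) / len(a | b) if a | b else 0.0


def cluster_stability(original_partition, bootstrap_partitions):
    """original_partition: list of member-index lists; bootstrap_partitions: list of such partitions."""
    stability = []
    for cluster in original_partition:
        best = [max(jaccard(cluster, bc) for bc in boot) for boot in bootstrap_partitions]
        stability.append(float(np.mean(best)))
    return stability


orig = [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]]
boots = [[[0, 1, 2], [4, 5, 6, 7], [8, 9]],
         [[0, 1, 3], [4, 5], [6, 7, 8, 9]]]
print(cluster_stability(orig, boots))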
def color_palette(frame_bgr, mask=None, mask_index=None, n_merge_steps=100, image_size=400.0,
                  seeds_model=None, n_pixels=400, n_merge_per_lvl=10, mask_inverse=False,
                  normalization_lower_bound=100.0, seeds_input_width=600, use_lab=True,
                  show_seed=False, seed_labels=None) -> PaletteAsset:
    """
    Computes a hierarchical color palette as generated by VIAN; does not keep the original tree.

    :param frame_bgr: A frame in bgr uint8; float32 is currently not allowed since OpenCV may crash on it
    :param mask: An optional mask of labels
    :param mask_index: The label which the palette should be computed on
    :param mask_inverse: If true, all labels but the given mask_index will be used
    :param n_merge_steps: Number of merge steps to return (approximately), this is restricted by the
    :param image_size: Image size to compute on
    :param seeds_model: The SEEDS model can optionally be given as argument to avoid initialization for each image
    :param n_pixels: Number of super pixels to compute (approximately)
    :param n_merge_per_lvl: After the first 10 merges, every n-th depth to store in the result
    :param normalization_lower_bound: Minimal number of pixels to keep a cluster
    :param seeds_input_width: Input width for the SEEDS model
    :param use_lab: If false, RGB will be used for the average computation instead of Lab
    :param show_seed: If true, the SEEDS output will be shown in OpenCV; make sure to call cv2.waitKey() to see the result
    :param seed_labels: Optional precomputed SEEDS label map; if given, the SEEDS model is not run
    :return: PaletteAsset
    """
    frame = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2LAB)
    if seeds_input_width < frame.shape[0]:
        rx = seeds_input_width / frame.shape[0]
        frame = cv2.resize(frame, None, None, rx, rx, cv2.INTER_CUBIC)

    if seed_labels is None:
        if seeds_model is None:
            seeds_model = PaletteExtractorModel(frame, n_pixels=n_pixels, num_levels=4)
        labels = seeds_model.forward(frame, 200).astype(np.uint8)
    else:
        labels = seed_labels

    if show_seed:
        cv2.imshow("SEED", cv2.cvtColor(seeds_model.labels_to_avg_color_mask(frame, labels), cv2.COLOR_LAB2BGR))

    fx = image_size / frame.shape[0]
    frame = cv2.resize(frame, None, None, fx, fx, cv2.INTER_CUBIC)
    labels = cv2.resize(labels, None, None, fx, fx, cv2.INTER_NEAREST)
    frame_bgr = cv2.resize(frame_bgr, None, None, fx, fx, cv2.INTER_CUBIC)

    if mask is not None:
        mask = cv2.resize(mask, (labels.shape[1], labels.shape[0]), interpolation=cv2.INTER_NEAREST)
        if mask_inverse:
            labels[np.where(mask == mask_index)] = 255
        else:
            labels[np.where(mask != mask_index)] = 255
        bins = np.unique(labels)
        bins = np.delete(bins, np.where(bins == 255))
    else:
        bins = np.unique(labels)

    data = []
    hist = np.histogram(labels, bins=bins)

    # regions smaller than normalization_f pixels are dropped
    normalization_f = np.amin(hist[0])
    if normalization_f < normalization_lower_bound:
        normalization_f = normalization_lower_bound

    labels_list = []
    colors_list = []
    all_cols = []
    all_labels = []
    for i, bin in enumerate(hist[0]):
        if bin < normalization_f:
            continue
        lbl = hist[1][i]
        if use_lab:
            avg_color = np.round(cv2.cvtColor(
                np.array([[np.mean(frame[np.where(labels == lbl)], axis=0)]], dtype=np.uint8),
                cv2.COLOR_LAB2BGR)[0, 0]).astype(np.uint8)
        else:
            avg_color = np.round(np.mean(frame_bgr[np.where(labels == lbl)], axis=0)).astype(np.uint8)

        labels_list.append(lbl)
        colors_list.append(avg_color)
        # duplicate each region's colour proportionally to its (normalized) pixel count
        data.extend([avg_color] * int(np.round(bin / normalization_f)) * 2)
        all_cols.extend([avg_color] * int(np.round(bin / normalization_f)) * 2)
        all_labels.extend([lbl] * int(np.round(bin / normalization_f)) * 2)

    data = np.array(data)
    Z = linkage(data, 'ward')
    tree, merge_dists = to_cluster_tree(Z, all_labels, all_cols, n_merge_steps, n_merge_per_lvl)
    return PaletteAsset(tree, merge_dists)
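# PaletteExtractorModel and to_cluster_tree are VIAN-specific; a stripped-down
# sketch of the underlying idea (Ward linkage over region colours, cut at a few
# levels to obtain coarser palettes) could look like this. The random
# region_colors stand in for the SEEDS superpixel averages.
import numpy as np
from scipy.cluster.hierarchy import linkage, cut_tree

rng = np.random.default_rng(0)
region_colors = rng.integers(0, 256, size=(40, 3)).astype(float)  # Lab (or BGR) region averages

Z = linkage(region_colors, method='ward')
for n_colors in (2, 5, 10):
    labels = cut_tree(Z, n_colors).ravel()
    palette = np.array([region_colors[labels == c].mean(axis=0) for c in range(n_colors)])
    print(n_colors, np.round(palette).astype(int))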
if len(corp) > 4:
    wordCorps.append(corp)
#-----------------------------------------------------------------------------------------------------------------------------------------------
'''train and pick trained word vec'''
dirs = "C:\\Users\\Administrator.NBJXUEJUN-LI\\Desktop\\project\\Python\\NLP\\savedObject\\CompCorpus\\"
slm = pickle.load(open(dirs + "slm.pkl", "rb"))

'''perform kmeans cluster without normalize'''
from sklearn.cluster import KMeans
TopicNums = 10
wordNums = slm.wordvec.shape[0]
kmeansFit = KMeans(n_clusters=TopicNums)
kmeansFit.fit(slm.wordvec)

'''perform hierarchical cluster'''
import fastcluster
result = fastcluster.linkage(X=slm.wordvec, method='single', metric='euclidean', preserve_input=False)

'''compute word depth'''
# walk the linkage matrix bottom-up; clustStruct[cidx][w] counts how many merges lie
# between word w and internal node cidx, so the last (root) node holds each word's depth
clustStruct = {}
for ridx in range(result.shape[0]):
    cidx = int(ridx + wordNums)
    clustStruct.setdefault(cidx, np.zeros(wordNums, dtype=int))
    for i in [0, 1]:
        code = int(result[ridx][i])
        if code < wordNums:
            clustStruct[cidx][code] += 1
        else:
            clustStruct[cidx] += (clustStruct[code] + (clustStruct[code] != 0).astype('int'))
wordDepth = clustStruct[max(clustStruct.keys())]
#-----------------------------------------------------------------------------------------------------------------------------------------------
'''compute the word degree within sentence co-occurrence across docs'''
sentWindow = 3
def cluster(dupes: numpy.ndarray, cluster_threshold: float = 0.5, max_components: int = 30000,
            id_to_match: str = None) -> Clusters:
    """
    Takes in a list of duplicate pairs and clusters them into a list of records
    that all refer to the same entity, based on a given threshold.

    `https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.cluster.hierarchy.fcluster.html`

    Args:
        dupes: (np.array)[tuple(list[str], float)]
            A list of tuples, where each tuple contains an id pair and a
            probability that they are a match:
            id_pair_tuple: ([record_id_1, record_id_2], prob)
            dtype: np.dtype([('pairs', '<U256', 2), ('score', 'f4', 1)])
        cluster_threshold: (float) number between 0 and 1 (default is .5).
            Lowering the number will increase precision, raising it will
            increase recall.
    """
    distance_threshold = cluster_threshold
    score_threshold = 1 - cluster_threshold
    dupe_sub_graphs = connected_components(dupes, max_components)

    for sub_graph in dupe_sub_graphs:
        if len(sub_graph) > 1:
            i_to_id, condensed_distances, N = condensed_distance(sub_graph)
            logger.debug(f"{condensed_distances}")
            linkage = fastcluster.linkage(condensed_distances, method='centroid', preserve_input=True)
            partition = hcluster.fcluster(linkage, distance_threshold, criterion='distance')

            clusters: Dict[int, List[int]] = defaultdict(list)
            logger.debug(f"Partition: {partition}")
            logger.debug(f"Linkage: {linkage}")
            for i, cluster_id in enumerate(partition):
                clusters[cluster_id].append(i)
            logger.info(f"Clusters: {clusters}")

            for cluster in clusters.values():
                if len(cluster) > 1:
                    scores = confidences(cluster, condensed_distances, N)
                    logger.info(f"Cluster Ids and scores: {tuple(i_to_id[i] for i in cluster)}, {scores}")
                    ids = [i_to_id[i] for i in cluster]
                    if id_to_match in ids and id_to_match is not None:
                        yield tuple(ids), scores
                    elif id_to_match is None:
                        yield tuple(ids), scores
        else:
            (ids, score), = sub_graph
            if score > score_threshold and id_to_match in ids and id_to_match is not None:
                yield tuple(ids), (score,) * 2
            elif score > score_threshold and id_to_match is None:
                yield tuple(ids), (score,) * 2
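# connected_components, condensed_distance and confidences are project helpers;
# the sketch below shows only the core step they feed into: turn pairwise match
# probabilities into distances (1 - p), build a condensed distance vector, and
# cut a centroid linkage at the distance threshold. All data here is made up.
import numpy as np
import fastcluster
import scipy.cluster.hierarchy as hcluster
from scipy.spatial.distance import squareform

ids = ['a', 'b', 'c', 'd']
prob = {('a', 'b'): 0.95, ('a', 'c'): 0.90, ('b', 'c'): 0.92,
        ('a', 'd'): 0.05, ('b', 'd'): 0.10, ('c', 'd'): 0.02}

n = len(ids)
dist = np.ones((n, n)) - np.eye(n)          # unknown pairs default to distance 1
for (u, v), p in prob.items():
    i, j = ids.index(u), ids.index(v)
    dist[i, j] = dist[j, i] = 1.0 - p

Z = fastcluster.linkage(squareform(dist), method='centroid', preserve_input=True)
labels = hcluster.fcluster(Z, 0.5, criterion='distance')
print(dict(zip(ids, labels)))               # 'a', 'b' and 'c' should share a label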
def __call__(self, X): return linkage(X, method='average')
# Fragment: `index`, `i` and `fname` come from an earlier loop over the
# .distances files that assigns each PDB id a matrix position.
fname = os.path.basename(fname)
if fname.endswith('.distances'):
    pdb = fname[:-10]
    index[pdb] = i

length = len(index)
npArray = np.zeros((length, length))
for filename in glob.glob('/home/lmt72/PDBdistances/*.distances'):
    distanceFile = open(filename)
    filename = os.path.basename(filename)
    if filename.endswith('.distances'):
        pdb = filename[:-10]
        for line in distanceFile:
            data = line.split()
            secondProtein = data[0]
            distance = float(data[1].strip())
            i = index[pdb]
            npArray[i, index[secondProtein]] = distance
            npArray[index[secondProtein], i] = distance

names = ['' for x in range(length)]
for (name, i) in index.items():
    names[i] = name
print(npArray)

# cluster rows and columns and record the dendrogram leaf order for later plotting
Z1 = fcl.linkage(npArray, method='average')
l1 = sch.leaves_list(Z1)
D = npArray[l1]
Z2 = fcl.linkage(npArray.T, method='average')
l2 = sch.leaves_list(Z2)
D = D[:, l2]
pickle.dump((npArray, D, Z1, names), open("clusterstate.pickle", 'wb'), -1)
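# Small self-contained version of the reordering trick used above: cluster the
# (symmetric) distance matrix with average linkage and permute its rows and
# columns into dendrogram leaf order, so similar entries end up adjacent in a
# heatmap. The random matrix stands in for the PDB distance matrix.
import numpy as np
import fastcluster as fcl
import scipy.cluster.hierarchy as sch

rng = np.random.default_rng(2)
pts = rng.normal(size=(8, 3))
dmat = np.linalg.norm(pts[:, None, :] - pts[None, :, :], axis=-1)

order = sch.leaves_list(fcl.linkage(dmat, method='average'))
reordered = dmat[order][:, order]
print(order)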
def __call__(self, X): return linkage(X, method='ward')
def __call__(self, X): return linkage(X, method='complete')
num_points_per_bb = 50
data_generator = DataGenerator(bounding_boxes, num_points_per_bb, window)
points = data_generator.load_points_from_csv('points.csv')
#points = data_generator.generate_points()

canvas = Canvas(window, width=1024, height=768, bg='white')
colors = ['white', 'yellow', 'cyan', 'red', 'blue', 'brown', 'green']

np_points = np.zeros((len(points), 2))
for i in range(len(points)):
    np_points[i][0] = points[i].x
    np_points[i][1] = points[i].y

start_time = time.time()
np_clusters = fastcluster.linkage(np_points, method='single', metric='euclidean')
print(np_clusters)

clusters = []
for i in range(len(points)):
    cluster = Cluster(i)
    cluster.populate([points[i]])
    clusters.append(cluster)


def get_cluster_by_id(id):
    for i in range(len(clusters)):
        if clusters[i].id == id:
            return i
def __call__(self, X): return linkage(X, method='single')
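# The one-line __call__ methods above presumably belong to a small family of
# wrapper classes, one per linkage method; the reconstruction below is an
# assumption about that structure, not the original code, and it uses scipy's
# linkage (the snippets do not show which linkage function they import).
import numpy as np
from scipy.cluster.hierarchy import linkage


class LinkageMethod:
    method = None

    def __call__(self, X):
        return linkage(X, method=self.method)


class Single(LinkageMethod): method = 'single'
class Complete(LinkageMethod): method = 'complete'
class Average(LinkageMethod): method = 'average'
class Weighted(LinkageMethod): method = 'weighted'
class Ward(LinkageMethod): method = 'ward'


# usage: pass an instance wherever a "linkage callable" is expected
Z = Ward()(np.random.default_rng(0).normal(size=(10, 3)))
print(Z.shape)  # (9, 4)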
def __initial_match(self, candidate_list: (np.ndarray, np.generic), min_pts=2, t=50, criterion='distance'):
    # TODO group matching for non-grouped users
    # 1 : dbscan algorithm + gps based movement vector alignment -> clear!
    # 2 : acceleration -> let's discuss
    """Performs initial clustering on candidate_list (n x T x 2 numpy array) and returns group lists.

    Parameters
    ----------
    candidate_list : array of shape (n_samples, n_time_steps, 2), pairs of latitude and longitude
    min_pts : minimum members of a group for the HDBSCAN algorithm
    t : scalar
        For criteria 'inconsistent', 'distance' or 'monocrit', this is the threshold to apply when
        forming flat clusters. For 'maxclust' or 'maxclust_monocrit' criteria, this would be the
        maximum number of clusters requested.
    criterion : str, optional
        The criterion to use in forming flat clusters. This can be any of the following values:

        ``inconsistent`` : If a cluster node and all its descendants have an inconsistent value less
        than or equal to `t`, then all its leaf descendants belong to the same flat cluster. When no
        non-singleton cluster meets this criterion, every node is assigned to its own cluster. (Default)

        ``distance`` : Forms flat clusters so that the original observations in each flat cluster have
        no greater a cophenetic distance than `t`.

        ``maxclust`` : Finds a minimum threshold ``r`` so that the cophenetic distance between any two
        original observations in the same flat cluster is no more than ``r`` and no more than `t` flat
        clusters are formed.

        ``monocrit`` : Forms a flat cluster from a cluster node c with index i when ``monocrit[j] <= t``.
        For example, to threshold on the maximum mean distance as computed in the inconsistency matrix R
        with a threshold of 0.8 do::

            MR = maxRstat(Z, R, 3)
            cluster(Z, t=0.8, criterion='monocrit', monocrit=MR)

        ``maxclust_monocrit`` : Forms a flat cluster from a non-singleton cluster node ``c`` when
        ``monocrit[i] <= r`` for all cluster indices ``i`` below and including ``c``. ``r`` is minimized
        such that no more than ``t`` flat clusters are formed. monocrit must be monotonic. For example,
        to minimize the threshold t on maximum inconsistency values so that no more than 3 flat clusters
        are formed, do::

            MI = maxinconsts(Z, R)
            cluster(Z, t=3, criterion='maxclust_monocrit', monocrit=MI)

    Returns
    ----------
    groups : list of shape (n_clusters, n_members)

    Examples
    ----------
    >>> candidate_list = np.array([...])  # shape (5, 3, 2), labels of candidate_list = [0, 1, 0, 1, 0]
    >>> groups = [[0, 2, 4], [1, 3]]
    """
    assert isinstance(candidate_list, (np.ndarray, np.generic))
    num_of_data, num_time_steps, _ = candidate_list.shape
    # cluster on the last time step's GPS position of every candidate
    X = np.array([candidate_list[i, num_time_steps - 1, :] for i in range(num_of_data)])
    rads = np.radians(X)  # [N, 2]

    # Clustering with gps-data of one time step;
    # 'haversine' clusters using distances computed from (lat, long) given in radians
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_pts, min_samples=2, metric='haversine')
    labels = clusterer.fit_predict(rads)
    print('Before trajectory clustering, labels are ', labels)
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    groups = []
    for ulb in range(n_clusters_):
        groups.append([])
    for i, lb in enumerate(labels):
        if lb == -1:
            continue
        groups[lb].append(i)
    total_n_clusters = n_clusters_

    # Group refinement considering the users' trajectories
    for nc in range(n_clusters_):
        group_member_mask = (labels == nc)
        group_members = candidate_list[group_member_mask]
        pdist = tdist.pdist(group_members.transpose([0, 2, 1]), metric="sspd", type_d="spherical")
        Z = fc.linkage(pdist, method="ward")
        sub_labels = sch.fcluster(Z, t, criterion=criterion) - 1
        unique_sub_labels = len(set(sub_labels))
        if unique_sub_labels == 1:
            continue
        for ad in range(unique_sub_labels - 1):
            groups.append([])
        member_indices = list(compress(range(len(group_member_mask)), group_member_mask))
        for sb in range(unique_sub_labels):
            sub_group_mask = (sub_labels == sb)
            sub_member_indices = list(compress(range(len(sub_group_mask)), sub_group_mask))
            # Noise case
            if len(sub_member_indices) == 1:
                groups[nc].remove(member_indices[sub_member_indices[0]])
                labels[member_indices[sub_member_indices[0]]] = -1
                continue
            for m in range(len(sub_member_indices)):
                # remove from the wrong group
                groups[nc].remove(member_indices[sub_member_indices[m]])
                # add to the refined group
                groups[total_n_clusters].append(member_indices[sub_member_indices[m]])
                labels[member_indices[sub_member_indices[m]]] = total_n_clusters
            total_n_clusters += 1

    print('After trajectory clustering, labels are ', labels)
    return groups.copy()
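# Self-contained sketch of the first stage only (the trajectory refinement
# above additionally needs the traj_dist package and real tracks): HDBSCAN on
# last-known positions with the haversine metric, which expects coordinates in
# radians. The coordinates below are made up.
import numpy as np
import hdbscan

latlon = np.array([[48.8566, 2.3522], [48.8570, 2.3519], [48.8568, 2.3525],
                   [40.7128, -74.0060], [40.7130, -74.0055], [40.7127, -74.0061]])
labels = hdbscan.HDBSCAN(min_cluster_size=2, min_samples=2,
                         metric='haversine').fit_predict(np.radians(latlon))
print(labels)  # two groups expected, one per city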