def cluster_fps(self):
        clkg = hcluster.linkage(self.dm,method = 'average') 
        coarse_r = hcluster.fcluster(clkg,0.3,criterion = 'distance')
        self.coarse_r = coarse_r

        bcount = np.bincount(coarse_r)
        knum = len(np.nonzero(bcount > 1)[0])

        s = self.density_matrix.shape
        if False and len(s) > 1 and s[0] > 10 and s[1] > 10 and knum < min(s) / 2:  # SVD refinement path is disabled
            (u,s,vt) = la.svds(self.sps_matrixs,k = knum)
            self.u = u
            print('============')
        else:
            
            self.result = self.coarse_r
            return (clkg,clkg)
 

#rankA = npla.matrix_rank(self.sps_matrixs)
#        if rankA < 3:
        a = np.matrix(np.diag(s)) * np.matrix(vt)
        pd = dist.pdist(np.array(a.T),'cosine')
        pd[np.abs(pd) < 1e-11] = 0
        lkg = hcluster.linkage(pd,method = 'average')
        self.lkg = lkg

        self.result = hcluster.fcluster(lkg,self.svd_cluster_thr,criterion = 'distance')

#        self.result = hcluster.fcluster(lkg,1)

# self.result = hcluster.fclusterdata(u,0.7,metric = 'cosine', criterion = 'distance',method = 'average')
        return (lkg,clkg)
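
For reference, a minimal standalone sketch of the coarse pass above (average linkage over a condensed cosine-distance matrix, cut at a fixed distance threshold); the toy data and the 0.3 threshold are illustrative only, not taken from the original project.

import numpy as np
import scipy.cluster.hierarchy as hcluster
import scipy.spatial.distance as dist

pts = np.random.rand(20, 4)                      # toy fingerprint vectors
dm = dist.pdist(pts, 'cosine')                   # condensed distance matrix, like self.dm
clkg = hcluster.linkage(dm, method='average')
coarse_r = hcluster.fcluster(clkg, 0.3, criterion='distance')
print(len(np.unique(coarse_r)), 'coarse clusters')
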
Example #2
 def elbow(self, no_plot=False):
     """Plot within groups variance vs. number of clusters.
     Elbow criterion could be used to determine number of clusters.
     """
     from scipy.cluster.hierarchy import fcluster
     import matplotlib.pyplot as plt
     idx = fcluster(self.Z, len(self.data), criterion='maxclust')
     nclust = list(np.arange(1, np.sqrt(idx.max() / 2) + 1, dtype=int))
     within_grp_var = []
     mean_var = []
     for n in nclust:
         idx = fcluster(self.Z, n, criterion='maxclust')
         grp = [np.flatnonzero(idx == c) for c in np.unique(idx)]
         # between_grp_var = Group([self.data[ix].R.uv for ix in grp]).var
         var = [100*self.data[ix].var for ix in grp]
         within_grp_var.append(var)
         mean_var.append(np.mean(var))
     if not no_plot:
         plt.boxplot(within_grp_var, positions=nclust)
         plt.plot(nclust, mean_var, 'k')
         plt.xlabel('Number of clusters')
         plt.ylabel('Variance')
         plt.title('Within-groups variance vs. number of clusters')
         plt.show()
     else:
         return nclust, within_grp_var
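
A hedged sketch of the same elbow idea outside the class, assuming only a raw data matrix; the per-cluster total variance used here is a stand-in for whatever self.data[ix].var computes in the original.

import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster

X = np.random.rand(60, 3)
Z = linkage(X, method='ward')
for n in range(1, 8):
    idx = fcluster(Z, n, criterion='maxclust')
    within = [X[idx == c].var(axis=0).sum() for c in np.unique(idx)]
    print(n, 'clusters -> mean within-group variance %.3f' % np.mean(within))
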
Example #3
	def hcluster_cols(self, thresh):
		try:
			link = linkage(self.X.T, method='complete', metric = 'cosine')
			assignments = fcluster(link, thresh, 'distance')

		except Exception:  # fall back if the cosine metric fails
			link = linkage(self.X.T, method='complete', metric = 'euclidean')
			assignments = fcluster(link, thresh, 'distance')

		col_ind = np.arange(len(self.crimes))
		d = pd.DataFrame(list(zip(col_ind, assignments))).groupby(1)[0].aggregate(lambda x: tuple(x))
		df_new = pd.DataFrame(index = np.arange(len(self.names)))
		for i in d:
			cols = []
			for w in i:
			    cols.append(w)
			if len(cols) > 1:
				df_new[str(self.crimes[cols])] = np.mean(self.X[:,cols], axis = 1)
			else:
			    df_new[str(self.crimes[cols[0]])] = self.X[:,cols[0]]

		# plt.figure(figsize=(10,20))
		# dendro = dendrogram(link, color_threshold=thresh, leaf_font_size=13, labels = self.crimes, orientation = 'left')
		# plt.subplots_adjust(top=.99, bottom=0.5, left=0.05, right=0.99)
		# plt.show()

		self.df = df_new
		self.crimes = df_new.columns.values
Example #4
def refineEnsemble(ens, lower=.5, upper=10.):
    """Refine a PDB ensemble based on RMSD criterions.""" 

    from scipy.cluster.hierarchy import linkage, fcluster
    from scipy.spatial.distance import squareform
    from collections import Counter

    ### calculate pairwise RMSDs ###
    RMSD = ens.getRMSDs(pairwise=True)

    # convert the RMSD table to the compressed form
    v = squareform(RMSD)

    ### apply upper threshold ###
    Z_upper = linkage(v, method='complete')
    labels = fcluster(Z_upper, upper, criterion='distance')
    most_common_label = Counter(labels).most_common(1)[0][0]
    I = np.where(labels==most_common_label)[0]

    ### apply lower threshold ###
    Z_lower = linkage(v, method='single')
    labels = fcluster(Z_lower, lower, criterion='distance')
    uniq_labels = np.unique(labels)

    clusters = []
    for label in uniq_labels:
        indices = np.where(labels==label)[0]
        clusters.append(indices)

    J = np.ones(len(clusters), dtype=int) * -1
    rmsd = None
    for i, cluster in enumerate(clusters):
        if len(cluster) > 0:
            # find the conformations with the largest coverage 
            # (the weight of the ref should be 1)
            weights = [ens[j].getWeights().sum() for j in cluster]
            js = np.where(weights==np.max(weights))[0]

            # in the case where there are multiple structures with the same weight,
            # the one with the smallest rmsd wrt the ens._coords is selected. 
            if len(js) > 1:
                # rmsd is not calulated unless necessary for the sake of efficiency
                rmsd = ens.getRMSDs() if rmsd is None else rmsd
                j = js[np.argmin(rmsd[js])]
            else:
                j = js[0]
            J[i] = cluster[j]
        else:
            J[i] = cluster[0]

    ### refine ensemble ###
    K = np.intersect1d(I, J)

    reens = ens[K]

    return reens
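
The same two-threshold pattern in isolation, assuming a square pairwise RMSD matrix is already available (the values below are synthetic): complete linkage with the upper cutoff keeps the largest tight group, single linkage with the lower cutoff collapses near-duplicates.

import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform
from collections import Counter

n = 12
RMSD = np.random.rand(n, n)
RMSD = (RMSD + RMSD.T) / 2.0          # symmetrize
np.fill_diagonal(RMSD, 0.0)
v = squareform(RMSD)                  # compressed form expected by linkage

labels_upper = fcluster(linkage(v, method='complete'), 0.8, criterion='distance')
I = np.where(labels_upper == Counter(labels_upper).most_common(1)[0][0])[0]

labels_lower = fcluster(linkage(v, method='single'), 0.1, criterion='distance')
print(len(I), 'kept by the upper cutoff,', len(np.unique(labels_lower)), 'groups under the lower cutoff')
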
Example #5
def cutTree(z, threshold, crit):
    try:
        z = np.clip(z,0,9999999)
        tree = hac.fcluster(z, threshold, criterion = crit)
        return tree
    except ValueError as e:
        print("cutTree: %s" % str(e))
        tree = hac.fcluster(z, 50, criterion="distance")  # fall back to a plain distance cut
        print("negative values in matrix")
        return tree
Example #6
def process_stay(imei,traj):
#    print imei,'------------------------>',traj.shape
    r = 20
    interval = 60*8
#    wfs = wfs[:1000]
#    traj = traj[:1000]
    if len(traj.shape) < 1 or traj.shape[0] <2:
        return
    x = traj['x']
    y = traj['y']
    in_sample = False
#print x,y
    if sample_range is not None:
        for (cx,cy,cr) in sample_range:
            crange = math.sqrt(math.pow(cx-x[0],2) + math.pow(cy-y[0],2))
            if crange < cr:
                in_sample = True
                break
    #ids = grid_util.get_grid_ids(np.median(x),np.median(y),300,3)
        if not in_sample:
            return
    
    ids = G.get_gridids_with_align(np.median(x),np.median(y))
#
#    print traj
    dm = get_pdist(traj,100,convert_sig = True)
    dm[np.abs(dm) < 1e-3] = 0
#    print dm
#    print dm.shape
#lkg = hcluster.linkage(traj[...,:2],metric = 'euclidean',method = 'average')
#    print dm
#    print dm.shape
    lkg = hcluster.linkage(dm,method = 'average')
    rst = hcluster.fcluster(lkg,0.7,criterion = 'distance') #rough dist
    rst_merge = hcluster.fcluster(lkg,0.2,criterion = 'distance') #rough dist
    seg = []
    for i in range(len(rst) + 1):
        if i == 0 or i == len(rst) or rst[i] != rst[i-1]:
            seg.append(i)
#
#    print rst
#    print rst_merge
#    print seg
    for (s,e) in zip(seg[:-1],seg[1:]):
        seg_traj = traj[s:e]
        seg_id = rst_merge[s:e]
        itl = seg_traj[-1]['t'] - seg_traj[0]['t']
        if itl > interval:
            print_merge_fp(ids,imei,seg_traj,seg_id,itl) 
Example #7
def clusterTrajectories(
    trajectories, fname, path, metric_func=trajectoryDissimilarityL2, user_distance_matrix=None, criterion="distance"
):
    """
	trajectories: the trajectories need to be in XY coordinates
	"""
    plot_path = utils.queryPath(path + "/plots")
    if user_distance_matrix is None:
        distance_matrix = getTrajectoryDistanceMatrix(trajectories, metric_func)
        writeToCSV.saveData(distance_matrix, path + "/" + fname)  # save the distance_matrix
    else:
        distance_matrix = user_distance_matrix
        assert len(distance_matrix) == len(
            trajectories
        ), "distance_matrix (n, n) and trajectories(n) should have same number of samples"

    print "distance_matrix:\n", distance_matrix

    v = DIST.squareform(distance_matrix)
    cluster_result = HAC.linkage(v, method="average")
    dg = HAC.dendrogram(cluster_result)
    plt.xlabel("cluster_dengrogram_{fname}".format(fname=fname))
    plt.savefig("{path}/cluster_dengrogram_{fname}.png".format(fname=fname, path=plot_path))
    plt.clf()

    if criterion == "distance":
        if metric_func == trajectoryDissimilarityL2:
            this_cluster_label = HAC.fcluster(
                Z=cluster_result, t=1 * 1000, criterion="distance"
            )  # distance for l2 measure
        elif metric_func == trajectoryDissimilarityCenterMass:
            this_cluster_label = HAC.fcluster(
                Z=cluster_result, t=1.5, criterion="distance"
            )  # distance for center of mass measure
    elif criterion == "inconsistent":
        this_cluster_label = HAC.fcluster(Z=cluster_result, t=0.8, criterion="inconsistent")

    print "this_cluster_label:", this_cluster_label, "number of clusters:", len(set(this_cluster_label))

    """Plot the representative trajectories"""
    plotRepresentativeTrajectory(
        this_cluster_label,
        trajectories,
        fname="cluster_centroids_{n}_classes".format(n=len(set(this_cluster_label))),
        path=plot_path,
        show=False,
    )

    return this_cluster_label, [this_cluster_label], []
Example #8
def hclustering(data, t):
    #row_dist = pd.DataFrame(squareform(pdist(data, metric='euclidean')))
    row_dist = np.corrcoef(data)
    #row_dist = data
    row_clusters = linkage(row_dist, method='ward')
    ind = fcluster(row_clusters, t, criterion='maxclust')
    return ind
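
Note that linkage() treats a square input as a matrix of observation vectors, so passing np.corrcoef(data) directly (as above) clusters the rows of the correlation matrix rather than a precomputed distance. A hedged alternative, if a correlation-based distance is intended, converts the correlations to a condensed distance first; average linkage is used here because Ward assumes raw Euclidean observations.

import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform

data = np.random.rand(10, 50)
d = 1.0 - np.corrcoef(data)                     # correlation -> dissimilarity
np.fill_diagonal(d, 0.0)
row_clusters = linkage(squareform(d, checks=False), method='average')
ind = fcluster(row_clusters, 3, criterion='maxclust')
print(ind)
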
Example #9
def plot_transition_clustermap(data_array, gene_names, pseudotimes, n_clusters=10, gradient=False):
    if gradient:
        data_to_plot = zscore(np.gradient(data_array)[1].T, axis=0)
        scale = None
        metric = 'seuclidean'
        row_linkage = linkage(pdist(abs(data_to_plot), metric=metric), method='complete')
    else:
        data_to_plot = data_array.T
        scale = 0
        metric = 'correlation'
        row_linkage = linkage(pdist(data_to_plot, metric=metric), method='complete')
    
    assignments = fcluster(row_linkage, n_clusters, criterion='maxclust')
    cm = sns.clustermap(data_to_plot, col_cluster=False, standard_scale=scale, 
                        yticklabels=gene_names, row_linkage=row_linkage,
                        row_colors=[settings.STATE_COLORS[i] for i in assignments])
    r = np.arange(10, data_array.shape[0], data_array.shape[0] // 10)
    plt.setp(cm.ax_heatmap.get_yticklabels(), fontsize=5)
    cm.ax_heatmap.set_xticks(r)
    cm.ax_heatmap.set_xticklabels(['%.1f' % x for x in pseudotimes[r]])
    cm.ax_heatmap.set_xlabel('Pseudotime')
    cm.ax_heatmap.set_ylabel('Gene')
    
    gene_clusters = defaultdict(list)
    for i, cl in enumerate(assignments):
        gene_clusters[settings.STATE_COLORS[cl]].append(gene_names[i])
    return gene_clusters
Example #10
File: tfe.py Project: cinai/ODtrips
def get_ROIs(df_sequence,x,limit_meters):
	# find the origin transaction points
	X,locations,pi_locations = get_latlong_points(df_sequence)
	if len(locations) == 1:
		return [[{"lat":X[0,0],"long":X[0,1]}],1.0]
	elif len(locations) < 1:
		return None
	# build the dendrogram
	Z = linkage(X,'weighted',lambda x,y: vincenty(x,y).meters)
	clusters = fcluster(Z,limit_meters,criterion='distance')
	centroids = []
	nums_by_clusters =[]
	pi_sums = []
	the_clusters = []
	# join pi_sums of locations that are in the same cluster
	for i in range(len(clusters)):
		indice = buscar_locacion(the_clusters,clusters[i])
		if indice < 0:
			the_clusters.append(clusters[i])
			indice = len(the_clusters)-1
			pi_sums.append(0)
			nums_by_clusters.append(0)
			centroids.append({"lat":0,"long":0})
		pi_sums[indice] += pi_locations[i]
		centroids[indice]["lat"] += X[i,0]
		centroids[indice]["long"] += X[i,1]
		nums_by_clusters[indice] += 1

	the_indexs, the_sum = get_upToX_pi_locations(np.asarray(pi_sums),x)
	the_centroids = []
	for i in the_indexs:
		the_centroids.append({"lat":centroids[i]["lat"]/nums_by_clusters[i],"long":centroids[i]["long"]/nums_by_clusters[i]})
	return [the_centroids,the_sum]
Example #11
def clusterize_hierarchical(peakels, matrix_dist, cut, clip=False):
    """

    :param clip:
    :param peakels:
    :param matrix_dist:
    :param method:
    :param cut:
    """
    # negative values in the distance matrix lead to a ValueError,
    # so optionally clip them before computing the linkage
    if clip:
        np.clip(matrix_dist, 0, 1, matrix_dist)
    k = linkage(matrix_dist, method='complete')

    #dist = maxdists(k)
    #fit = norm.fit(dist)
    #cut = np.percentile(dist, 10.0)  #norm.ppf(5.0, loc=fit[0], scale=fit[1])

    k2 = fcluster(k, cut, criterion='distance')
    clust_by_id = ddict(list)
    for i, v in enumerate(k2):
        clust_by_id[v].append(peakels[i])
    return clust_by_id.values()
Example #12
def hist_per_stagione(start=1992, end=2012):
	stagione=(all_labels > start) & (all_labels < end)
	dist_selected=dist[ix_(stagione,stagione)]
	Z=linkage(squareform(dist_selected),method='complete')
	n=choose_p(Z)
	c=fcluster(Z,n,criterion='maxclust')-1

	label_anni=all_labels[stagione]
	#order by first appearance!
	first_appearance=[]
	for i in range(0,n):
	    first_appearance.append(min(label_anni[c==i]))

	order1=[index for key,index in sorted(zip(first_appearance,range(0,n)))]
	order2=[index for key,index in sorted(zip(order1,range(0,n)))]
	order=array(order2)
	c=order[c]

	#draw scatter plot
	scatter(label_anni,c,s=100,c=c)
	#grid(b=True,axis='y')
	yticks(range(0,n+1))
	xlim((min(label_anni)-0.5,max(label_anni)+0.5))
	ax=gca()
	for i in range(1993,2011+1):
		ax.add_line(Line2D([i+7./12,i+7./12],[0,n+1],linestyle='--'))
	show()
Example #13
def main(): #clustering and write output
    if len(pep_array)>1:
        matrix=[]
        for i in range(0,len(pep_array)):
            matrix.append(pep_array[i][4].replace('\"',"").split(','))

        dataMatrix=numpy.array(matrix,dtype=float)
        d = sch.distance.pdist(dataMatrix,metric)# vector of pairwise distances
        if metric=="correlation":
            D = numpy.clip(d,0,2) #when using correlation, all values in distance matrix should be in range[0,2]
        else:
            D=d
        try:
            cutoff=float(t)
        except ValueError:
            print "please provide a numeric value for --t"; sys.exit()
        L = sch.linkage(D, method,metric)
        ind = sch.fcluster(L,cutoff,'distance')#distance is dissmilarity(1-correlation)
        p=numpy.array(pep_array)
        p=numpy.column_stack([p,ind])
        formatoutput(p)
    else:
        p=numpy.array(pep_array)
        p=numpy.column_stack([p,[0]])
        formatoutput(p)
Example #14
    def order(self, method='complete', metric='euclidean', inplace=False):
        """
        Rearrange the order of rows and columns after clustering

        :param method: any scipy method (e.g., single, average, centroid,
            median, ward). See scipy.cluster.hierarchy.linkage
        :param metric: any scipy distance (euclidean, hamming, jaccard)
            See scipy.spatial.distance or scipy.cluster.hierarchy
        :param bool inplace: if set to True, the dataframe is replaced

        You probably do not need to use that method. Use :meth:`plot` and
        the two parameters order_metric and order_method instead.
        """
        from scipy.cluster.hierarchy import fcluster, dendrogram
        Y = self.linkage(self.df, method=method, metric=metric)
        ind1 = fcluster(Y, 0.7 * max(Y[:, 2]), 'distance')
        Z = dendrogram(Y, no_plot=True)
        idx1 = Z['leaves']
        cor2 = self.df.iloc[idx1, idx1]
        if inplace is True:
            self.df = cor2
        else:
            return cor2
        self.Y = Y
        self.Z = Z
        self.idx1 = idx1
        self.ind1 = ind1
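
For context, the 0.7 * max(Y[:, 2]) cut used above mirrors scipy's default dendrogram color threshold. A minimal standalone sketch of the same ordering step on a correlation matrix (data and column names are illustrative):

import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram

df = pd.DataFrame(np.random.rand(30, 6), columns=list('abcdef')).corr()
Y = linkage(df.values, method='complete', metric='euclidean')
ind1 = fcluster(Y, 0.7 * max(Y[:, 2]), 'distance')
idx1 = dendrogram(Y, no_plot=True)['leaves']
ordered = df.iloc[idx1, idx1]         # rows and columns reordered by leaf order
print(ind1)
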
Example #15
    def get_cluster(self,cluster_data):
        cluster_value = []
        for index in range(len(cluster_data)):
            if cluster_data[index] == True:
                cluster_value.append(index)

        dimension = len(cluster_value)
        distance_matrix=[[0 for row in range(dimension)] for col in range(dimension)]
        for row in range(dimension):
            for col in range(dimension):
                distance_matrix[row][col]=abs(cluster_value[row]-cluster_value[col])
        distance_array = distance.squareform(distance_matrix)
        clusters=hierarchy.linkage(distance_array, method='weighted', metric='euclidean')
        T = hierarchy.fcluster(clusters, self.cluster_distance, criterion='distance')
        temp_holder = {}
        for item in range(max(T)):
            temp_holder[item+1] = []

        for index in range(dimension):
            temp_holder[T[index]].append(cluster_value[index])

        # print temp_holder
        # print T
        # print cluster_value
        return temp_holder
Example #16
def optimal_cutoff(Y,dist_mat,min_size):
    labels = np.array([sch.fcluster(Y,c,criterion='distance') for c in Y[:,2]])
    score = np.array([metrics.silhouette_score(dist_mat,l) for l in labels[:-min_size]])
    c = Y[:-min_size,2]
    f = interp(c,-score,kind='linear')
    opt_c = opt.fmin(f,x0=c[2*min_size])
    return opt_c
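
A hedged variant of the same idea without the interpolation step, assuming a precomputed square distance matrix: scan the merge heights in Y[:, 2] as candidate cutoffs and keep the one with the best silhouette score (metric='precomputed' so the distance matrix is used as-is).

import numpy as np
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import pdist, squareform
from sklearn import metrics

X = np.random.rand(40, 5)
dist_mat = squareform(pdist(X))
Y = sch.linkage(pdist(X), method='average')

best = None
for c in Y[:-2, 2]:                              # skip the top merges so at least 2 clusters remain
    labels = sch.fcluster(Y, c, criterion='distance')
    if len(np.unique(labels)) < 2:
        continue
    s = metrics.silhouette_score(dist_mat, labels, metric='precomputed')
    if best is None or s > best[1]:
        best = (c, s)
print('best cutoff %.3f (silhouette %.3f)' % best)
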
Example #17
    def _run_hier_clust_on_centroids(self,method='average'):
        '''
        runs hierarchical clustering based on the centroids of the data per scipy's methods

        '''

        uniqueLabels = np.sort(np.unique(self.templateLabels))
        centroids = np.array([self.templateMat[np.where(self.templateLabels == i)[0],:].mean(axis=0) for i in uniqueLabels])
               
        self.y = pdist(centroids)
        self.z = hierarchy.linkage(self.y,method)
        r2 = hierarchy.inconsistent(self.z,2)

        ## rank the average linkage heights by standard deviation, then report the averages
        meanHeights = r2[:,0]
        stdHeights = r2[:,1]
        rankedInds = np.argsort(stdHeights)[::-1]
        bestCutPoints = meanHeights[rankedInds]

        ## save centroid labels for all cuts of the dendrogram
        allCentroidLabels = {}
        rankedK = []
        for cp in bestCutPoints:
            centroidLabels = hierarchy.fcluster(self.z,t=cp,criterion='distance')
            k = len(np.unique(centroidLabels))
            if str(k) in allCentroidLabels:
                continue
            
            allCentroidLabels[str(k)] = centroidLabels 
            rankedK.append(k)
        
        centroidLabels = allCentroidLabels[str(rankedK[0])]
    
        ## save the top xx modes 
        self.bestModeLabels = []
        print('doing ranking...')
       
        for rk in rankedK[:25]:
            centroidLabels = allCentroidLabels[str(rk)]
            modeLabels = self._get_mode_labels(self.templateLabels,centroidLabels,uniqueLabels)
            self.bestModeLabels.append(modeLabels)
        
        ## provide silvalue ranks in case we wish to reorder the top xx modes by sil value
        self.modeSilValues = []
        self.modeSizes = []
        allEvents = [self.templateData]

        for count in range(len(self.bestModeLabels)):
            numClusters = np.unique(self.bestModeLabels[count]).size
            silValues = get_silhouette_values(allEvents,[self.bestModeLabels[count]],subsample=self.noiseSample,
                                              minNumEvents=5000,resultsType='raw')
            silMean = silValues['0'].mean()
            self.modeSilValues.append(silMean)
            self.modeSizes.append(numClusters)

        silValues = get_silhouette_values(allEvents,[self.templateLabels],subsample=self.noiseSample,
                                          minNumEvents=5000,resultsType='raw')
        self.clusterSilValues = silValues['0'].mean()
        self.modeSilValues = np.array(self.modeSilValues)
        self.modeSizes = np.array(self.modeSizes)
Example #18
def user_fp_group(data,key,user,filter = 'mid',merge = False,thr = 0.2):
#data = np.fromiter(data,dtype = dt)
    if len(data.shape) == 0 or data.shape[0] == 1:
        print('\t'.join([key,user,'%s' % data['wf_list'],str(data['x']),str(data['y']),'1']))
        return
    dists = get_pdist(data,100)
#print dists
    clusters = hcluster.linkage(dists,method = 'average')
#   print clusters
    r = hcluster.fcluster(clusters,thr,'distance')
    ids = np.unique(r)
    sz = []
    for id in ids:
        sz.append(data[r==id].shape[0])
    
    mid_size = max(1.1,max(sz) / 2.0)
    for id in ids:
        d = data[r==id]
        if filter == 'mid' and d.shape[0] < mid_size:
            continue
        if merge:
            print('\t'.join([key,user,wf_to_str(get_mean_wf(d)),str(np.median(d['x'])),str(np.median(d['y'])),str(get_largest_dur(d)),str(d.shape[0])]))
            continue
        for od in d:
            print('\t'.join([key,user,od['wf_list'],str(od['x']),str(od['y']),str(od['t']),str(id)]))
Example #19
def process(tag,infos,wf_lists,count):
    if wf_lists == None or infos == None:
        return

    x = infos['x']
    y = infos['y']
    imeis = infos['imei']
#wf_lists = np.fromiter(wf_lists,dtype = np.array)

    std_x = np.std(x)
    std_y = np.std(y)
    users_num = len(np.unique(imeis))
    if users_num < 3:
        return 
    if len(wf_lists.shape) < 2 or wf_lists.shape[1] < 2:
        return
    dists = sci_dist.pdist(wf_lists,'cosine')        
    dists[(dists < 1e-10)] = 0
    clusters = hierarchy.linkage(dists,method ='average')
    r = hierarchy.fcluster(clusters,0.3,'distance')

    for c in np.unique(r):
        idx = (r==c)
        c_x = np.median(x[idx] )
        c_y = np.median(y[idx] )
        c_std_x = np.std(x[idx])
        c_std_y = np.std(y[idx])
        c_user = len(np.unique(imeis[idx]))
        wfs = wf_lists[idx]
        wf =  np.sum(wfs,axis=0) / len(wfs)
        wf = [ '%d' % sig for sig in wf ]
        print('%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d' % (tag,'\t'.join(wf),c_x,c_y,c_user,std_x,std_y,c_std_x,c_std_y,count))
Example #20
def dunnindex_clusternumber(linkage, df_zscores, low=1, high=5, output_dir="."):
    index_list = []
    for n_clusters in range(low, high):

        assignments = hierarchy.fcluster(linkage, n_clusters, criterion="maxclust")
        df_assign_id = pd.DataFrame()

        df_assign_id['cluster_id'] = assignments
        clusters = np.unique(assignments)
        cluster_list = []  # for dunn index calculation

        for i in clusters:
            ids = np.nonzero(assignments == i)[0]  # starting from 0
            df_zscore_cluster = df_zscores.iloc[ids]
            cluster_list.append(df_zscore_cluster.values)

        dunn_index = dunn(cluster_list)
        print(n_clusters, ":", dunn_index)
        index_list.append(dunn_index)
    pl.figure()
    pl.plot(range(low, high), index_list, "*-")
    pl.xlabel("cluster number")
    pl.ylabel("dunn index")
    pl.savefig(output_dir + '/dunnindex_clusternumber.png')
    # pl.show()
    return
Example #21
def main(): #clustering and write output
    matrix=[]
    for i in range(0,len(proteinarray)):
        if calculate_ratio=="True":
            ratio_array=convert2ratio(proteinarray[i][2:],ref)
            matrix.append(ratio_array)
        else:
            matrix.append(proteinarray[i][2:])

    dataMatrix=np.array(matrix,dtype=float)
    if log_transform=="True":
        dataMatrix=np.log2(dataMatrix)

    if len(proteinarray)>1:
        d = sch.distance.pdist(dataMatrix,metric)# vector of pairwise distances
        if metric=="correlation":
            D = np.clip(d,0,2) #when using correlation, all values in distance matrix should be in range[0,2]
        else:
            D=d
        try:
            cutoff=float(t)
        except ValueError:
            print "please provide a numeric value for --t"; sys.exit()
        L = sch.linkage(D, method,metric)
        ind = sch.fcluster(L,cutoff,'distance')#distance is dissmilarity(1-correlation)
        p=np.array(proteinarray)[:,[0,1]] #slice first and second column of original data
        p=np.concatenate((p,dataMatrix),axis=1) # replace transformed data
        p=np.column_stack([p,ind]) # add cluster result to the last column
        formatoutput(p)
    else:
        p=np.array(proteinarray)[:,[0,1]]
        p=np.concatenate((p,dataMatrix),axis=1)
        p=np.column_stack([p,[0]])
        formatoutput(p)
Example #22
def create_hc(G, t=1.0):
    """
    Creates hierarchical cluster of graph G from distance matrix
    Maksim Tsvetovat ->> Generalized HC pre- and post-processing to work on labelled graphs and return labelled clusters
    The threshold value is now parameterized; useful range should be determined experimentally with each dataset
    """

    """Modified from code by Drew Conway"""
    
    ## Create a shortest-path distance matrix, while preserving node labels
    labels = list(G.nodes())
    path_length = dict(nx.all_pairs_shortest_path_length(G))
    distances=numpy.zeros((len(G),len(G))) 
    i=0   
    for u,p in path_length.items():
        j=0
        for v,d in p.items():
            distances[i][j]=d
            distances[j][i]=d
            if i==j: distances[i][j]=0
            j+=1
        i+=1
    
    # Create hierarchical cluster
    Y=distance.squareform(distances)
    Z=hierarchy.complete(Y)  # Creates HC using farthest point linkage
    # This partition selection is arbitrary, for illustrative purposes
    membership=list(hierarchy.fcluster(Z,t=t))
    # Create collection of lists for blockmodel
    partition=defaultdict(list)
    for n,p in zip(list(range(len(G))),membership):
        partition[p].append(labels[n])
    return list(partition.values())
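
A standalone usage sketch of the same pipeline against a concrete graph (hedged: the threshold 1.5 is arbitrary); it builds the shortest-path distance matrix indexed by node label rather than by dict iteration order.

import numpy as np
import networkx as nx
from scipy.cluster import hierarchy
from scipy.spatial import distance

G = nx.karate_club_graph()
labels = list(G.nodes())
spl = dict(nx.all_pairs_shortest_path_length(G))
D = np.array([[spl[u][v] for v in labels] for u in labels], dtype=float)

Z = hierarchy.complete(distance.squareform(D))   # farthest point linkage on the condensed distances
membership = hierarchy.fcluster(Z, t=1.5)        # default 'inconsistent' criterion, as in create_hc above
print(len(set(membership)), 'clusters')
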
Example #23
def ward_cluster(df_all, feature_names, max_cluster_num, output_dir, swc_path=None, RemoveOutliers=0,
                 datasetType='ivscc', plot_heatmap=1):
  print("\n\n\n  ***************  ward computation, max_cluster = %d  *************:" % max_cluster_num)

  if not os.path.exists(output_dir):
    os.mkdir(output_dir)

  ##### zscores  featuer plots
  df_zscores, df_all_outlier_removed, df_outliers = get_zscore_features(df_all, feature_names,
      output_dir + '/zscore.csv', RemoveOutliers)
  if (df_outliers.shape[0] > 0 ):
    output_single_cluster_results(df_outliers, output_dir, "outliers", swc_path)

  if plot_heatmap:
      if datasetType =='ivscc':
          link = heatmap_plot_zscore_ivscc(df_zscores, df_all_outlier_removed, output_dir, "feature zscores")
      if datasetType =='bbp':
          link = heatmap_plot_zscore_bbp(df_zscores, df_all_outlier_removed, output_dir, "feature zscores")
      if datasetType =='bigneuron':
          link = heatmap_plot_zscore_bigneuron(df_zscores, df_all_outlier_removed, output_dir, "feature zscores")
  else:
      link = hierarchy.linkage(df_zscores, method='ward', metric='euclidean')


  assignments = hierarchy.fcluster(link, max_cluster_num, criterion="maxclust")
  output_clusters(assignments, df_zscores, df_all_outlier_removed, feature_names, output_dir, swc_path)

  truncate_dendrogram(link,max_cluster_num,output_dir,0)

  return link,df_zscores
Example #24
def flatcluster(
    dRow, runLogs, interClusterDistance="complete", plotDendrogram=True, cMethod="inconsistent", cValue=2.5
):
    #    if 'inter-cluster distance' in clusterSetup.keys():
    #        method = clusterSetup['inter-cluster distance']
    #    else:
    #        method = 'complete'

    z = linkage(dRow, interClusterDistance)
    inc = inconsistent(z)
    # print inc

    if plotDendrogram:
        plotdendrogram(z)

    clusters = fcluster(z, cValue, cMethod)

    noClusters = max(clusters)
    print("Total number of clusters:", noClusters)
    for i in range(noClusters):
        counter = 0
        for j in range(len(clusters)):
            if clusters[j] == (i + 1):
                counter += 1
        print("Cluster", str(i + 1), ":", str(counter))

    global clusterCount
    clusterCount = noClusters
    print(len(clusters))
    print(len(runLogs))
    for i, log in enumerate(runLogs):
        log[0]["Cluster"] = str(clusters[i])

    return z, clusters, runLogs
Example #25
def run_ngram_model(cdev, cprc):
    print('____________________________________________________')
    print('running n-gram model')
    wcorp = []
    for i in cprc:
        wcorp.append(' '.join(cprc[i]['words']))
        
    vectorizer = CountVectorizer(analyzer='word', binary=True, min_df=max(int(len(wcorp)*0.0005), 5), ngram_range=(2,3))
    X = vectorizer.fit_transform(wcorp)
    Xclean, mapping = filter_rare(X)
    
    Xdense = np.matrix(Xclean).astype('float')
    X_scaled = preprocessing.scale(Xdense)
    X_normalized = preprocessing.normalize(X_scaled, norm='l2')
    
    textMatrix = pairwise_distances(X_normalized, metric='cosine')
    L = fastcluster.linkage(textMatrix, method='average')
    flat_textclust = hierarchy.fcluster(L, 0.5, 'distance')
    ttc = organize_clusters(flat_textclust)
    
    ncf = []
    for cl in ttc:
        ncf.append([mapping[t] for t in cl])
    print('detected', len(ncf), 'n-gram clusters')
    return ncf
Example #26
def cluster_words(k):

    ts = os.listdir('types')
    ts.sort(key=alphanum_key)
    ts = np.array(ts)

    T = fcluster(Z,k,criterion='maxclust')

    def words(i):
        cluster = ts[T == i]
        print(len(cluster))
        allwords = []

        for t in cluster:
            fname = 'types/{}'.format(t)
            with open(fname) as file:
                data = json.loads(file.read())
            desc = data['description']
            words = re.findall(r'\w+', desc.lower())
            allwords.extend(words)

        allwords = [word for word in allwords if word not in stop_words]

        counts = Counter(allwords)
        return counts

    return [words(i+1) for i in range(k)]
Example #27
def run_entity_model(cdev, cprc):
    print('____________________________________________________')
    print('running entity model')
    hdev, hprc, hmapping, entcorp, er = process_entities(cdev, cprc)
    print('removed', len(cdev) - len(hdev), 'documents', len(hdev), 'left')
    voc = build_voc(entcorp, 2)
    
    ent_vectorizer = CountVectorizer(vocabulary = voc)
    E = ent_vectorizer.fit_transform(hdev)
    
    Eclean, emapping = filter_rare(E, 0)

    E_dense = np.matrix(Eclean).astype('float')
    E_scaled = preprocessing.scale(E_dense)
    E_normalized = preprocessing.normalize(E_scaled, norm='l2')
    
    EMatrix = pairwise_distances(E_normalized, metric='cosine')
    EL = fastcluster.linkage(EMatrix, method='average')
    flat_eclust = hierarchy.fcluster(EL, 0.5, 'distance')
    ec = organize_clusters(flat_eclust, th = 3)
    
    ecf = []
    for cl in ec:
        ecf.append([hmapping[emapping[t]] for t in cl])
    print('detected', len(ecf), 'entity clusters')
    return ecf, voc
Example #28
def main():
#     distMatrix = loadDistanceMatrix()
#    linkage = saveLinkage(distMatrix)
#     linkage = loadLinkage()
#     loadFCluster()
#     R = dendrogram(linkage, truncate_mode='level',  p=4, show_contracted=True)
#     afile = open(r'/home/rojosewe/Dropbox/MAI90/tesis/structs/R5000.pkl', 'wb')
#     pickle.dump(R, afile);
#     afile.close();
    linkage = loadLinkage()
    print(len(linkage))
    k = 1.5
#   18 -> 54 
#   19 -> 46 
    
    R = dendrogram(linkage, color_threshold=6.8, show_contracted=True)
    pylab.savefig( "/home/rojosewe/Dropbox/MAI90/tesis/images/wordClustering/dgram446.8.png" )
#    print "cheese!"
    T = sch.fcluster(linkage, k, 'distance')
    n = len(T)
 #   print len(T)
    # calculate labels
    labels = np.zeros((n, 1))
    print(str(k) + ": " + str(max(T)))
    for i in range(n):
        labels[i,0] = int(T[i])
    with open(datafolder + 'labels.csv', 'w', newline='') as csvfile:
        csvw = csv.writer(csvfile)
        for i in range(n):
            csvw.writerow(labels[i,:])

    print('done writing')
Example #29
def clustering_scipy_dendrogram(features, n_clust, metric='euclidean', method='complete'):
  """Cluster feature vectors hierarchically and return flat cluster labels and centroids."""
  #x = pdist(features, metric)
  z = hac.linkage(features, method = method)
  #d = hac.dendrogram(z, p=30, truncate_mode=None, color_threshold=None, get_leaves=True, orientation='top', labels=None, count_sort=False, distance_sort=False, show_leaf_counts=True, no_plot=False, no_labels=False, color_list=None, leaf_font_size=None, leaf_rotation=None, leaf_label_func=None, no_leaves=False, show_contracted=False, link_color_func=None)
  #plt.show()
  
  #num_col = d['color_list']
  #cnt = Counter(num_col)
  #print cnt
  
  #n_clust = 100
  clusters = hac.fcluster(z, n_clust, criterion='maxclust')
  #print clusters
  
  num_elem = Counter(clusters)
  print(num_elem)
  
  centroids = to_codebook(features, clusters)
  #print temp
  #for i in range(len(temp)):
    #plt.plot(temp[i])
  
  #fig = plt.figure()
  #for ii in range(len(centroids)):
    #plt.subplot(4,2,ii)
    #plt.plot(centroids[ii])
    #plt.ylabel(np.array(ii+1))
  #plt.show()
  
  np.save('centroids',np.array(centroids))
  
  return clusters, centroids
Example #30
    def run_hierarchical(
        self,
        dm,
        nclusters,
        linkage_method,
        ):

        if dm.metric == 'rf':
            matrix = dm.add_noise(dm.matrix)
        else:
            matrix = dm.matrix

        linkmat = linkage(matrix, linkage_method)
        linkmat_size = len(linkmat)
        if nclusters <= 1:
            br_top = linkmat[linkmat_size - nclusters][2]
        else:
            br_top = linkmat[linkmat_size - nclusters + 1][2]
        if nclusters >= len(linkmat):
            br_bottom = 0
        else:
            br_bottom = linkmat[linkmat_size - nclusters][2]
        threshold = 0.5 * (br_top + br_bottom)
        T = fcluster(linkmat, threshold, criterion='distance')
        T = self.order(T)
        return T
Example #31
 def retrieve_cluster(self, number):
     """rr"""
     self.clusters = sch.fcluster(self.linkage,
                                  number,
                                  criterion='maxclust')
     return
Example #32
def linkage(dmat):
    condensed = squareform(dmat)  # convert the square distance matrix to the condensed form linkage expects
    linkmat = sch.single(condensed)
    return sch.fcluster(linkmat,0.0001)
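
Worth noting (a hedged aside, not from the original project): squareform() converts in both directions, and scipy's linkage routines expect the condensed vector, so the helper above only makes sense if dmat arrives as a square distance matrix.

import numpy as np
from scipy.spatial.distance import squareform
import scipy.cluster.hierarchy as sch

square = np.array([[0., 1., 2.],
                   [1., 0., 3.],
                   [2., 3., 0.]])
condensed = squareform(square)        # square -> condensed vector [1., 2., 3.]
linkmat = sch.single(condensed)       # linkage methods take the condensed form
print(sch.fcluster(linkmat, 0.0001))
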
Example #33
def addZone(poi, max_d):
    """assign a zone to the poi, clustering"""
    Z = linkage(poi[['x', 'y']], 'ward')
    zoneL = fcluster(Z, max_d, criterion='distance')
    # newZone = np.isnan(poi['id_zone'])
    return zoneL
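
A small usage sketch with synthetic points, assuming a pandas DataFrame with 'x' and 'y' columns like the poi argument above; the max_d of 50 is only illustrative.

import numpy as np
import pandas as pd

poi = pd.DataFrame({'x': np.random.rand(30) * 1000,
                    'y': np.random.rand(30) * 1000})
poi['id_zone'] = addZone(poi, max_d=50)
print(poi['id_zone'].value_counts())
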
Example #34
def perform_inference(model, points, inferenceMethods, threshDict, metricsForEval, scaleDist):
	"""
	
	:param model:
	:param points: Dictionary with key as pid and value as (pointVector, clusterId)
	:param inferenceMethods:
	:param threshDict:
	:param metricsForEval:
	:param scaleDist:
	:return:
	"""
	torch.cuda.empty_cache()
	start = time.time()

	pidList 	= sorted(list(points.keys()))
	pointList 	= [points[pid][0] for pid in pidList]
	pidToGtClust= {pid:points[pid][1] for pid in pidList}
	gtList 		= {points[pid][1]:0 for pid in pidList}
	for pid in pidList:
		gtList[points[pid][1]] += 1
		
	numPoints 	= len(pointList)

	results = {}
	transformedPointList = None
	numComponents = 0
	
	if isinstance(model, MahalanobisDist):  # Tranform points if using Mahalanobis distance
		transformedPointList = model.transformPoints(pointList)
		linkMetric = "euclidean"
	else:
		raise Exception("Can not perform inference in this function with model type={}".format(type(model)))

	dendPurity = 0
	torchDistMat = model.batchForwardWithin(pointList)
	distMat_NP 	 = torchDistMat.cpu().data.numpy()
	
	y_true = []
	for idx, pid in enumerate(pidList):
		y_true.append(pidToGtClust[pid])
		
	for method in inferenceMethods:
		mStart = time.time()
		if method == "connComp":
			t1 = time.time()
			connCompThresh = threshDict["connComp"]
			sparseMatrix = comp_sparse_adj_mat(model, pointList, connCompThresh)
			t2 = time.time()
			print("Time taken for computing sparseMatrix:{:.3f}".format(t2-t1))
			
			x = connected_components(sparseMatrix)
			numComponents 		= x[0]
			connectedComponents = x[1]

			y_pred = []
			for idx,pid in enumerate(pidList):
				y_pred.append( connectedComponents[idx] )
		elif method == "recSparsest":
			
			labels = np.array([points[pid][1] for pid in pidList])
			new_dist_mat_NP =np.max(distMat_NP) - distMat_NP
			linkTree = run_sparsest_cut(new_dist_mat_NP, labels)
			y_pred = y_true
			if "dendPurity" in metricsForEval:
				dendPurity = calc_dend_purity(linkTree=linkTree, pidList=pidList, y_true=y_true)
			
		elif method == "random":
			y_pred, dendPurity = run_random_split(pidToCluster=pidToGtClust, k=len(gtList))

		elif method.startswith("linkage"):
			if method == "linkage_min" or method == "linkage_max":
				# raise Exception("Use singleLink or compLink inference method instead")
				linkageAlpha = method[-3:]
				flatClusters, dendPurity = runHAC(origDistMat=distMat_NP, k=len(gtList), linkAlpha=linkageAlpha, numPoints=numPoints, pidToCluster=pidToGtClust, threshold=None, scaleDist=scaleDist)
				y_pred = flatClusters
			elif method == "linkage_min@t" or method == "linkage_max@t":
				# raise Exception("Use singleLink or compLink inference method instead")
				linkageAlpha = method[-5:-2]
				threshold = threshDict[method]
				flatClusters, dendPurity = runHAC(origDistMat=distMat_NP, k=None, linkAlpha=linkageAlpha, numPoints=numPoints, pidToCluster=None, threshold=threshold, scaleDist=scaleDist)
				y_pred = flatClusters
			else:
				if method.startswith("linkage_auto"):
					if hasattr(model, "linkAlpha"):
						linkageAlpha = float(model.linkAlpha.data.cpu().numpy()[0])
					else:
						print("Not evaluating for method = {}".format(method, str(model)))
						continue
				else:
					try:
						if method.endswith("@t"):
							linkageAlpha = float(method[:-2].split("_")[-1])
						else:
							linkageAlpha = float(method.split("_")[-1])
					except:
						raise Exception("Invalid value of linkageAlpha = {}. Eg use method=linkage_1.0".format(method))
				
				if method.endswith("@t"): # Use a threshold to get flat clusters
					threshold = threshDict[method]
					flatClusters, dendPurity = runHAC_allEdges(origDistMat=distMat_NP, k=None, linkAlpha=linkageAlpha, numPoints=numPoints, pidToCluster=None, threshold=threshold, scaleDist=scaleDist)
				else: # Use number of gt clusters to get flat clusters
					if "dendPurity" in metricsForEval:
						flatClusters, dendPurity = runHAC_allEdges(origDistMat=distMat_NP, k=len(gtList), linkAlpha=linkageAlpha, numPoints=numPoints, pidToCluster=pidToGtClust, threshold=None, scaleDist=scaleDist)
					else: # No need to pass pidToCluster as we don't need to compute dendPurity
						flatClusters, dendPurity = runHAC_allEdges(origDistMat=distMat_NP, k=len(gtList), linkAlpha=linkageAlpha, numPoints=numPoints, pidToCluster=None, threshold=None, scaleDist=scaleDist)
						
				y_pred = flatClusters
				
				# ptToPredClusters = {point:y_pred[ctr] for ctr,point in enumerate(pointList)}
				# print("Plotting in file=",method + ".pdf")
				# plot_clusters(pointToCluster=ptToPredClusters,filename=method + ".pdf")
				# ptToGtClusters = {point:y_true[ctr] for ctr,point in enumerate(pointList)}
				# plot_clusters(pointToCluster=ptToGtClusters,filename=method + "_orig.pdf")
				
				
		else:
			if method.startswith("singleLink"):
				threshold = threshDict["singleLink@t"] if "singleLink@t" in threshDict else None
				linkTree = linkage(transformedPointList, "single",metric=linkMetric)
			elif method.startswith("avgLink"):
				threshold = threshDict["avgLink@t"] if "avgLink@t" in threshDict else None
				linkTree = linkage(transformedPointList, "average",metric=linkMetric)
			elif method.startswith("compLink"):
				threshold = threshDict["compLink@t"] if "compLink@t" in threshDict else None
				linkTree = linkage(transformedPointList, "complete",metric=linkMetric)
			else:
				linkTree = None
				print("Invalid inference method:{}".format(method))
				raise Exception("Invalid inference method:{}".format(method))
			
			if method.endswith("@t"):
				flatClusters = fcluster(Z=linkTree, t=threshold, criterion="distance")
			else:
				flatClusters = fcluster(Z=linkTree, t=len(gtList), criterion="maxclust")
				
			y_pred = flatClusters
			# ptToPredClusters = {point:y_pred[ctr] for ctr,point in enumerate(pointList)}
			# plot_clusters(pointToCluster=ptToPredClusters,filename=method + ".pdf")
			
			if "dendPurity" in metricsForEval:
				dendPurity = calc_dend_purity(linkTree=linkTree, pidList=pidList, y_true=y_true)
			
		
		mEnd = time.time()
		print("Time taken by inference method:{} = {:.3f}".format(method, mEnd - mStart))
		if "f1" in metricsForEval:
			tempResult = comp_prec_rec_f1(y_true, y_pred)
			for metric in tempResult:
				results[method + "_" + metric] 	= tempResult[metric]
		if "randIndex" in metricsForEval:
			results[method + "_randIndex"] = adjusted_rand_score(y_true, y_pred)
		if "nmi" in metricsForEval:
			results[method + "_nmi"] = adjusted_mutual_info_score(y_true, y_pred, average_method="arithmetic")
		if "dendPurity" in metricsForEval:
			results[method + "_dendPurity"] = 0 if method == "connComp" else dendPurity
		
	print("Inference Time:{:.3f} on {} points".format(time.time() - start,numPoints))
	return results
Example #35
	def _cluster_by_monocrit(linkage_table: numpy.ndarray, cutoff: float, inconsistent: pandas.DataFrame) -> numpy.ndarray:
		MR = hierarchy.maxRstat(linkage_table, inconsistent.values, 1)
		clusters = hierarchy.fcluster(linkage_table, t = cutoff, criterion = 'monocrit', monocrit = MR)
		return clusters
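
For context, a minimal standalone sketch of the 'monocrit' criterion used above (the data and the 0.8 cutoff are illustrative): maxRstat condenses the inconsistency matrix to one statistic per merge, and fcluster cuts wherever that statistic exceeds the threshold.

import numpy as np
from scipy.cluster import hierarchy

X = np.random.rand(25, 4)
Z = hierarchy.linkage(X, method='average')
R = hierarchy.inconsistent(Z)
MR = hierarchy.maxRstat(Z, R, 1)      # column 1 of R is the std dev of link heights
clusters = hierarchy.fcluster(Z, t=0.8, criterion='monocrit', monocrit=MR)
print(np.unique(clusters).size, 'clusters')
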
Example #36
    def gen_data(n_per_cat):
        cov = np.eye(2)*0.2
        X0 = np.random.multivariate_normal([-2.0, 0.0], cov, n_per_cat)
        X1 = np.random.multivariate_normal([2.0, 0.0], cov, n_per_cat)
        X2 = np.random.multivariate_normal([0.0, 1.8], cov, n_per_cat)

        data = np.vstack((X0, X1, X2))
        return data

    hypers = {
        'mu_0': np.zeros(2),
        'nu_0': 3.0,
        'lambda_0': 1.0,
        'psi_0': np.eye(2)
    }
    data_model = NormalInverseWishart(**hypers)

    # Sanity check: grab the assignment that has three components and do a
    # visual verification.
    data = gen_data(15)
    linkage_matrix = bhc(data, data_model)
    print(linkage_matrix)
    # print('len of lml',len(lmls),'len of data',len(asgn[0]))
    # print(makeLinkageMatrix(asgn,lmls))
    dn = dendrogram(linkage_matrix)
    plt.show()
    z = fcluster(linkage_matrix, 3, 'maxclust')
    plt.figure(tight_layout=True, facecolor='white')
    plt.scatter(data[:, 0], data[:, 1], c=z, cmap='Set1', s=225)
    plt.show()
Example #37
def make_figure(df, pa):
    """Generates figure.

    Args:
        df (pandas.core.frame.DataFrame): Pandas DataFrame containing the input data.
        pa (dict): A dictionary of the style { "argument":"value"} as outputted by `figure_defaults`.

    Returns:
        A Plotly figure.
        A Pandas DataFrame with columns clusters.
        A Pandas DataFrame with rows clusters.
        A Pandas DataFrame as displayed in the figure.

    """

    #fig = go.Figure( )
    #fig.update_layout( width=pa_["fig_width"], height=pa_["fig_height"] ) #  autosize=False,

    tmp = df.copy()
    tmp.index = tmp[pa["xvals"]].tolist()
    tmp = tmp[pa["yvals"]]

    if pa["add_constant"] != "":
        tmp = tmp + float(pa["add_constant"])

    if pa["log_transform_value"] == "log2":
        tmp = np.log2(tmp)
    elif pa["log_transform_value"] == "log10":
        tmp = np.log10(tmp)

    pa_ = {}

    checkboxes = [
        "row_cluster", "col_cluster", "xticklabels", "yticklabels",
        "row_dendogram_dist", "col_dendogram_dist", "reverse_color_scale"
    ]  # "robust"
    for c in checkboxes:
        if (pa[c] == "on") | (pa[c] == ".on"):
            pa_[c] = True
        else:
            pa_[c] = False

    for v in [
            "col_color_threshold", "row_color_threshold", "upper_value",
            "center_value", "lower_value"
    ]:
        if pa[v] == "":
            pa_[v] = None
        else:
            pa_[v] = float(pa[v])

    if pa_["reverse_color_scale"]:
        pa_["colorscale_value"] = pa["colorscale_value"] + "_r"
    else:
        pa_["colorscale_value"] = pa["colorscale_value"]

    selfdefined_cmap = True
    for value in [
            "lower_value", "center_value", "upper_value", "lower_color",
            "center_color", "upper_color"
    ]:
        if pa[value] == "":
            selfdefined_cmap = False
            break
    if selfdefined_cmap:
        range_diff = float(pa["upper_value"]) - float(pa["lower_value"])
        center = float(pa["center_value"]) - float(pa["lower_value"])
        center = center / range_diff

        color_continuous_scale=[ [0, pa["lower_color"]],\
            [center, pa["center_color"]],\
            [1, pa["upper_color"] ]]

        pa_["colorscale_value"] = color_continuous_scale

    if pa["zscore_value"] == "row":
        tmp = pd.DataFrame(stats.zscore(tmp, axis=1, ddof=1),
                           columns=tmp.columns.tolist(),
                           index=tmp.index.tolist())
    elif pa["zscore_value"] == "columns":
        tmp = pd.DataFrame(stats.zscore(tmp, axis=0, ddof=1),
                           columns=tmp.columns.tolist(),
                           index=tmp.index.tolist())

    if len(pa["findrow"]) > 0:
        rows_to_find = pa["findrow"]

        possible_rows = tmp.index.tolist()
        not_found = [s for s in rows_to_find if s not in possible_rows]
        if len(not_found) > 0:
            message = "˜The following rows could not be found: %s. Please check your entries for typos." % (
                ", ".join(not_found))
            flash(message, 'error')

        rows_to_plot = [] + rows_to_find

        if (pa["findrowup"] != "") | (pa["findrowdown"] != ""):

            d = scs.distance.pdist(tmp, metric=pa["distance_value"])
            d = squareform(d)
            d = pd.DataFrame(d,
                             columns=tmp.index.tolist(),
                             index=tmp.index.tolist())
            d = d[rows_to_find]

            for r in rows_to_find:
                dfrow = d[[r]]

                if pa["findrowtype_value"] == "percentile":

                    row_values = dfrow[r].tolist()

                    if pa["findrowup"] != "":
                        upperc = np.percentile(row_values,
                                               float(pa["findrowup"]))
                        upperc = dfrow[dfrow[r] >= upperc]
                        rows_to_plot = rows_to_plot + upperc.index.tolist()

                    if pa["findrowdown"] != "":
                        downperc = np.percentile(row_values,
                                                 float(pa["findrowdown"]))
                        downperc = dfrow[dfrow[r] <= downperc]
                        rows_to_plot = rows_to_plot + downperc.index.tolist()

                if pa["findrowtype_value"] == "n rows":
                    dfrow = dfrow.sort_values(by=[r], ascending=True)
                    row_values = dfrow.index.tolist()

                    if pa["findrowdown"] != "":
                        rows_to_plot = rows_to_plot + row_values[:int(
                            pa["findrowdown"])]

                    if pa["findrowup"] != "":
                        rows_to_plot = rows_to_plot + row_values[
                            -int(pa["findrowup"]):]

                if pa["findrowtype_value"] == "absolute":

                    if pa["findrowup"] != "":
                        upperc = dfrow[dfrow[r] >= float(pa["findrowup"])]
                        rows_to_plot = rows_to_plot + upperc.index.tolist()

                    if pa["findrowdown"] != "":
                        downperc = dfrow[dfrow[r] <= float(pa["findrowdown"])]
                        rows_to_plot = rows_to_plot + downperc.index.tolist()

                rows_to_plot = list(set(rows_to_plot))

        tmp = tmp[tmp.index.isin(rows_to_plot)]

    data_array = tmp.values
    data_array_ = tmp.transpose().values
    labels = tmp.columns.tolist()
    rows = tmp.index.tolist()

    # # Initialize figure by creating upper dendrogram
    if pa_["col_cluster"]:
        fig = ff.create_dendrogram(data_array_, orientation='bottom', labels=labels, color_threshold=pa_["col_color_threshold"],\
                                distfun = lambda x: scs.distance.pdist(x, metric=pa["distance_value"]),\
                                linkagefun= lambda x: sch.linkage(x, pa["method_value"]))
        for i in range(len(fig['data'])):
            fig['data'][i]['yaxis'] = 'y2'
        dendro_leaves_y_labels = fig['layout']['xaxis']['ticktext']
        #dendro_leaves_y = [ labels.index(i) for i in dendro_leaves_y_labels ]

        #for data in dendro_up['data']:
        #    fig.add_trace(data)

        if pa_["col_color_threshold"]:
            d = scs.distance.pdist(data_array_, metric=pa["distance_value"])
            Z = sch.linkage(d, pa["method_value"])  #linkagefun(d)
            max_d = pa_["col_color_threshold"]
            clusters_cols = fcluster(Z, max_d, criterion='distance')
            clusters_cols = pd.DataFrame({
                "col": tmp.columns.tolist(),
                "cluster": list(clusters_cols)
            })
        else:
            clusters_cols = pd.DataFrame({"col": tmp.columns.tolist()})

    else:
        fig = go.Figure()
        dendro_leaves_y_labels = tmp.columns.tolist()
    dendro_leaves_y = [labels.index(i) for i in dendro_leaves_y_labels]

    # Create Side Dendrogram
    if pa_["row_cluster"]:
        dendro_side = ff.create_dendrogram(data_array, orientation='right', labels=rows, color_threshold=pa_["row_color_threshold"],\
                                            distfun = lambda x: scs.distance.pdist(x, metric=pa["distance_value"]),\
                                            linkagefun= lambda x: sch.linkage(x, pa["method_value"] ))
        for i in range(len(dendro_side['data'])):
            dendro_side['data'][i]['xaxis'] = 'x2'
        dendro_leaves_x_labels = dendro_side['layout']['yaxis']['ticktext']
        #dendro_leaves_x = [ rows.index(i) for i in dendro_leaves_x_labels ]

        if pa_["row_color_threshold"]:
            d = scs.distance.pdist(data_array, metric=pa["distance_value"])
            Z = sch.linkage(d, pa["method_value"])  #linkagefun(d)
            max_d = pa_["row_color_threshold"]
            clusters_rows = fcluster(Z, max_d, criterion='distance')
            clusters_rows = pd.DataFrame({
                "col": tmp.index.tolist(),
                "cluster": list(clusters_rows)
            })
        else:
            clusters_rows = pd.DataFrame({"col": tmp.index.tolist()})

        #if pa_["col_cluster"]:
        # Add Side Dendrogram Data to Figure
        #print(dendro_side['data'][0])
        for data in dendro_side['data']:
            fig.add_trace(data)
        #else:
        #    fig=dendro_side

    else:
        dendro_leaves_x_labels = tmp.index.tolist()
    dendro_leaves_x = [rows.index(i) for i in dendro_leaves_x_labels]

    if pa["robust"] != "":
        vals = tmp.values.flatten()
        up = np.percentile(vals, 100 - float(pa["robust"]))
        down = np.percentile(vals, float(pa["robust"]))
        tmp[tmp > up] = up
        tmp[tmp < down] = down
        data_array = tmp.values

    # Create Heatmap
    heat_data = data_array
    heat_data = heat_data[dendro_leaves_x, :]
    heat_data = heat_data[:, dendro_leaves_y]

    heatmap = [
        go.Heatmap(x=dendro_leaves_x_labels,
                   y=dendro_leaves_y_labels,
                   z=heat_data,
                   zmax=pa_["upper_value"],
                   zmid=pa_["center_value"],
                   zmin=pa_["lower_value"],
                   colorscale=pa_['colorscale_value'],
                   colorbar={
                       "title": {
                           "text": pa["color_bar_label"],
                           "font": {
                               "size": float(pa["color_bar_font_size"])
                           }
                       },
                       "lenmode": "pixels",
                       "len": float(pa["fig_height"]) / 4,
                       "xpad": float(pa["color_bar_horizontal_padding"]),
                       "tickfont": {
                           "size": float(pa["color_bar_ticks_font_size"])
                       }
                   })
    ]

    if pa_["col_cluster"]:
        heatmap[0]['x'] = fig['layout']['xaxis']['tickvals']
    else:
        heatmap[0]['x'] = dendro_leaves_y_labels

    if pa_["row_cluster"]:
        heatmap[0]['y'] = dendro_side['layout']['yaxis']['tickvals']
    else:
        fake_vals = []
        i = 0
        for f in range(len(dendro_leaves_x_labels)):
            fake_vals.append(i)
            i += 1
        #dendro_leaves_x_labels=tuple(fake_vals)
        heatmap[0]['y'] = tuple(fake_vals)  #dendro_leaves_x_labels

    # Add Heatmap Data to Figure
    # if (pa_["col_cluster"]) | (pa_["row_cluster"]):
    for data in heatmap:
        fig.add_trace(data)
    # else:
    #     fig = go.Figure(data=heatmap[0])

    # Edit Layout
    fig.update_layout({
        'width': float(pa["fig_width"]),
        'height': float(pa["fig_height"]),
        'showlegend': False,
        'hovermode': 'closest',
        "yaxis": {
            "mirror": "allticks",
            'side': 'right',
            'showticklabels': pa_["xticklabels"],
            'ticktext': dendro_leaves_x_labels
        },
        "xaxis": {
            "mirror": "allticks",
            'side': 'right',
            'showticklabels': pa_["yticklabels"],
            'ticktext': dendro_leaves_y_labels
        }
    })

    # Edit xaxis
    fig.update_layout(xaxis={'domain': [ float(pa["row_dendogram_ratio"]), 1],
                                    'mirror': False,
                                    'showgrid': False,
                                    'showline': False,
                                    'zeroline': False,
                                    'showticklabels': pa_["yticklabels"],
                                    "tickfont":{"size":float(pa["yaxis_font_size"])},
                                    'ticks':"",\
                                    'ticktext':dendro_leaves_y_labels})

    # Edit xaxis2
    if pa_["row_cluster"]:
        fig.update_layout(
            xaxis2={
                'domain': [0, float(pa["row_dendogram_ratio"])],
                'mirror': False,
                'showgrid': False,
                'showline': False,
                'zeroline': False,
                'showticklabels': pa_["row_dendogram_dist"],
                'ticks': ""
            })

    # Edit yaxis
    fig.update_layout(yaxis={'domain': [0, 1-float(pa["col_dendogram_ratio"]) ],
                                    'mirror': False,
                                    'showgrid': False,
                                    'showline': False,
                                    'zeroline': False,
                                    'showticklabels': pa_["xticklabels"],
                                    "tickfont":{"size":float(pa["xaxis_font_size"])} ,
                                    'ticks': "",\
                                    'tickvals':heatmap[0]['y'],\
                                    'ticktext':dendro_leaves_x_labels})
    #'tickvals':dendro_side['layout']['yaxis']['tickvals'],\
    # Edit yaxis2 showticklabels
    if pa_["col_cluster"]:
        fig.update_layout(
            yaxis2={
                'domain': [1 - float(pa["col_dendogram_ratio"]), 1],
                'mirror': False,
                'showgrid': False,
                'showline': False,
                'zeroline': False,
                'showticklabels': pa_["col_dendogram_dist"],
                'ticks': ""
            })

    fig.update_layout(template='plotly_white')

    fig.update_layout(
        title={
            "text": pa["title"],
            "yanchor": "top",
            "font": {
                "size": float(pa["title_size_value"])
            }
        })

    cols = list(fig['layout']['xaxis']['ticktext'])
    rows = list(fig['layout']['yaxis']['ticktext'])
    df_ = pd.DataFrame({"i": range(len(rows))}, index=rows)
    df_ = df_.sort_values(by=["i"], ascending=False)
    df_ = df_.drop(["i"], axis=1)
    df_ = pd.merge(df_, tmp, how="left", left_index=True, right_index=True)
    df_ = df_[cols]

    clusters_cols_ = pd.DataFrame({"col": cols})
    if pa_["col_cluster"]:
        clusters_cols = pd.merge(clusters_cols_,
                                 clusters_cols,
                                 on=["col"],
                                 how="left")
    else:
        clusters_cols = clusters_cols_

    clusters_rows_ = pd.DataFrame({"col": df_.index.tolist()})
    if pa_["row_cluster"]:
        clusters_rows = pd.merge(clusters_rows_,
                                 clusters_rows,
                                 on=["col"],
                                 how="left")
    else:
        clusters_rows = clusters_rows_

    df_.reset_index(inplace=True, drop=False)
    cols = df_.columns.tolist()
    cols[0] = "rows"
    df_.columns = cols

    return fig, clusters_cols, clusters_rows, df_
# print dendrogram
dend = dendrogram(linkage(distanceMatrix, method='complete'),
                  color_threshold=1,
                  leaf_font_size=10,
                  labels=df.teamID.tolist())
# This gives us 7 clusters

# let's set the cutoff at 2 for 4 clusters
dend = dendrogram(linkage(distanceMatrix, method='complete'),
                  color_threshold=2,
                  leaf_font_size=10,
                  labels=df.teamID.tolist())

# get cluster assignments
assignments = fcluster(linkage(distanceMatrix, method='complete'), 2,
                       'distance')

cluster_output = pandas.DataFrame({
    'team': df.teamID.tolist(),
    'cluster': assignments
})

# change the colors of the graph
colors = cluster_output.cluster
colors[colors == 1] = 'b'
colors[colors == 2] = 'g'
colors[colors == 3] = 'r'
colors[colors == 4] = 'y'

# Plot
plt.scatter(df.total_salaries, df.total_wins, s=100, c=colors, lw=0)
Example #39
0
def hierarchical(data=None,
                 k=0,
                 linkage='average',
                 metric='euclidean',
                 metric_args=None):
    """Perform clustering using hierarchical agglomerative algorithms.

    Parameters
    ----------
    data : array
        An m by n array of m data samples in an n-dimensional space.
    k : int, optional
        Number of clusters to extract; if 0 uses the life-time criterion.
    linkage : str, optional
        Linkage criterion; one of 'average', 'centroid', 'complete', 'median',
        'single', 'ward', or 'weighted'.
    metric : str, optional
        Distance metric (see 'biosppy.metrics').
    metric_args : dict, optional
        Additional keyword arguments to pass to the distance function.

    Returns
    -------
    clusters : dict
        Dictionary with the sample indices (rows from 'data') for each found
        cluster; outliers have key -1; clusters are assigned integer keys
        starting at 0.

    Raises
    ------
    TypeError
        If 'metric' is not a string.
    ValueError
        When the 'linkage' is unknown.
    ValueError
        When 'metric' is not 'euclidean' when using 'centroid', 'median',
        or 'ward' linkage.
    ValueError
        When 'k' is larger than the number of data samples.

    """

    # check inputs
    if data is None:
        raise TypeError("Please specify input data.")

    if linkage not in [
            'average', 'centroid', 'complete', 'median', 'single', 'ward',
            'weighted'
    ]:
        raise ValueError("Unknown linkage criterion '%r'." % linkage)

    if not isinstance(metric, six.string_types):
        raise TypeError("Please specify the distance metric as a string.")

    N = len(data)
    if k > N:
        raise ValueError("Number of clusters 'k' is higher than the number" \
                          " of input samples.")

    if metric_args is None:
        metric_args = {}

    if linkage in ['centroid', 'median', 'ward']:
        if metric != 'euclidean':
            raise TypeError("Linkage '{}' requires the distance metric to be" \
                            " 'euclidean'.".format(linkage))
        Z = sch.linkage(data, method=linkage)
    else:
        # compute distances
        D = metrics.pdist(data, metric=metric, **metric_args)

        # build linkage
        Z = sch.linkage(D, method=linkage)

    if k < 0:
        k = 0

    # extract clusters
    if k == 0:
        # life-time
        labels = _life_time(Z, N)
    else:
        labels = sch.fcluster(Z, k, 'maxclust')

    # get cluster indices
    clusters = _extract_clusters(labels)

    return utils.ReturnTuple((clusters, ), ('clusters', ))
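
# A minimal, self-contained sketch of the two extraction modes wrapped by the
# function above, using plain scipy (the package's internal helpers such as
# _life_time are not reproduced here; the data below is made up).
import numpy as np
import scipy.cluster.hierarchy as sch

rng = np.random.default_rng(0)
samples = np.vstack([rng.normal(0, 1, (20, 2)), rng.normal(5, 1, (20, 2))])
Z = sch.linkage(samples, method='average', metric='euclidean')

# k > 0: ask for at most k clusters
labels_k = sch.fcluster(Z, t=2, criterion='maxclust')

# k == 0 uses a life-time criterion above; a simple stand-in is to cut at the
# largest gap between successive merge distances
gaps = np.diff(Z[:, 2])
cut = Z[np.argmax(gaps), 2] + gaps.max() / 2
labels_lt = sch.fcluster(Z, t=cut, criterion='distance')
print(np.unique(labels_k), np.unique(labels_lt))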
Example #40
0
def step5(max_d):
    global eventL, notCombineRDDL, resultEventL, resultRDDL, outputPath, specialNum

    #vectorize the text
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=my_tokenizer,
                                 preprocessor=None,
                                 stop_words=['*'],
                                 max_features=10000)

    train_data_features = vectorizer.fit_transform(eventL)
    train_data_features = train_data_features.toarray()

    #hierarchical clustering
    Z = linkage(train_data_features, 'complete', 'cityblock')
    #c, coph_dists = cophenet(Z, pdist(train_data_features))
    #print 'The goodness of cluster result:', c

    clusters = fcluster(Z, max_d, criterion='distance')

    #initialize RDD list and Event list
    resultEventLL = []
    resultRDDLL = []
    numCombinedEvents = max(clusters)
    for i in range(numCombinedEvents):
        resultRDDLL.append([])
        resultEventLL.append([])

    #Put event/RDD that belong to the same cluster into the same list
    currentEventNum = 0
    for clusterNum in clusters:
        resultRDDLL[clusterNum - 1].append(notCombineRDDL[currentEventNum])
        resultEventLL[clusterNum - 1].append(eventL[currentEventNum])
        currentEventNum += 1

    #Merge the event/RDD in the same list
    for sameEventL in resultEventLL:

        if len(sameEventL) == 1:
            resultEventL.append(sameEventL[0])
        else:
            combinedEvent = sameEventL[0].strip().split()
            count = 0
            for currentEvent in sameEventL:
                if count == 0:
                    count += 1
                    continue
                else:
                    combinedEvent = LCS(combinedEvent,
                                        currentEvent.strip().split())
                    count += 1
            resultEventL.append(' '.join(combinedEvent))

    for sameRDDL in resultRDDLL:
        if len(sameRDDL) == 1:
            resultRDDL.append(sameRDDL[0])
        else:
            resultRDDL.append(sc.union(sameRDDL))

        resultRDDL[-1].map(lambda ID_log: ID_log[0]).saveAsTextFile(
            outputPath + str(len(resultRDDL) + specialNum))
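
# A compact, self-contained sketch of the vectorize -> linkage -> fcluster ->
# group-by-label pipeline used in step5 above, with hypothetical log lines
# standing in for eventL, the default CountVectorizer tokenizer instead of
# my_tokenizer, and without the Spark/RDD merging part.
from scipy.cluster.hierarchy import fcluster, linkage
from sklearn.feature_extraction.text import CountVectorizer

events = ["open file alpha", "open file beta", "close socket one", "close socket two"]
features = CountVectorizer(analyzer="word").fit_transform(events).toarray()

Z = linkage(features, method='complete', metric='cityblock')
clusters = fcluster(Z, t=2.0, criterion='distance')

grouped = {}
for label, event in zip(clusters, events):
    grouped.setdefault(label, []).append(event)
print(grouped)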
Example #41
0
def cluster_ssh(sla, lat, lon, nclusters, distthres=3000, returnall=False):
    # Remove All NaN Points
    ntime, nlat, nlon = sla.shape
    slars = sla.reshape(ntime, nlat * nlon)
    okdata, knan, okpts = proc.find_nan(slars, 0)
    npts = okdata.shape[1]

    # ---------------------------------------------
    # Calculate Correlation and Covariance Matrices
    # ---------------------------------------------
    srho = np.corrcoef(okdata.T, okdata.T)
    scov = np.cov(okdata.T, okdata.T)
    srho = srho[:npts, :npts]
    scov = scov[:npts, :npts]

    # --------------------------
    # Calculate Distance Matrix
    # --------------------------
    lonmesh, latmesh = np.meshgrid(lon, lat)
    coords = np.vstack([lonmesh.flatten(), latmesh.flatten()]).T
    coords = coords[okpts, :]
    coords1 = coords.copy()
    coords2 = np.zeros(coords1.shape)
    coords2[:, 0] = np.radians(coords1[:, 1])  # First point is latitude
    coords2[:, 1] = np.radians(coords1[:, 0])  # Second Point is Longitude
    sdist = haversine_distances(coords2, coords2) * 6371

    # --------------------------
    # Combine the Matrices
    # --------------------------
    a_fac = np.sqrt(
        -distthres /
        (2 * np.log(0.5)))  # Calculate a_fac so the exp term is 0.5 when distance equals distthres (3000 km by default)
    expterm = np.exp(-sdist / (2 * a_fac**2))
    distance_matrix = 1 - expterm * srho

    # --------------------------
    # Do Clustering (scipy)
    # --------------------------
    cdist = squareform(distance_matrix, checks=False)
    linked = linkage(cdist, 'weighted')
    clusterout = fcluster(linked, nclusters, criterion='maxclust')

    # -------------------------
    # Calculate the uncertainty
    # -------------------------
    uncertout = np.zeros(clusterout.shape)
    for i in range(len(clusterout)):
        covpt = scov[i, :]  #
        cid = clusterout[i]  #
        covin = covpt[np.where(clusterout == cid)]
        covout = covpt[np.where(clusterout != cid)]
        uncertout[i] = np.mean(covin) / np.mean(covout)

    # Apply rules from Thompson and Merrifield (Do this later)
    # if uncert > 2, set to 2
    # if uncert <0.5, set to 0
    #uncertout[uncertout>2]   = 2
    #uncertout[uncertout<0.5] = 0

    # -----------------------
    # Replace into full array
    # -----------------------
    clustered = np.zeros(nlat * nlon) * np.nan
    clustered[okpts] = clusterout
    clustered = clustered.reshape(nlat, nlon)
    cluster_count = []
    for i in range(nclusters):
        cid = i + 1
        cnt = (clustered == cid).sum()
        cluster_count.append(cnt)
        print("Found %i points in cluster %i" % (cnt, cid))
    uncert = np.zeros(nlat * nlon) * np.nan
    uncert[okpts] = uncertout
    uncert = uncert.reshape(nlat, nlon)

    if returnall:
        return clustered, uncert, cluster_count, srho, scov, sdist, distance_matrix
    return clustered, uncert, cluster_count
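
# Quick numeric check of the exponential distance weighting used above: with
# a_fac defined as in cluster_ssh, the weight exp(-d / (2 * a_fac**2)) is 1 at
# d = 0 and 0.5 at d = distthres (illustrative values only).
import numpy as np

distthres = 3000.0
a_fac = np.sqrt(-distthres / (2 * np.log(0.5)))
for d in (0.0, distthres / 2, distthres, 2 * distthres):
    print(d, np.exp(-d / (2 * a_fac**2)))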
Example #42
0
    # Hierarchical Clustering
    Y = sch.linkage(matrix, method=m)
    
    # Cut-off
    n_ = []
    
    # Number of clusters we want to obtain
    cluster_size = 4
    # We try different thresholds
    cutoff_range = np.linspace(Y[:,2].max()/2., Y[:,2].min(), 50)
    is_csize_reached = False
    
    for t in cutoff_range:
        # We cutoff the dendrogram using threshold t, obtaining labels
        cl = sch.fcluster(Y, t, 'distance') 
        # No. of clusters
        n_cl = np.unique(cl)[-1]
        
        # If our cluster number is reached we save the labels
        if (n_cl >= cluster_size) and (is_csize_reached == False):
            is_csize_reached = True
            t_color = t
            cluster_labels.append(cl)
        
        # n_ maintains the no. of clusters for each threshold
        n_.append(n_cl)
    
    # if cluster_size is not reached we save last clustering
    if is_csize_reached == False:
        t_color = t
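
# The sweep above searches for a distance threshold that yields at least
# cluster_size clusters (and keeps t_color for dendrogram colouring); when only
# the labels are needed, fcluster's 'maxclust' criterion can request the target
# count directly. A minimal sketch on made-up data:
import numpy as np
import scipy.cluster.hierarchy as sch

rng = np.random.default_rng(1)
matrix = rng.normal(size=(30, 4))
Y = sch.linkage(matrix, method='average')
cl = sch.fcluster(Y, t=4, criterion='maxclust')  # at most 4 clusters
print(np.unique(cl))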
Example #43
0
def _hclust(linkmat, nclusters):
    threshold = _get_threshold(linkmat, nclusters)
    t = fcluster(linkmat, threshold, criterion='distance')
    return Partition(t)
Example #44
0
def visualize_heatmap(topic_model,
                      topics: List[int] = None,
                      top_n_topics: int = None,
                      n_clusters: int = None,
                      width: int = 800,
                      height: int = 800) -> go.Figure:
    """ Visualize a heatmap of the topic's similarity matrix

    Based on the cosine similarity matrix between topic embeddings,
    a heatmap is created showing the similarity between topics.

    Arguments:
        topic_model: A fitted BERTopic instance.
        topics: A selection of topics to visualize.
        top_n_topics: Only select the top n most frequent topics.
        n_clusters: Create n clusters and order the similarity
                    matrix by those clusters.
        width: The width of the figure.
        height: The height of the figure.

    Returns:
        fig: A plotly figure

    Usage:

    To visualize the similarity matrix of
    topics simply run:

    ```python
    topic_model.visualize_heatmap()
    ```

    Or if you want to save the resulting figure:

    ```python
    fig = topic_model.visualize_heatmap()
    fig.write_html("path/to/file.html")
    ```
    """

    # Select topic embeddings
    if topic_model.topic_embeddings is not None:
        embeddings = np.array(topic_model.topic_embeddings)
    else:
        embeddings = topic_model.c_tf_idf

    # Select topics based on top_n and topics args
    if topics is not None:
        topics = list(topics)
    elif top_n_topics is not None:
        topics = sorted(topic_model.get_topic_freq().Topic.to_list()[1:top_n_topics + 1])
    else:
        topics = sorted(list(topic_model.get_topics().keys()))

    # Order heatmap by similar clusters of topics
    if n_clusters:
        if n_clusters >= len(set(topics)):
            raise ValueError("Make sure to set `n_clusters` lower than "
                             "the total number of unique topics.")

        embeddings = embeddings[[topic + 1 for topic in topics]]
        distance_matrix = cosine_similarity(embeddings)
        Z = linkage(distance_matrix, 'ward')
        clusters = fcluster(Z, t=n_clusters, criterion='maxclust')

        # Extract new order of topics
        mapping = {cluster: [] for cluster in clusters}
        for topic, cluster in zip(topics, clusters):
            mapping[cluster].append(topic)
        mapping = [cluster for cluster in mapping.values()]
        sorted_topics = [topic for cluster in mapping for topic in cluster]
    else:
        sorted_topics = topics

    # Select embeddings
    indices = np.array([topics.index(topic) for topic in sorted_topics])
    embeddings = embeddings[indices]
    distance_matrix = cosine_similarity(embeddings)

    # Create nicer labels
    new_labels = [[[str(topic), None]] + topic_model.get_topic(topic) for topic in sorted_topics]
    new_labels = ["_".join([label[0] for label in labels[:4]]) for labels in new_labels]
    new_labels = [label if len(label) < 30 else label[:27] + "..." for label in new_labels]

    fig = px.imshow(distance_matrix,
                    labels=dict(color="Similarity Score"),
                    x=new_labels,
                    y=new_labels,
                    color_continuous_scale='GnBu'
                    )

    fig.update_layout(
        title={
            'text': "<b>Similarity Matrix",
            'y': .95,
            'x': 0.55,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': dict(
                size=22,
                color="Black")
        },
        width=width,
        height=height,
        hoverlabel=dict(
            bgcolor="white",
            font_size=16,
            font_family="Rockwell"
        ),
    )
    fig.update_layout(showlegend=True)
    fig.update_layout(legend_title_text='Trend')

    return fig
Example #45
0
def evaluate_distance_matrix(distanceMatrix, trueClusters, clusteringType,
                             **kwargs):

    # TODO: 1. clear blackList dependency
    #       2. clustering type is an unlucky name for betaCV and the like.

    trueClusterNum = len(np.unique(trueClusters))
    #     distanceMatrixCopy = np.copy(distanceMatrix)

    if clusteringType == 'all' or 'betaCV' in clusteringType:
        res = beta_cv(distanceMatrix,
                      trueClusters,
                      blackList=None,
                      ranks=False)
        print "Beta-CV = %f" % (res, )

    if clusteringType == 'all' or 'cIndex' in clusteringType:
        res = c_index(distanceMatrix, trueClusters, blackList=None)
        print "C-Index = %f" % (res, )

    if clusteringType == 'all' or 'silhouette' in clusteringType:
        print "Silhouette = %f" % (metrics.silhouette_score(
            distanceMatrix, trueClusters, metric='precomputed'), )

    if clusteringType == 'all' or 'hierarchical' in clusteringType:
        print "\nEvaluating **Hierarchical Clustering**"
        distArray = ssd.squareform(distanceMatrix)
        try:
            linkageFunction = kwargs['linkage']
        except:
            linkageFunction = "complete"

        print "Linkage = " + linkageFunction
        Z = hierarchy.linkage(distArray, method=linkageFunction)
        T = hierarchy.fcluster(Z, trueClusterNum, criterion="maxclust")
        if len(np.unique(T)) != trueClusterNum:
            print "!Clusters found: " + str(len(np.unique(T)))

        res = evaluate_unsup_clustering(trueClusters, T, None, verbose=True)

    if clusteringType == 'all' or 'affinity' in clusteringType:
        print "\nEvaluating **Affinity Propagation**"
        affinities = np.exp(-(distanceMatrix**2) /
                            (2 * (np.median(distanceMatrix)**2)))
        cluster_centers_indices, labels = sklearn_cluster.affinity_propagation(
            affinities, copy=False, verbose=True)
        res = evaluate_unsup_clustering(trueClusters,
                                        labels,
                                        len(cluster_centers_indices),
                                        verbose=True)

    if clusteringType == 'all' or "dbscan" in clusteringType:
        print "\nEvaluating **DBScan Clustering**"
        # TODO maybe adapt eps
        eps = np.percentile(distanceMatrix, 5)
        predictedLabels = sklearn_cluster.DBSCAN(
            eps, metric='precomputed').fit_predict(distanceMatrix)
        print "Predicted as Noise: " + str(np.sum(predictedLabels == -1))
        res = evaluate_unsup_clustering(trueClusters,
                                        predictedLabels,
                                        len(np.unique(predictedLabels)),
                                        verbose=True)

    if clusteringType == 'all' or "spectral" in clusteringType:
        print "\nEvaluating **Spectral (with Normalized Laplacian) Clustering**"
        affinities = np.exp(-(distanceMatrix**2) /
                            (2 * (np.median(distanceMatrix)**2)))
        # arpack was chosen for stability reasons.
        classifier = sklearn_cluster.SpectralClustering(
            n_clusters=trueClusterNum,
            affinity='precomputed',
            assign_labels='kmeans',
            eigen_solver='arpack')
        classifier.fit(affinities)
        res = evaluate_unsup_clustering(trueClusters,
                                        classifier.labels_,
                                        None,
                                        verbose=True)

#     assert(np.all(distanceMatrixCopy == distanceMatrix))
    return res
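
# A self-contained sketch of the hierarchical branch above: cluster a
# precomputed distance matrix and score the labels with the silhouette
# coefficient (toy data; beta_cv, c_index and the other helpers used above are
# project-specific and not reproduced here).
import numpy as np
import scipy.spatial.distance as ssd
from scipy.cluster import hierarchy
from sklearn import metrics

rng = np.random.default_rng(2)
X = np.vstack([rng.normal(0, 1, (15, 3)), rng.normal(4, 1, (15, 3))])
distanceMatrix = ssd.squareform(ssd.pdist(X, metric='euclidean'))

distArray = ssd.squareform(distanceMatrix)  # condensed form expected by linkage
Z = hierarchy.linkage(distArray, method='complete')
T = hierarchy.fcluster(Z, 2, criterion='maxclust')
print("Silhouette = %f" %
      metrics.silhouette_score(distanceMatrix, T, metric='precomputed'))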
Example #46
0
def compare_methods(corr_matrix=None,
                    beta=None,
                    Q=None,
                    p=None,
                    n=500,
                    q=0.25,
                    num_data_samples=10,
                    link_methods=['average'],
                    S_methods=None,
                    split=True,
                    sample_kwargs={
                        'coeff_size': 10,
                    },
                    feature_fns={'LCD': lasso_statistic},
                    feature_fn_kwargs={},
                    S_kwargs={
                        'objective': 'norm',
                        'norm_type': 'fro'
                    },
                    copies=1,
                    seed=110,
                    reduction=None,
                    time0=None,
                    scache_only=False,
                    num_processes=8,
                    compute_split_oracles=True,
                    noSDPcalc=False,
                    onlyoracles=False):
    """ 
    S_methods arg optionally allows you to add extra kwargs (e.g. ASDP instead of SDP)
    for each link method. Should be a list of tuples, of the form
    [(methodname, method_kwargs)], and it should be the same length as 
    link_methods.
    scache_only: If True, only compute the S_group matrices, then stop.
    noSDPcalc: If True, don't compute any SDP formulations.
    """

    # Timing
    if time0 is None:
        time0 = time.time()

    # Get p, Q, reduction
    if corr_matrix is not None:
        p = corr_matrix.shape[0]
    if Q is None and corr_matrix is not None:
        Q = knockadapt.utilities.chol2inv(corr_matrix)
    if reduction is None:
        reduction = 10

    # Sample data for the first time, create links.
    # We set the seed here for two reasons:
    # (1) This X and y are not actually used for anything
    # (2) In the case where we are generating the corr_matrix,
    # we want this to be reproducible.
    if seed is not None:
        np.random.seed(seed)
    X, y, beta2, Q2, corr_matrix2 = knockadapt.graphs.sample_data(
        n=n, p=p, corr_matrix=corr_matrix, Q=Q, beta=beta, **sample_kwargs)

    # Make sure we aren't changing the DGP
    if corr_matrix is None:
        corr_matrix = corr_matrix2
    if beta is None:
        beta = beta2
    if Q is None:
        Q = Q2
    test_DGP_consistency(beta, beta2, corr_matrix, corr_matrix2, Q, Q2)

    # Sometimes the link methods are the same because we're also comparing
    # S generation methods (e.g. ASDP vs SDP), so might have to rename them
    link_method_dict = {}
    if S_methods is not None:
        for i in range(len(link_methods)):
            methodname = S_methods[i][0]
            oldname = link_methods[i]
            new_name = methodname + "_" + oldname
            link_methods[i] = new_name
            link_method_dict[new_name] = oldname

    # Create links, groups, cutoffs
    links = {
        link_method: knockadapt.graphs.create_correlation_tree(
            corr_matrix, method=link_method_dict[link_method])
        for link_method in link_methods
    }

    # Dictionary storing cutoff lists for each link method
    all_cutoffs = {}
    for link_method in link_methods:
        link = links[link_method]
        # Max size refers to maximum group size
        cutoffs = knockadapt.adaptive.create_cutoffs(link=link,
                                                     reduction=reduction,
                                                     max_size=100)
        all_cutoffs[link_method] = cutoffs

    # Dictionary of dictionaries (link by cutoff) which stores group sizes
    all_Ms = {}
    # Dictionary of dictionaries (link by cutoff) which stores groupings
    all_groups = {}
    for link_method in link_methods:
        # Graph cutoffs, links
        cutoffs = all_cutoffs[link_method]
        link = links[link_method]

        # Create groups for each cutoff
        link_groups = {}
        Ms = {}
        for cutoff in cutoffs:
            groups = hierarchy.fcluster(link, cutoff, criterion="distance")
            link_groups[cutoff] = groups
            Ms[cutoff] = np.unique(groups).shape[0]

        # Add smaller dictionaries to parent dictionaries
        all_groups[link_method] = link_groups
        all_Ms[link_method] = Ms

    # Create S matrices: dictionary of dictionaries (link by cutoff)
    # This is a bit hacky, but we can associate a different S_method
    # with each link method if we want.
    if S_methods is None:
        S_methods = [{} for _ in link_methods]

    S_matrixes = {link_method: {} for link_method in link_methods}

    # Assemble the list of parameters to pass to the multiprocessing module
    all_arguments = []
    for link_method, S_method in zip(link_methods, S_methods):

        # Retrieve groups/cutoffs for this link method
        link_method_groups = all_groups[link_method]
        cutoffs = all_cutoffs[link_method].copy()

        # Progress report
        sys.stdout.write(
            f'Generating/retrieving S matrices for {link_method} now, time is {time.time() - time0}\n'
        )

        # Add S matrixes
        for cutoff in cutoffs:
            groups = link_method_groups[cutoff]

            # Possibly load from text file
            S_group = load_S_matrix(p, seed, cutoff, link_method,
                                    sample_kwargs)

            if S_group is not None:
                sys.stdout.write(
                    f'S for {link_method} {np.around(cutoff, 3)} is preloaded, time is {time.time() - time0}\n'
                )
                S_matrixes[link_method][cutoff] = S_group
            else:

                # If noSDP, don't bother to compute,
                # just get rid of the particular cutoff
                if noSDPcalc:

                    # Only remove SDP operations (expensive)
                    if S_method[0] == 'SDP':
                        remove_flag = True
                    else:
                        remove_flag = False

                    # Then remove
                    if remove_flag:
                        # Delete cutoff from cutoffs, start by
                        # making report
                        time1 = time.time() - time0
                        sys.stdout.write(
                            f'Cutoff {np.around(cutoff, 3)} for {link_method} is being removed since noSDPcalc = True, time is {time1}\n'
                        )

                        # Now actually delete
                        which_to_delete = np.where(
                            all_cutoffs[link_method] == cutoff)
                        all_cutoffs[link_method] = np.delete(
                            arr=all_cutoffs[link_method],
                            obj=which_to_delete,
                            axis=0)

                        # And don't add arguments
                        continue

                # If it hasn't been removed, add this to
                # list of arguments to pass to pool
                all_arguments.append(
                    (S_group, link_method, cutoff, time0, X, corr_matrix, Q,
                     groups, S_kwargs, S_method, p, seed, sample_kwargs))

    # Pass to multiprocessor
    if num_processes == 1:
        all_S_outputs = []
        for arguments in all_arguments:
            all_S_outputs.append(compute_S_matrix(*arguments))
    else:
        with Pool(num_processes) as thepool:
            all_S_outputs = thepool.starmap(compute_S_matrix, all_arguments)

    for (S_group, link_method, cutoff) in all_S_outputs:
        S_matrixes[link_method][cutoff] = S_group

    if scache_only:
        sys.stdout.write(
            f'Terminating early because scache_only is true, time is {time.time() - time0} \n'
        )
        return None

    # Construct oracle (curse of dimensionality applies here)
    feature_methods = [fname for fname in feature_fns]
    for fname in feature_methods:
        if fname not in feature_fn_kwargs:
            feature_fn_kwargs[fname] = {}
    oracle_results = pd.DataFrame(columns=ORACLE_COLUMNS)

    # Helper function which will be used for multiprocessing ----------------------
    partial_eval_oracles = partial(eval_oracles,
                                   n=n,
                                   p=p,
                                   q=q,
                                   X=X,
                                   y=y,
                                   corr_matrix=corr_matrix,
                                   Q=Q,
                                   beta=beta,
                                   sample_kwargs=sample_kwargs,
                                   link_methods=link_methods,
                                   feature_fns=feature_fns,
                                   feature_fn_kwargs=feature_fn_kwargs,
                                   all_cutoffs=all_cutoffs,
                                   all_groups=all_groups,
                                   S_matrixes=S_matrixes,
                                   time0=time0,
                                   copies=copies,
                                   compute_split_oracles=compute_split_oracles)

    # End helper function ---------------------------
    sys.stdout.write("Picking the best oracles!\n")

    # Don't use the pool object if n-processes is 1
    if num_processes == 1:
        all_outputs_to_add = []
        for j in range(num_data_samples):
            all_outputs_to_add.append(partial_eval_oracles(j))
    else:
        with Pool(num_processes) as thepool:
            all_outputs_to_add = thepool.map(partial_eval_oracles,
                                             list(range(num_data_samples)))

    # Put it all together
    for process_output in all_outputs_to_add:
        for to_add in process_output:
            oracle_results = oracle_results.append(to_add)

    # Pick best cutoffs based on mean power for each oracle
    all_oracle_cutoffs = {}
    for oracle_type in oracle_results['oracle_type'].unique():

        # Create subset, calculate means
        subset_results = oracle_results.loc[oracle_results['oracle_type'] ==
                                            oracle_type]
        mean_powers = subset_results.groupby(
            ['feature_fn', 'link_method', 'cutoff'])['power'].mean()

        # Take max and save
        oracle_cutoffs = mean_powers.unstack().idxmax(1).unstack()
        all_oracle_cutoffs[oracle_type] = oracle_cutoffs

    sys.stdout.write(
        f'Finished creating oracles: comparing methods, time is {time.time() - time0}\n'
    )

    if onlyoracles:
        sys.stdout.write(
            f'Returning early because onlyoracles is true (not doing more computation)\n'
        )
        return None, oracle_results, S_matrixes

    # Initialize output to actually compare methods
    output_df = pd.DataFrame(columns=FINAL_COLUMNS)

    # Create helper function for multiprocessing
    partial_one_sample_comparison = partial(
        one_sample_comparison,
        n=n,
        p=p,
        q=q,
        X=X,
        y=y,
        corr_matrix=corr_matrix,
        Q=Q,
        beta=beta,
        sample_kwargs=sample_kwargs,
        links=links,
        all_oracle_cutoffs=all_oracle_cutoffs,
        link_methods=link_methods,
        feature_fns=feature_fns,
        feature_fn_kwargs=feature_fn_kwargs,
        all_cutoffs=all_cutoffs,
        all_groups=all_groups,
        S_matrixes=S_matrixes,
        time0=time0,
        copies=copies,
        reduction=reduction)

    # Don't use pool object if num_processes == 1
    if num_processes == 1:
        comparisons_to_add = []
        for j in range(num_data_samples):
            comparisons_to_add.append(partial_one_sample_comparison(j))
    else:
        with Pool(num_processes) as thepool:
            comparisons_to_add = thepool.map(partial_one_sample_comparison,
                                             list(range(num_data_samples)))

    sys.stdout.write(
        f'Finished: now just combining outputs, time is {time.time() - time0}\n'
    )

    # Combine outputs
    for list_to_add in comparisons_to_add:
        for to_add in list_to_add:
            output_df = output_df.append(to_add, ignore_index=True)

    return output_df, oracle_results, S_matrixes


# Code Sample
import scipy.cluster.hierarchy as sch
import numpy as np
import pylab as pl

# Plot dendrogram and cut the tree to find resulting clusters
fig = pl.figure()
data = np.array([[1, 2, 3], [1, 1, 1], [5, 5, 5]])
datalabel = ['first', 'second', 'third']
hClsMat = sch.linkage(data, method='complete')  # Complete clustering
sch.dendrogram(hClsMat, labels=datalabel, leaf_rotation=45)
fig.savefig("thing.pdf")
resultingClusters = sch.fcluster(hClsMat, t=3, criterion='distance')
print(resultingClusters)

# Your code starts from here ....

# 1.
# Scaling min max
# STUDENT CODE TODO

# 2.
# K-means http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
# STUDENT CODE TODO

# 3.
# Compute Rand Index
# STUDENT CODE TODO
'''We are going to continue the investigation into the sightings of legendary Pokémon from the previous exercise. Remember that in the scatter plot of the previous exercise, you identified two areas where Pokémon sightings were dense. This means that the points seem to separate into two clusters. In this exercise, you will form two clusters of the sightings using hierarchical clustering.

'x' and 'y' are columns of X and Y coordinates of the locations of sightings, stored in a Pandas data frame, df. The following are available for use: matplotlib.pyplot as plt, seaborn as sns, and pandas as pd.'''

import pandas as pd

x = [9, 6, 2, 3, 1, 7, 1, 6, 1, 7, 23, 26, 25, 23, 21, 23, 23, 20, 30, 23]
y = [8, 4, 10, 6, 0, 4, 10, 10, 6, 1, 29, 25, 30, 29, 29, 30, 25, 27, 26, 30]


df = pd.DataFrame({'x':x,'y':y})

# Import linkage and fcluster functions
from scipy.cluster.hierarchy import linkage, fcluster

# Use the linkage() function to compute distances
Z = linkage(df, 'ward')

# Generate cluster labels
df['cluster_labels'] = fcluster(Z, 2, criterion='maxclust')

# Plot the points with seaborn
import matplotlib.pyplot as plt
import seaborn as sns
sns.scatterplot(x='x', y='y', hue='cluster_labels', data=df)
plt.show()
Example #49
0
plt.rcParams['font.size'] = 14  # set the font size

dendrogram(result, labels=namelist)
plt.ylabel("distance")
#plt.show()
#plt.savefig("/home/kei/document/experiments/Master/UJ_result/elder.png")
plt.cla()
NUM_CLUSTERS_RANGE = range(2, 24)
silhouette_coefficient = []
davies_bouldin_index = []
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Coefficient')
plt.rcParams["ytick.direction"] = "in"
plt.rcParams["xtick.direction"] = "in"
for num in NUM_CLUSTERS_RANGE:
    labels = fcluster(result, t=num, criterion='maxclust')
    silhouette_coefficient.append(
        silhouette_score(Distance, labels, metric='precomputed'))
    davies_bouldin_index.append(davies_bouldin_score(Distance, labels))
p0, = plt.plot(NUM_CLUSTERS_RANGE,
               silhouette_coefficient,
               'bo-',
               label='Silhouette Coefficient')
#p2, = par2.plot(NUM_CLUSTERS_RANGE, davies_bouldin_index, 'gs-', label='Davies Bouldin Index')

#par2.set_ylabel('Davies Bouldin Index')
lines = [p0]
"""
plt.legend(lines,
            [l.get_label() for l in lines],
            fontsize=10,
Example #50
0
def fastlinkage(dmat):
    return sch.fcluster(fc.linkage(squareform(dmat),method='single'),0.01)
Example #51
0
# In[66]:


den.keys()#dict_keys(['icoord', 'dcoord', 'ivl', 'leaves', 'color_list'])
len(den['ivl'])
#den['ivl']
#den['leaves']


# In[67]:


from scipy.cluster.hierarchy import fcluster

assignments = fcluster(linked,max_d,'distance')

print(assignments)

assignments_series = pd.Series(assignments)

assignments_series.value_counts()


# ## Hierarchical Clustering - dendrogram - Cluster

# In[68]:


leaves_dataframe = pd.DataFrame({"leaves":den['leaves']})
assignments_dataframe = pd.DataFrame({"assignments":assignments})
Example #52
0
def scipyLinkage(dmat):
    return sch.fcluster(sch.single(dmat),0.01)
Example #53
0
    samples = order.sort_values('x')['Sample ID'].tolist()

    ax1.yaxis.set_visible(False)
    ax1.xaxis.set_visible(False)
    ax1.tick_params(left=False, bottom=False)
    ax1.spines['left'].set_visible(False)
    ax1.spines['bottom'].set_visible(False)

    # =============================================================================
    # Plot clusters
    # =============================================================================

    cmap = plt.get_cmap('tab10')

    k = len(list(set(dn_data.get('color_list'))))
    T = fcluster(ln, k, 'maxclust')

    # calculate labels
    for index, row in order.iterrows():
        order.at[index, 'cluster'] = T[row['i']]
        order.at[index,
                 'cluster_color'] = matplotlib.colors.to_hex(cmap(T[row['i']]))

    order = order.merge(tc[['Sample ID', 'Final tNGS_TC', 'tc_color']],
                        on='Sample ID')

    ax2.bar(order['x'], 0.67, bottom=0.33, color=order['cluster_color'])

    ax2.set_xlim(-0.5, len(samples) - 0.5)
    ax2.set_yticks([0.66])
    ax2.set_ylim(0, 1)
Example #54
0
def fcLinkage(dmat):
    return sch.fcluster(fc.linkage(dmat,method='single'),0.01)
Example #55
0
def hierarchical_clustering(filters, threshold=1.0):
    Dist = distance_matrix(filters)
    Z = hc.linkage(Dist, method='complete')
    clusters = hc.fcluster(Z, t=threshold, criterion='distance')
    return clusters, Dist
Example #56
0
        consommation.iloc[:, i], consommation.iloc[:, j], k)
DM_GCC = pd.DataFrame(DM_GCC,
                      index=consommation.columns,
                      columns=consommation.columns)

# sns.clustermap(consommation, col_linkage=hcl.linkage(squareform(DM_GCC)))
plt.figure()
hcl.dendrogram(hcl.linkage(squareform(DM_GCC), method="average"))

plt.figure()
plt.plot(
    np.arange(.1, 1.1, .1),
    np.array([
        np.unique(
            hcl.fcluster(hcl.linkage(squareform(DM_GCC), method="average"),
                         t=t,
                         criterion="distance")).shape[0]
        for t in np.arange(0.1, 1.1, 0.1)
    ]))

hcl.fcluster(hcl.linkage(squareform(DM_GCC), method="average"),
             t=0.4,
             criterion="distance")
n_clusters = 5
clusters = hcl.fcluster(hcl.linkage(squareform(DM_GCC), method="average"),
                        t=n_clusters,
                        criterion="maxclust")

from sklearn.decomposition import PCA

pca = PCA(n_components=4)
Example #57
0
	def _cluster_by_distance(linkage_table: numpy.ndarray, cutoff: float) -> numpy.ndarray:
		""" Try to infer a good distance cutoff by detecting the first changepoint in the sorted array of distances."""
		clusters = hierarchy.fcluster(linkage_table, t = cutoff, criterion = 'distance')
		return clusters
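
# The docstring above mentions inferring a distance cutoff from a changepoint
# in the sorted merge distances; a minimal sketch of one such heuristic (cut at
# the largest gap between consecutive linkage heights), on made-up points:
import numpy as np
from scipy.cluster import hierarchy

def largest_gap_cutoff(linkage_table: np.ndarray) -> float:
    heights = np.sort(linkage_table[:, 2])
    gaps = np.diff(heights)
    i = int(np.argmax(gaps))
    return float(heights[i] + gaps[i] / 2)

points = np.vstack([np.zeros((10, 2)), np.full((10, 2), 5.0)])
linkage_table = hierarchy.linkage(points, method='average')
cutoff = largest_gap_cutoff(linkage_table)
clusters = hierarchy.fcluster(linkage_table, t=cutoff, criterion='distance')
print(cutoff, np.unique(clusters))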
Example #58
0
def plot_arr_dendrogram(abs_corr_array,
                        names,
                        max_dist_cluster,
                        measures=None):
    """
    Compute  dendrogram and create a plot plotting dendrogram and abs_corr_array
    Parameters:
    ----------
    abs_corr_array : ndarray
        array containing the correlation matrix
    names : list 
        list of strings containing the names of the operations in abs_corr_array in the
        corresponding order.
    max_dist_cluster : float
        Maximum distance in the clusters
    measures : ndarray (n_measures x abs_corr_array.shape[0])
        Array containing measures to be plotted on top of the matrix. Positions correspond to the
        positions of operations in abs_corr_array.
    Returns:
    --------
    index : list
        list of indices used to reorder the correlation matrix
    """

    figsize = (18, 12)
    #figsize=(46.81,33.11)
    rect_measures = [0.25, 0.8075, 0.5, 0.15]
    rect_dendro = [0.755, 0.05, 0.15, 0.75]
    rect_matrix = [0.25, 0.05, 0.5, 0.75]
    rect_color = [0.92, 0.05, 0.02, 0.75]

    # Compute and plot dendrogram.
    fig = plt.figure(figsize=figsize)
    axdendro = fig.add_axes(rect_dendro)
    corr_linkage = idtop.calc_linkage(abs_corr_array)[0]

    corr_dendrogram = hierarchy.dendrogram(corr_linkage,
                                           orientation='left',
                                           color_threshold=max_dist_cluster)
    #axdendro.set_xticks([])
    axdendro.set_yticks([])
    axdendro.axvline(max_dist_cluster, ls='--', c='k')
    axdendro.set_xlabel('correlation distance')
    # Plot distance matrix.
    axmatrix = fig.add_axes(rect_matrix)
    index = corr_dendrogram['leaves']
    abs_corr_array = abs_corr_array[index, :]
    abs_corr_array = abs_corr_array[:, index]

    # -- plot the correlation matrix
    vmin = round(np.min(abs_corr_array), 1)
    vmax = 1
    numSteps = (vmax - vmin) * 20  # steps of 0.05 in correlation
    im = axmatrix.matshow(abs_corr_array,
                          aspect='auto',
                          origin='lower',
                          vmin=vmin,
                          vmax=vmax,
                          cmap=mpl.pyplot.cm.get_cmap('jet', numSteps))

    axmatrix.set_xticks([])
    axmatrix.set_yticks(range(len(index)))
    #axmatrix.set_yticklabels(np.array(names)[index],fontsize=5)
    axmatrix.set_yticklabels(np.array(names)[index])

    # Plot colorbar.
    axcolor = fig.add_axes(rect_color)
    cbar = plt.colorbar(im, cax=axcolor)
    cbar.set_label('Pearson correlation')

    # Plot the quality measures
    axmeasure = fig.add_axes(rect_measures)
    axmeasure.xaxis.set_ticklabels([])
    axmeasure.scatter(
        np.arange(0, measures.shape[-1]) + 0.5, measures[0, index])
    axmeasure.set_xlim([0, measures.shape[-1]])
    axmeasure.set_ylabel('problems calculated')
    axmeasure.yaxis.label.set_color('b')
    [label.set_color('b') for label in axmeasure.get_yticklabels()]
    axmeasure2 = axmeasure.twinx()
    axmeasure2.plot(np.arange(0, measures.shape[-1]) + 0.5,
                    measures[1, index],
                    color='r')
    axmeasure2.set_xlim([0, measures.shape[-1]])

    [label.set_color('r') for label in axmeasure2.get_yticklabels()]
    axmeasure2.set_ylabel('z-scored avg classification error')
    axmeasure2.yaxis.label.set_color('r')

    # -----------------------------------------------------------------
    # -- calculate and plot clusters ----------------------------------
    # -----------------------------------------------------------------
    #cluster_ind = hierarchy.fcluster(link_arr, t=cluster_t, criterion=cluster_criterion)
    cluster_ind = hierarchy.fcluster(corr_linkage,
                                     t=max_dist_cluster,
                                     criterion='distance')

    # -- plot delimiters for measures
    cluster_bounds = np.hstack((-1, np.nonzero(np.diff(
        cluster_ind[index]))[0], abs_corr_array.shape[0] - 1)) + 1
    for bound in cluster_bounds:
        axmeasure.axvline(bound, linestyle='--', color='k')

    # -- calculate the locations for the cluster squares
    patch_bounds = cluster_bounds - .5
    patch_sizes = np.diff(patch_bounds)
    cluster_square_params = tuple(
        ((patch_bounds[i], patch_bounds[i]), patch_sizes[i], patch_sizes[i])
        for i in range(len(patch_sizes)))
    for cluster_square_param in cluster_square_params:
        axmatrix.add_patch(
            mpl.patches.Rectangle(cluster_square_param[0],
                                  cluster_square_param[1],
                                  cluster_square_param[2],
                                  fill=0,
                                  ec='w',
                                  lw=2))

    # -----------------------------------------------------------------
    # -- calculate and plot best features -----------------------------
    # -----------------------------------------------------------------
    best_features_marker = []
    for (i, j) in zip(cluster_bounds[:-1], cluster_bounds[1:]):
        measures_dendr = measures[1, index]
        best_features_marker.append(i + np.argmin(measures_dendr[i:j]))

    axmatrix.scatter(best_features_marker, best_features_marker, color='w')
    axmatrix.set_xlim([-0.5, abs_corr_array.shape[0] - 0.5])
    axmatrix.set_ylim([-0.5, abs_corr_array.shape[0] - 0.5])

    [(text.set_color('k'), text.set_weight('bold'))
     for i, text in enumerate(axmatrix.get_yticklabels())
     if i in best_features_marker]

    return index
Example #59
0
def heatmap(adata,
            pathway_genes,
            num_clust,
            name,
            norm=False,
            leg_axes=(1.3, 1.3),
            leg_cols=1):
    '''We group the leiden clusters based on similarity of expression of 
    specific genes in a pathway. 
    
    Arguments :
    
    adata : the AnnData gene expression matrix
    pathway_genes : a list of the genes in the pathway
    num_clust : the optimal number of clusters based on silhouette score on
    cosine distance
    name : the name with which we want to label the clusters of this pathway
    leg_axes : we can change the coordinates of the legend
    
    Returns: 
    
    AnnData object labeled with the pathway clusters.
    Return at Index 0: Clustermap Figure you can later save
    Return at Index 1: A dataframe of all the gene expression values
    '''
    if norm:
        df = gene_expression_norm(adata, pathway_genes)
    else:
        df = gene_expression(adata, pathway_genes)
    d = sch.distance.pdist(df.transpose(), metric='cosine')
    L = sch.linkage(d)
    linkage = sch.fcluster(L, num_clust, 'maxclust')
    str_linkage = []
    for i in linkage:
        str_linkage.append(str(i))
    new_dict = dict(
        zip([str(i) for i in range(0, len(str_linkage))], str_linkage))
    adata.obs[name] = adata.obs['leiden'].replace(new_dict)
    if norm:
        df = gene_expression_norm(adata, pathway_genes)
    else:
        df = gene_expression(adata, pathway_genes)
    cols = {}
    for j in list(df.columns):
        cols[j] = str(adata[adata.obs['leiden'] == j].obs[name][0])
    cols = pd.Series(data=cols, name='Clusters')
    labels = adata.obs[name].unique()
    labels = list(map(str, labels))
    cmap = plt.get_cmap('Paired')
    colors = cmap(np.linspace(0, 1, len(labels)))
    lut1 = dict(zip(labels, colors))
    cols_to_return = []
    keys_for_colors = list(lut1.keys())
    keys_for_colors.sort()
    for k in keys_for_colors:
        cols_to_return.append(lut1[k])
    adata.uns[name + '_colors'] = cols_to_return
    row_colors1 = cols.map(lut1)
    g = sns.clustermap(df,
                       metric='cosine',
                       row_cluster=False,
                       cmap='viridis',
                       col_linkage=L,
                       col_colors=row_colors1,
                       figsize=(6, 6))
    ax = g.ax_heatmap
    legend_elements = []
    keys = list(lut1.keys())
    keys.sort()
    for j in keys:
        legend_elements.append(
            Line2D([0], [0], marker='s', label=j, color=lut1[j]))
    #ax.legend(handles = legend_elements, title = 'Clusters', fontsize='small',
    #loc='upper right', bbox_to_anchor=(1.4,1.3), ncol=leg_cols)

    g.fig.suptitle((name + ' with ' + str(num_clust) + ' clusters'),
                   y=1.0,
                   x=0.5,
                   fontsize='large')
    ax.set_xlabel('Leiden clustering', x=0.5)
    return g.fig, df
Example #60
0
	def _cluster_by_inconsistent(linkage_table: numpy.ndarray, cutoff: float, inconsistent: pandas.DataFrame) -> numpy.ndarray:
		clusters = hierarchy.fcluster(linkage_table, t = cutoff, criterion = 'inconsistent', R = inconsistent.values)
		return clusters