Beispiel #1
0
def elbow_clustering_analysis():
    """Plot the elbow curve (share of variance explained vs. K) for k-means.

    Loads the feature matrix through ``readData()``, runs ``kmeans`` for
    K = 1..19 and plots, for each K, the between-cluster sum of squares as a
    percentage of the total sum of squares.  One point of the curve is
    highlighted with a red circle.  The figure is written to
    ``admissions_elbow_klustering_analysis.eps`` and then shown.
    """
    institution_info, X = readData()
    X = np.array(X)
    KK = list(range(1, 20))
    KM = [kmeans(X, k) for k in KK]
    centroids = [cent for (cent, var) in KM]
    # Distance of every sample to every centroid, one matrix per K.
    D_k = [cdist(X, cent, 'euclidean') for cent in centroids]
    # Distance of every sample to its nearest centroid.
    dist = [np.min(D, axis=1) for D in D_k]
    # Total within-cluster sum of squares, one entry per K.
    tot_withinss = np.array([np.sum(d**2) for d in dist])
    # The total sum of squares.
    totss = np.sum(pdist(X)**2) / X.shape[0]
    # The between-cluster sum of squares.
    betweenss = totss - tot_withinss
    # Index into KK of the highlighted point: KK[3] is K = 4.
    kIdx = 3
    # elbow curve
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(KK, betweenss/totss*100, 'b*-')
    ax.plot(KK[kIdx], betweenss[kIdx]/totss*100, marker='o', markersize=12,
            markeredgewidth=2, markeredgecolor='r', markerfacecolor='None')
    ax.set_ylim((0, 100))
    plt.grid(True)
    plt.xlabel('Number of clusters')
    plt.ylabel('Percentage of variance explained (%)')
    plt.title('Elbow for KMeans clustering')
    plt.savefig('admissions_elbow_klustering_analysis.eps')
    plt.show()
def setup_figure():
    """Prepare figure 1 with the artists that the animation later updates.

    Creates one circle per cell (``N`` of them), and, depending on the
    ``plot_springs`` / ``plot_voronoi`` flags, one empty line per entry of
    ``pairs`` / row of ``pairs2``.  An arrow patch is added last.

    Returns the tuple ``(fig, cells, springs, borders, ang_mom)``.
    """
    fig = plt.figure(1)
    plt.clf()
    ax = fig.add_subplot(1, 1, 1)
    view_limits = [-rho-1, rho+1]
    ax.set_xlim(view_limits)
    ax.set_ylim(view_limits)
    ax.set_aspect('equal')

    # One circle per cell, all starting at the origin with the same colour.
    cells = [
        ax.add_artist(plt.Circle((-0, 0), 0.5, color=cm.copper(0)))
        for _ in range(N)
    ]

    # Empty line artists; their data is filled in elsewhere.
    springs = []
    if plot_springs:
        for _ in range(len(pairs)):
            springs.extend(ax.plot([], [], color=cm.spectral(0)))

    borders = []
    if plot_voronoi:
        for _ in range(pairs2.shape[0]):
            borders.extend(ax.plot([], [], color='k'))

    arrow = FancyArrowPatch((0, 0), (1, 1), ec='r', fc='r', zorder=0,
                            arrowstyle=u'simple,head_width=20, head_length=10')
    ang_mom = ax.add_patch(arrow)

    return (fig, cells, springs, borders, ang_mom)
Beispiel #3
0
def bar_graph(data, bar_names, x_label='', y_label='', title='', axis=None, colors=None, legend_place='lower right'):
    """Create a horizontal bar chart from lists of data values.

    Plots a bar chart given a dictionary of *data* with a type as key, and a sequence of
    values corresponding to elements in *bar_names* as value.

    *colors*, when given, is cycled through to colour each series; otherwise
    colours are sampled from ``cm.spectral``.  *axis* is accepted for
    backward compatibility but is not used.

    Place legend with *legend_place* as string argument matching
    /(lower|middle|upper) (right|center|left)/; pass ``None`` to omit the legend.
    """
    from matplotlib import cm
    fig = plt.figure()
    # Create the axes first and label it directly, so the labels always end
    # up on the axes that holds the bars.
    ax = fig.add_subplot(111)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.set_title(title)

    # Materialise the dict views: they are not indexable on Python 3.
    series = list(data.values())
    series_names = list(data.keys())

    num_groups = len(series[0])
    group_size = len(series)
    yvals = np.arange(num_groups)
    width = 0.8 / group_size

    ps = []
    for i, vals in enumerate(series):
        if colors is None:
            color = cm.spectral(1.*i/group_size)  # colormaps: gist_rainbow, jet, hsv, spectral, ..
        else:
            color = colors[i % len(colors)]
        p = ax.barh(yvals + (width*i), vals, width, color=color)
        # One handle per series is enough for the legend.
        ps.append(p[0])

    plt.yticks(yvals + width, bar_names)
    if legend_place is not None:
        plt.legend(ps, series_names, loc=legend_place)

    plt.show()
Beispiel #4
0
    def get_colors(self, qty):
        """Map *qty* to RGBA colours with the colormap selected by ``COLORMAP``.

        *qty* is divided by its maximum and raised to ``1.0 / CONTRAST``
        before being passed to the colormap, which is called with
        ``alpha=ALPHA``.

        Raises ``ValueError`` when ``COLORMAP`` is not one of 0..9.
        """
        qty = np.power(qty / qty.max(), 1.0 / CONTRAST)

        # COLORMAP index -> attribute name on ``cm``.  The attribute is looked
        # up lazily so only the selected colormap has to exist.
        colormap_names = {
            0: 'gray',
            1: 'afmhot',
            2: 'hot',
            3: 'gist_heat',
            4: 'copper',
            5: 'gnuplot2',
            6: 'gnuplot',
            7: 'gist_stern',
            8: 'gist_earth',
            9: 'spectral',
        }
        name = colormap_names.get(COLORMAP)
        if name is None:
            # Previously this fell through and failed with UnboundLocalError.
            raise ValueError("unsupported COLORMAP value: %r" % (COLORMAP,))

        return getattr(cm, name)(qty, alpha=ALPHA)
Beispiel #5
0
	def plot_board(self, custom_text=''):
		"""Plot the points in ``self.X`` and save the figure as a PNG.

		When both ``self.mu`` and ``self.clusters`` are set, each cluster is
		drawn in its own colour with its centre marked by a star; otherwise
		all points are drawn in one colour.  The axes are scaled to the
		extent of ``self.X`` and the image is saved as
		``kpp<custom_text>_N<N>_K<K>.png``.
		"""
		X = self.X
		plt.figure(figsize=(5,5))
		plt.xlim(-1,1)
		plt.ylim(-1,1)
		# Transpose once; list() keeps the result indexable on Python 3.
		columns = list(zip(*X))
		xs, ys = columns[0], columns[1]
		if self.mu and self.clusters:
			mu = self.mu
			for m, clu in self.clusters.items():
				cs = cm.spectral(1.*m/self.K)
				plt.plot(mu[m][0], mu[m][1], '*', \
						 markersize=12, color=cs)
				if len(clu) == 0:
					# Nothing to draw for an empty cluster.
					continue
				members = list(zip(*clu))
				plt.plot(members[0], members[1], '.', \
						 markersize=8, color=cs, alpha=0.5)
		else:
			plt.plot(xs, ys, '.', alpha=0.5)
		if self.method == '++':
			tit = 'K-means++'
		else:
			tit = 'K-means with random initialization'
		# Scale the plot image to the data extent
		plt.xlim([min(xs), max(xs)])
		plt.ylim([min(ys), max(ys)])

		pars = 'N=%s, K=%s' % (str(self.N), str(self.K))
		plt.title('\n'.join([pars, tit]), fontsize=16)
		plt.savefig('kpp%s_N%s_K%s.png' % (custom_text, str(self.N), str(self.K)), \
					bbox_inches='tight', dpi=200)
	def __call__(self, event):
		"""Handle a mouse event: show the data point nearest to the click.

		Ignores events outside any axes, and events in an axes other than
		``self.axis`` when that is set.  Otherwise finds the entry of
		``self.data`` closest to the click (per ``self.distance``), updates
		the module-level text artists with that entry's values and with the
		mean values of its cluster, and redraws ``figsrc``.

		Each entry of ``self.data`` is indexed as
		``(x, y, cluster, dist, dur, pace, cal, fuel)``.
		"""
		if not event.inaxes:
			return
		if not (self.axis is None or self.axis == event.inaxes):
			return
		if len(self.data) == 0:
			# Nothing to select.
			return

		clickX = event.xdata
		clickY = event.ydata
		closest_i = 0
		# Infinity instead of a fixed large number, so far-away points can
		# still be selected.
		closest_dist = float('inf')
		for i, item in enumerate(self.data):
			potential = self.distance(clickX, item[0], clickY, item[1])
			if potential < closest_dist:
				closest_dist = potential
				closest_i = i

		picked = self.data[closest_i]
		cluster_num = picked[2]
		a.set_bbox(dict(facecolor=cm.spectral(float(cluster_num) / n_clusters, 1), alpha=.5))
		dist_text.set_text("DIST (km) = %.3f" % picked[3])
		dur_text.set_text("DUR (min) = %.3f" % picked[4])
		pace_text.set_text("PACE (min/mi) = %.3f" % picked[5])
		cal_text.set_text("CAL = %.3f" % picked[6])
		fuel_text.set_text("FUEL = %.3f" % picked[7])

		# Per-cluster means; the picked entry is always a member, so the
		# count is at least one.
		members = [item for item in self.data if item[2] == cluster_num]
		count = float(len(members))
		clust_di = sum(item[3] for item in members) / count
		clust_du = sum(item[4] for item in members) / count
		clust_pa = sum(item[5] for item in members) / count
		clust_cal = sum(item[6] for item in members) / count
		clust_fu = sum(item[7] for item in members) / count

		clust_dist_text.set_text("DIST (km) = %.3f" % clust_di)
		clust_dur_text.set_text("DUR (min) = %.3f" % clust_du)
		clust_pace_text.set_text("PACE (min/mi) = %.3f" % clust_pa)
		clust_cal_text.set_text("CAL = %.3f" % clust_cal)
		clust_fuel_text.set_text("FUEL = %.3f" % clust_fu)

		figsrc.canvas.draw()
Beispiel #7
0
def plot_silhouette(sample_silhouette_values, cluster_labels):
    """
    Generate silhouette plot to elucidate number of clusters in data

    Source: http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html

    Arguments
    =========
    sample_silhouette_values - silhouette value for every observation
    cluster_labels - sequential numeric cluster numbers (they need not
        start at 0)

    Returns
    =========
    None - the plot is drawn on a newly created current figure

    """
    # Accept plain sequences as well as arrays; the boolean masks below need
    # array semantics.
    sample_silhouette_values = np.asarray(sample_silhouette_values)
    cluster_labels = np.asarray(cluster_labels)

    # Initialise variables
    first_label = cluster_labels.min()
    n_clusters = int(cluster_labels.max() - first_label + 1)  # assume cluster numbers are sequential
    xMin = sample_silhouette_values.min()
    xMax = 1
    # Single axes on a new figure
    plt.figure()
    ax1 = plt.gca()
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")
    ax1.set_title('Silhouette Plot (k=%d)' % n_clusters)
    # The silhouette coefficient can range from -1, 1
    ax1.set_xlim([xMin, xMax])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(cluster_labels) + (n_clusters + 1) * 10])
    y_lower = 10
    for i in range(n_clusters):
        # Offset by the smallest label so labels that do not start at 0
        # still select the right samples.
        label = first_label + i

        # Aggregate the silhouette scores for samples belonging to
        # this cluster, and sort them
        ith_cluster_silhouette_values = np.sort(
            sample_silhouette_values[cluster_labels == label])

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(label))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples
    silhouette_avg = sample_silhouette_values.mean()
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")  # average line
Beispiel #8
0
 def silhouette_analysis(self):
     """Draw a silhouette plot and a cluster scatter for KMeans with k = 2..9.

     Runs ``self.pc_analysis()`` first when ``self.pca_reduced`` is empty or
     unset.  For every cluster count one two-panel figure is created: the
     per-sample silhouette values on the left and the first two columns of
     ``self.pca_reduced`` coloured by cluster on the right.  The average
     silhouette score is printed for each cluster count.
     """
     try:
         needs_pca = not self.pca_reduced
     except ValueError:
         # numpy refuses to give a truth value for a multi-element array;
         # that means the reduced data is already there.
         needs_pca = False
     if needs_pca:
         self.pc_analysis()
     range_n_clusters = range(2, 10)
     for n_clusters in range_n_clusters:
         fig, (ax1, ax2) = plt.subplots(1, 2)
         fig.set_size_inches(18, 7)
         ax1.set_xlim([-0.1, 1])
         # The (n_clusters + 1) * 10 leaves blank space between the
         # silhouettes of the individual clusters.
         ax1.set_ylim([0, len(self.pca_reduced) + (n_clusters + 1) * 10])
         clusterer = KMeans(n_clusters=n_clusters, random_state=10)
         cluster_labels = clusterer.fit_predict(self.pca_reduced)
         silhouette_avg = silhouette_score(self.pca_reduced, cluster_labels)
         print("For n_clusters =", n_clusters, "the average silhouette_score is :", silhouette_avg)
         sample_silhouette_values = silhouette_samples(self.pca_reduced, cluster_labels)
         y_lower = 10
         for i in range(n_clusters):
             # Sorted silhouette values of the samples in cluster i.
             ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
             ith_cluster_silhouette_values.sort()
             size_cluster_i = ith_cluster_silhouette_values.shape[0]
             y_upper = y_lower + size_cluster_i
             color = cm.spectral(float(i) / n_clusters)
             ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values,
                               facecolor=color, edgecolor=color, alpha=0.7)
             # Cluster number at the middle of its silhouette.
             ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
             y_lower = y_upper + 10
         ax1.set_title("The silhouette plot for the various clusters.")
         ax1.set_xlabel("The silhouette coefficient values")
         ax1.set_ylabel("Cluster label")
         # Vertical line at the average silhouette score.
         ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
         ax1.set_yticks([])
         ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
         colors = cm.spectral(cluster_labels.astype(float) / n_clusters)
         ax2.scatter(self.pca_reduced[:, 0], self.pca_reduced[:, 1], marker='.', s=30, lw=0, alpha=0.7, c=colors)
         centers = clusterer.cluster_centers_
         # White discs at the cluster centres, then the cluster number on top.
         ax2.scatter(centers[:, 0], centers[:, 1], marker='o', c="white", alpha=1, s=200)
         for i, c in enumerate(centers):
             ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50)
         ax2.set_title("The visualization of the clustered data.")
         ax2.set_xlabel("Feature space for the 1st feature")
         ax2.set_ylabel("Feature space for the 2nd feature")
         plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                       "with n_clusters = %d" % n_clusters),
                      fontsize=14, fontweight='bold')
Beispiel #9
0
 def __init__(self):
     """Create the window, the shader program and its inputs, and an empty history."""
     self.window = GlutWindow(double=True, multisample=True)
     # Route the window's display and mouse callbacks to this object's methods.
     self.window.display_callback = self.display
     self.window.mouse_callback = self.mouse
     self.shader = ShaderProgram(vertex=vertex_shader, fragment=fragment_shader)
     # 1D texture built from 256 samples of cm.spectral over [0, 1].
     self.shader.colormap = Texture1D(cm.spectral(linspace(0, 1, 256)), wrap_s="MIRRORED_REPEAT")
     # NOTE(review): presumably the lower/upper corners of the viewed region — confirm against the shaders.
     self.shader.minval = (-2.5, -1.75)
     self.shader.maxval = (1.0, 1.75)
     self.vao = get_fullscreen_quad()
     self.history = []
Beispiel #10
0
def intraday_exec_curve(data=None,step_sec=60*30,group_var='strategy_name_mapped'):
    """
    intraday_exec_curve :
    Plot the intraday execution curve as stacked turnover bars.

    The rows of *data* are bucketed on a time grid of *step_sec* seconds and
    by *group_var*; the turnover of each bucket (rate_to_euro * price *
    volume, in millions) is drawn as a filled rectangle, stacked on top of
    the previous rectangles that share the same time bucket.

    Raises NameError when *data* is None.
    """
    # ---- input handling
    if data is None:
        raise NameError('plot:intraday_exec_curve - data is missing')

    # ---- aggregate data
    time_grid = st_data.gridTime(date=data.index,step_sec=step_sec,out_mode='ceil')
    records = []
    for key, chunk in data.groupby([time_grid, group_var]):
        records.append({'date': key[0], group_var: key[1],
                        'mturnover_euro': np.sum(chunk.rate_to_euro*chunk.price*chunk.volume)*1e-6})
    grouped_data = pd.DataFrame(records).set_index('date')
    # The timestamps are converted to strings because sorting does not work
    # otherwise.
    grouped_data['tmpindex'] = [datetime.strftime(stamp.to_datetime(),'%Y%m%d-%H:%M:%S.%f')
                                for stamp in grouped_data.index]
    grouped_data = grouped_data.sort_index(by=['tmpindex',group_var])
    grouped_data = grouped_data.drop(['tmpindex'],axis=1)

    # ---- plot
    uni_strat = np.sort(np.unique(grouped_data[group_var].values).tolist())
    colors_strat = cm.spectral(np.linspace(0, 1.0, len(uni_strat)))

    plt.figure()
    plt.hold(True)
    bar_width = timedelta(seconds=step_sec)
    turnover = grouped_data['mturnover_euro']
    strategy = grouped_data[group_var]
    prev_date = ''
    prev_date_cum = 0
    for i in range(grouped_data.shape[0]):
        date = grouped_data.index[i].to_datetime()
        amount = turnover.ix[i]
        idx_uni_strat = np.nonzero(uni_strat==strategy.ix[i])[0][0]
        # A new time bucket starts from zero; otherwise stack on what has
        # already been drawn for this bucket.
        if date == prev_date:
            bottom = prev_date_cum
        else:
            bottom = 0
        top = bottom + amount
        left = date - bar_width
        plt.gca().fill([left, date, date, left],
                       [bottom, bottom, top, top],
                       facecolor=colors_strat[idx_uni_strat], alpha=0.5)
        prev_date_cum = top
        prev_date = date

    plt.hold(False)
    plt.legend(uni_strat)
    plt.show()
Beispiel #11
0
    def plot_intraday_exec_curve(self, duration = "", step_sec=60*30, group_var='strategy_name_mapped'):
        """
        intraday_exec_curve :
        Plot the intraday execution curve as stacked turnover bars.

        Calls ``self.get_agg_deals(step_sec=step_sec)`` and then draws one
        filled rectangle per row of ``self.data_agg_deals``, coloured by the
        row's *group_var* value and stacked on the previous rectangles of the
        same timestamp.  *duration* is appended to the title.

        Returns the created figure.
        """
        self.get_agg_deals(step_sec=step_sec)

        deals = self.data_agg_deals
        uni_strat = np.sort(np.unique(deals[group_var].values).tolist())
        colors_strat = cm.spectral(np.linspace(0, 1.0, len(uni_strat)))
        # Group indices that already carry a legend label.
        labelled = set()

        h = plt.figure(figsize = DEFAULT_FIGSIZE)
        axes = plt.gca()
        axes.grid(True)

        plt.hold(True)
        bar_width = timedelta(seconds=step_sec)
        prev_date = ''
        prev_date_cum = 0
        for i in range(deals.shape[0]):
            date = deals.index[i].to_datetime()
            idx_uni_strat = np.nonzero(uni_strat==deals[group_var].ix[i])[0][0]
            amount = deals['mturnover_euro'].ix[i]

            # A new timestamp starts from zero; otherwise stack on top.
            if date == prev_date:
                bottom = prev_date_cum
            else:
                bottom = 0
            top = bottom + amount
            left = date - bar_width
            prev_date_cum = top

            style = {'facecolor': colors_strat[idx_uni_strat], 'alpha': 0.85}
            if idx_uni_strat not in labelled:
                # Label only the first rectangle of each group so the legend
                # gets one entry per group.
                style['label'] = uni_strat[idx_uni_strat]
                labelled.add(idx_uni_strat)

            plt.gca().fill([left, date, date, left], [bottom, bottom, top, top], **style)
            prev_date = date

        plt.hold(False)
        plt.ylabel('Turnover (,000,000) euros')
        plt.title('Intraday traded curve: ' + duration, size = 'large')
        plt.legend()

        return h
def timedomain(ycsb, toplt):
    """Plot series 1..8 of ``ycsb[toplt]`` as stacked lines in a 3D figure.

    *toplt* selects both the data set and the label used in the output file
    name: 0 = "Update", 1 = "Read", 2 = "Verification".  The z-axis limit is
    the largest value found in series 1..8 of all three data sets.  The
    figure is saved as a PNG under the name returned by ``getfilename``.
    """
    arrays_k, arrays_v = splitbyrecordcount(ycsb[toplt])
    _, values_2 = splitbyrecordcount(ycsb[2])
    _, values_1 = splitbyrecordcount(ycsb[1])
    _, values_0 = splitbyrecordcount(ycsb[0])

    # Common height so the plots of the three data sets share one scale.
    maxheight = max(
        max(max(series) for series in values[1:9])
        for values in (values_2, values_1, values_0)
    )

    K = list(arrays_k)
    V = list(arrays_v)

    checktype = ("Update", "Read", "Verification")[toplt]

    fig = plt.figure()
    ax = fig.add_subplot('111', projection='3d')

    for z in np.arange(1, 9):
        xs = K[z]
        ys = V[z]
        line_colour = colmap.spectral(z/9., 1)
        ax.plot(xs, z * np.ones(xs.shape), zs=ys, zdir='z',
                color=line_colour, zorder=-z)

    # Plot formatting
    plt.rc('font', family='serif', weight='normal', size=12)

    ax.set_zlim3d(0, maxheight)
    ax.set_xlabel('Time (ms)')
    ax.set_ylabel('Test Run')
    ax.set_zlabel('Runtime')
    ax.tick_params(axis='both', labelsize=8)
    plt.savefig(getfilename("timeseries", checktype),
                format='png', dpi=300, bbox_inches='tight',
                transparent=True)
def animate(k):
    """Draw animation frame *k*.

    Three frames share one step ``i = int(k/3)``.  Frame 1 sets the initial
    view and draws the first segment of every trajectory in ``ys`` plus four
    reference markers.  While ``0 < i < N`` the segment ending at step ``i``
    is added and the view is rotated; once ``i >= N`` the axes are hidden and
    only the camera keeps rotating.
    """
    def draw_segments(start, stop):
        # NOTE(review): j/len(ys) is integer division under Python 2 without
        # "from __future__ import division" — confirm the intended colouring.
        for j, y in enumerate(ys):
            plot2(y[start:stop], fig, cm.spectral(j/len(ys)))

    i = int(k/3)
    if k == 1:
        ax.view_init(20, 215)
        draw_segments(0, 2)
        markers = (
            ((0.16, 0.16, 0.16), "g"),
            ((0.82, 0.17, 0.17), "b"),
            ((0.17, 0.82, 0.17), "r"),
            ((0.17, 0.17, 0.82), "k"),
        )
        for (px, py, pz), colour in markers:
            ax.scatter(px, py, pz, c=colour, alpha=0.4, s=500)
        set_title("Decision Space")
    if 0 < i < N:
        ax.view_init(20, 215+ANGLE1*k/N/3)
        draw_segments(i-1, i+1)
        set_title("Decision Space")
    elif i >= N:
        ax.set_axis_off()
        j = k - 3*N
        print("rotate" + str(j))
        ax.view_init(20, (215+ANGLE1+ANGLE2*3*j/int(ANGLE2))%360)
Beispiel #14
0
def Silhouette(D,labels,k):
    """
    Draw a silhouette plot for a clustering given as a distance matrix.

    Taken from SKlearn's plot kmeans example

    D = precomputed distance matrix
    labels = cluster label of every sample (values 0..k-1 are plotted)
    k = number of clusters

    The figure is shown in interactive mode; nothing is returned.
    """
    # Boolean masks below need array semantics, so accept plain sequences too.
    labels = np.asarray(labels)

    plt.ion()
    fig, ax1 = plt.subplots()
    fig.set_size_inches(18, 7)
    ax1.set_xlim([-0.1, 1])
    # The (k + 1) * 10 leaves blank space between the silhouettes of the
    # individual clusters.
    ax1.set_ylim([0, len(D) + (k + 1) * 10])

    sample_silhouette_values = metrics.silhouette_samples(D , labels, metric='precomputed')

    y_lower = 10

    for i in range(k):
        # Sorted silhouette values of the samples in cluster i.
        ith_cluster_silhouette_values = \
                sample_silhouette_values[labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.spectral(float(i) / k)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                                        0, ith_cluster_silhouette_values,
                                        facecolor=color, edgecolor=color, alpha=0.7)

        # Cluster number at the middle of its silhouette.
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        y_lower = y_upper + 10

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    silhouette_avg = metrics.silhouette_score(D , labels, metric='precomputed')

    # Vertical line at the average silhouette score.
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # Build one string: passing a tuple made the title render as a tuple repr.
    plt.suptitle("Silhouette analysis with n_clusters = %s and average = %s" % (k, silhouette_avg),
                 fontsize=14, fontweight='bold')

    plt.show()
Beispiel #15
0
def plot_partial_factors(ds,sts,x=0,y=1,cmap=None,axes='off',
                        nude=False):
    """Plot the partial factor scores of each table around their centres.

    ``ds.samples`` is reshaped into one slice per table (the number of
    tables is the number of unique values in ``ds.chunks``).  For every row
    a line is drawn from the mean over the tables to each table's own
    position, using components *x* and *y*.

    *cmap*, when given, is indexed by table number to obtain line colours;
    otherwise colours are sampled from ``cm.spectral``.  *axes* is passed to
    ``plt.axis``.  Unless *nude* is true, the centres are annotated with
    ``ds.targets`` and the axes with values from ``sts.eigv`` and
    ``sts.inertia``.

    Note: sets ``mpl.rcParams['text.usetex']`` to False when *nude* is false.
    """
    mx = np.max(np.abs(ds.samples))
    xmx = mx*1.1
    hw = .05*xmx
    w = .01*xmx
    # Grey arrows for the two axes.
    plt.arrow(-xmx,0,2*xmx,0,color = 'gray',alpha=.7,width=w,
                head_width=hw,length_includes_head=True)
    plt.arrow(0,-mx,0,2*mx,color = 'gray',alpha=.7, width=w,
                head_width=hw,length_includes_head=True)
    ntables = len(np.unique(ds.chunks))
    if cmap is None:
        cmap = cm.spectral(np.linspace(.2,.85,ntables))
    m,ncol = ds.shape
    # Floor division: the row count must stay an int for reshape() and
    # range() on Python 3 as well.
    nrows = m//ntables
    data = ds.samples.T.reshape((ncol,nrows,ntables),order='F')

    # Mean over the tables, restricted to the two plotted components.
    centers = np.mean(data,2).T[:,[x,y]]
    plt.scatter(centers[:,0],centers[:,1])

    for t in range(ntables):
        tab = data[:,:,t].T[:,[x,y]]
        for r in range(nrows):
            a,b = centers[r,:]
            j,k = tab[r,:]
            plt.plot([a,j],[b,k],c=cmap[t],lw=2,alpha=.5)
    plt.axis('equal')
    plt.axis((-mx,mx,-mx,mx))
    plt.axis(axes)
    if not nude:
        for t in range(nrows):
            plt.annotate(ds.targets[t],xy = (centers[t,0], centers[t,1]))

        # Raw strings keep the backslashes of the math markup without
        # relying on unrecognised escape sequences.
        plt.text(-xmx,.05*mx,r'$\lambda = %s$'%np.round(sts.eigv[x],2))
        plt.text(mx*.05,mx*.9,r'$\lambda = %s$'%np.round(sts.eigv[y],2))
        tau = r'$\tau = $'
        perc = r'$\%$'
        mpl.rcParams['text.usetex'] = False
        plt.text(-xmx,-.1*mx, '%s $%s$%s' %
                (tau,np.round(100*sts.inertia[x],0),perc))
        plt.text(xmx*.05,mx*.8, '%s $%s$%s' %
                (tau,np.round(100*sts.inertia[y],0),perc))
        plt.text(-.15*xmx,.8*mx,'$%s$'%(y+1), fontsize=20)
        plt.text(xmx*.85,-mx*.2,'$%s$'%(x+1),fontsize=20)

    plt.axis('scaled')
    plt.axis([-xmx,xmx,-mx,mx])
def clst_hier(Data_matrix, Linkage_Method, Pdist, n_clusters):
    """Hierarchically cluster *Data_matrix* and cut the tree into *n_clusters*.

    Builds the linkage with method *Linkage_Method* and metric *Pdist*,
    saves the dendrogram to ``Case_loc + 'fig_dendrogram.png'``, then tries
    each merge height as a threshold for ``fcluster`` (criterion
    ``FCluster_Criterion``) until one yields exactly *n_clusters* clusters.
    A message is printed when no threshold does; the labels of the last
    threshold tried are then returned.

    Returns ``[cluster_labels, color_matrix, SD]``: 0-based labels, one RGBA
    row per sample, and ``SD`` which is always 0 for this method.
    """
    # Pass the raw observations: linkage() computes the pairwise distances
    # itself.  A square distance matrix would be treated as a set of
    # observation vectors, not as distances.
    Hier_clustering = hr.linkage(Data_matrix, method=Linkage_Method, metric=Pdist)

    # draw dendrogram on the current figure and save it
    hr.dendrogram(Hier_clustering)
    fig_dendro = plt.gcf()
    fig_dendro.savefig(Case_loc+'fig_dendrogram.png')

    # Column 2 of the linkage matrix holds the merge heights; try each one
    # as the cut threshold.
    tmp_n_clusters = 0
    for ith_t in Hier_clustering[:,2]:
        cluster_labels = hr.fcluster(Hier_clustering, ith_t, criterion=FCluster_Criterion)
        cluster_labels = cluster_labels - 1 # start from 0
        tmp_n_clusters = cluster_labels.max()+1 # cluster index = {0,...,N-1} --> N clusters
        if tmp_n_clusters == n_clusters:
            break

    if tmp_n_clusters != n_clusters:
        print('unable to find %d clusters in clst_hier'%n_clusters)

    # One RGBA row per sample, coloured by its cluster.
    color_matrix = np.zeros((len(cluster_labels), 4))
    for i in range(n_clusters):
        ith_cluster_color = cm.spectral(float(i) / n_clusters)
        color_matrix[cluster_labels==i] = ith_cluster_color

    SD = 0 # no inertia value is available for the hierarchical method

    return [cluster_labels, color_matrix, SD]
def animate(f):
    """Update all artists for animation frame *f*.

    Moves and recolours the cell circles from ``r_vs_t[f]``, repositions the
    angular-momentum arrow from ``p_angular[f]``, and — depending on the
    ``plot_springs`` / ``plot_voronoi`` flags — refreshes the spring and
    border lines.  Frame 20 is additionally saved to ``test.png``.

    Returns ``(cells, springs, borders, ang_mom)``.
    """
    global pairs, pairs2
    # load data (F and n are looked up but not used below)
    F = F_vs_t[f]
    r = r_vs_t[f]
    n = n_vs_t[f]
    momentum = p_angular[f]
    p = (rho+0.9)*momentum/np.sqrt(np.sum(momentum**2))

    ang_mom.set_positions((0, 0), (p[x_plane], p[y_plane]))

    if update_nn:
        pairs = simforces.get_all_pairs(getDelaunayTrianglesOnSphere(r)+1)

    for i in range(N):
        cell = cells[i]
        depth = r[z_plane, i]
        shade = int((depth+1)/2*256)
        cell.center = (r[x_plane, i], r[y_plane, i])
        cell.set_facecolor(cm.copper(shade))
        # Draw order follows the depth coordinate.
        cell.set_zorder(depth)

    if plot_springs:
        for i in range(len(pairs)):
            # The pair indices are 1-based.
            i1 = pairs[i, 0] - 1
            i2 = pairs[i, 1] - 1
            spring = springs[i]
            if not ((r[z_plane, i1] > 0) and (r[z_plane, i2] > 0)):
                # Hide the spring unless both ends have positive depth.
                spring.set_data([], [])
                continue
            dist = np.sqrt(np.sum((r[:, i1] - r[:, i2])**2))
            c = int((dist-1)*128)
            spring.set_data([r[x_plane, i1], r[x_plane, i2]],
                            [r[y_plane, i1], r[y_plane, i2]])
            spring.set_color(cm.spectral(c))

    if plot_voronoi:
        _, baricenters, _, pairs2, _ = getVoronoiOnSphere(r)
        b = rho*baricenters
        for i in range(len(pairs2)):
            i1 = pairs2[i, 0]
            i2 = pairs2[i, 1]
            border = borders[i]
            if (b[z_plane, i1] > 0) and (b[z_plane, i2] > 0):
                border.set_data([b[x_plane, i1], b[x_plane, i2]],
                                [b[y_plane, i1], b[y_plane, i2]])
            else:
                border.set_data([], [])

    if f == 20:
        fig.savefig('test.png')

    return (cells, springs, borders, ang_mom)
def display_climate_radial(geography, year, tempdata, raindata):
    """Show one year of temperature and rainfall data on a polar plot.

    *tempdata* yields one ``(tmin, tmax, tmean)`` triple per day; each is
    drawn as a radial bar from tmin to tmax, coloured by tmean.  *raindata*
    is indexed by ``'rainydays'``, ``'tcenters'`` and ``'rainfalls'`` and is
    drawn as circles whose size scales with the rainfall.  *geography* and
    *year* are used for the figure name and the text labels.
    """
    # Create figure and polar axis
    fig = plt.figure('%s_radial' % geography, facecolor='white', figsize=(8,8))
    ax = fig.add_subplot(111, polar = True, frameon=False)

    mintemp, maxtemp = -30, 40
    caption = dict(color='#555555', horizontalalignment='center')
    ax.text(0, mintemp, geography.upper(), size=30, **caption)
    ax.text(0, maxtemp+1, str(year), size=10, **caption)

    # Min/max temperatures as radial bars, one per day
    for day, (tmin, tmax, tmean) in enumerate(tempdata):
        if np.abs(tmax-tmin) < 1:
            # Widen very short bars so they stay visible.
            tmin, tmax = tmin-0.5, tmax+0.5
        theta = 2*np.pi*day/365.0
        ax.plot([theta, theta], [tmin, tmax],
                color=cm.spectral((tmean+5)/45.0), linewidth=1.5, alpha=0.6)

    # Rainfall as scatter markers
    rain_angles = [2*np.pi*r/365. for r in raindata['rainydays']]
    rain_radii = raindata['tcenters']
    rain_sizes = [100*r for r in raindata['rainfalls']]
    ax.scatter(rain_angles, rain_radii, s=rain_sizes,
               alpha=0.5, facecolor='#99aacc', linewidth=0)

    # Ranges and orientation of the polar plot
    ax.set_rmax(maxtemp)
    ax.set_rmin(mintemp)
    ax.set_theta_direction(-1)
    ax.set_theta_zero_location("N")

    # Polar axes, gridding, labels
    ax.tick_params(axis='both', colors='#bbbbbb')

    months = ['JAN','FEB','MAR','APR','MAY','JUN','JUL','AUG','SEP','OCT','NOV','DEC']
    ax.set_xticks([m*2*np.pi/12 for m in range(12)])
    ax.set_xticklabels(months, fontsize=10)
    ax.get_xaxis().grid(False)

    # radii only positive here, but overridden by the tick calls below
    plt.rgrids((0.01, 10, 20, 30, 40), labels=('0 C', '', '20 C', '', '40 C' ), angle=180)
    radial_axis_style = dict(linestyle='-', color='#bbbbbb')
    ax.get_yaxis().grid(which='minor', alpha=0.3, **radial_axis_style)
    ax.get_yaxis().grid(which='major', alpha=0.4, linewidth=1.4, **radial_axis_style)
    ax.set_yticks([10, 30], minor=True)
    ax.set_yticks([0, 20, 40])
    ax.set_yticklabels(['0 C', '20 C', '40 C' ], fontsize=10)

    plt.show()
	def Cluster(self, event=None):
		"""Re-cluster the selected features and redraw the cluster view.

		Keeps, for every row of ``self.master``, only the entries whose flag
		in ``self.bool_vec`` is true, clusters the result with
		MiniBatchKMeans, projects it to two PCA components and plots each
		cluster in its own colour together with its centre.  The plotted
		coordinates and cluster numbers are stored in the module-level
		lists ``x``, ``y`` and ``c``, and a new AnnoteFinder is connected to
		the figure's button-press event.
		"""
		global x,y,c,dist,dur,pace,calories,fuel,figsrc
		x = []
		y = []
		c = []
		print(self.bool_vec)

		delaxes(self.axsrc)

		self.axsrc = figsrc.add_subplot(211, autoscale_on=True)
		self.axsrc.set_title('Right Click to Zoom')

		# Keep only the entries whose flag is set.
		X = [[elem for elem, keep in zip(row, self.bool_vec) if keep]
			 for row in self.master]
		print(X[0])
		km = MiniBatchKMeans(k=n_clusters, init='random', n_init=10,
					 random_state=random_state).fit(X)
		pca = decomposition.PCA(n_components=2)
		pca.fit(X)
		X = pca.transform(X)

		for k in range(n_clusters):
			my_members = km.labels_ == k
			members = X[my_members]
			color = cm.spectral(float(k) / n_clusters, 1)
			x.extend(X[my_members, 0])
			y.extend(X[my_members, 1])
			# One cluster number per plotted point.
			c.extend([k] * len(members))
			plot(X[my_members, 0], X[my_members, 1], 'o', marker='.', c=color)
			cluster_center = find_center(X[my_members])
			print("Center: ")
			plot(cluster_center[0], cluster_center[1], 'o',
				markerfacecolor=color, markeredgecolor='k', markersize=7)
			title("Cluster View")

		# Remember the full-view limits of the axes.
		lower_x, upper_x = self.axsrc.get_xlim()
		lower_y, upper_y = self.axsrc.get_ylim()
		self.master_lx_lim = lower_x
		self.master_ly_lim = lower_y
		self.master_ux_lim = upper_x
		self.master_uy_lim = upper_y
		self.af = AnnoteFinder(x,y,c,dist,dur,pace,calories,fuel, self.axsrc)
		figsrc.canvas.mpl_connect('button_press_event', self.af)
Beispiel #20
0
    def get_contrib_colour(self, contrib_key):
        """Return the colour assigned to *contrib_key*.

        The key-to-colour table is built on first use, and rebuilt whenever
        the requested key is missing from it: colours are sampled from
        ``cm.spectral`` and assigned to the keys returned by
        ``self.ctl.get_contrib_keys()`` in order.

        :param contrib_key: key of the contribution to look up
        :return: the colour stored for *contrib_key*
        :raise KeyError: if the key is still unknown after rebuilding the
            table (the table is logged first)
        """
        try:
            return self._contrib_colour_dict[contrib_key]
        except (AttributeError, KeyError):
            # Table not built yet, or key not in it: (re)build below.
            pass

        self.contrib_colourmap = [
            cm.spectral(i) for i in np.linspace(0, 0.9, self.ctl.get_max_node_contribs())]
        self._contrib_colour_dict = {}
        # A separate loop name keeps the requested contrib_key intact for
        # the lookup that follows.
        for i, key in enumerate(self.ctl.get_contrib_keys()):
            self._contrib_colour_dict[key] = self.contrib_colourmap[i]

        try:
            return self._contrib_colour_dict[contrib_key]
        except KeyError:
            logging.error("CDict:{0!s}".format(self._contrib_colour_dict))
            raise
def drawLimits(useObjects,filt="Ks",useGrayscale=False):
	"""Plot one semilog-x curve per object and return the colours used.

	For every name in *useObjects* found in ``limitDict[filt]``, the
	entries whose second character is not "-" are evaluated and plotted
	against the matching entries of ``apertures``.  Names that are not
	found are reported with a printed message and skipped.

	Returns a dict mapping each plotted object name to its colour.
	"""
	colorDict = {}
	total = float(len(useObjects))
	for ii, obj in enumerate(useObjects):
		if obj not in limitDict[filt].keys():
			print(obj + " not found")
			continue

		palette = cm.gray if useGrayscale else cm.spectral
		colormap = palette(ii/total, 1)

		entries = limitDict[filt][obj]
		dists = []
		deltaMag = []
		for xx, aperture in enumerate(apertures):
			if entries[xx][1] != "-":
				# NOTE(review): eval() runs arbitrary code — only safe if
				# apertures and limitDict come from trusted data.
				dists.append(eval(aperture))
				deltaMag.append(eval(entries[xx]))

		pylab.semilogx(dists, deltaMag, linewidth=2, linestyle='solid',
			color=colormap, label=obj+" "+instrUsed)
		colorDict[obj] = colormap
	return colorDict
Beispiel #22
0
def plot_embedding(x, y, selected=None, group=None, dpi=80, **extra):
    ''' Draw a 2-D embedding as a point cloud
    
    :Parameters:
    
    x : array
        X coordinates
    y : array
        Y coordinates
    selected : array, optional
               Index of points to overlay with black '+' markers
    group : array, optional
            Per-point label; each unique label is drawn in its own
            colour taken from `cm.spectral`
    dpi : int
          Figure resolution
    extra : dict
            Additional keyword arguments forwarded to every `ax.plot` call
    
    :Returns:
    
    fig : Figure
          Matplotlib figure
    ax : Axes
          Matplotlib axes
    '''
    
    fig = pylab.figure(dpi=dpi)
    ax = fig.add_subplot(111)
    if group is None:
        # No grouping: every point is drawn in red
        ax.plot(x, y, 'ro', ls='.', markersize=3, **extra)
    else:
        labels = numpy.unique(group)
        step = 1.0/len(labels)
        # Colormap position, advanced by one step per label
        position = 0.0
        for label in labels:
            mask = label == group
            label_color = cm.spectral(position)#@UndefinedVariable
            ax.plot(x[mask], y[mask], 'o', ls='.', markersize=3, c=label_color, **extra)
            position += step
    if selected is not None:
        ax.plot(x[selected], y[selected], 'k+', ls='.', markersize=2, **extra)
    return fig, ax
def clst_kmeans(Data_matrix, n_clusters):
    """Cluster ``Data_matrix`` with KMeans and colour every sample by cluster.

    Uses the module-level ``isWin`` flag to choose the keyword under which
    the cluster count is handed to ``KMeans`` and ``rand_seed`` as its
    random state.  The elapsed fitting time is printed.

    Returns a list ``[cluster_labels, color_matrix, SD]`` where
    ``color_matrix`` holds one ``cm.spectral`` RGBA row per sample and
    ``SD`` is the fitted model's ``inertia_``.
    """
    print('kmeans starts')
    started = time()
    # The cluster-count keyword is ``n_clusters`` when isWin == 1 and ``k``
    # otherwise (marked "for linux" in the original).
    count_keyword = 'n_clusters' if isWin == 1 else 'k'
    model = KMeans(random_state=rand_seed, **{count_keyword: n_clusters})
    model.fit(Data_matrix)
    finished = time()
    print('\tkmeans finishes with %.2g sec' % (finished - started))

    labels = model.labels_
    # SD: sum of distortion, i.e. the model's inertia_
    distortion = model.inertia_

    # One RGBA row per sample, filled cluster by cluster.
    rgba_rows = np.zeros((len(labels), 4))
    for cluster_id in range(n_clusters):
        rgba_rows[labels == cluster_id] = cm.spectral(float(cluster_id) / n_clusters)

    return [labels, rgba_rows, distortion]
Beispiel #24
0
        std_rough_times = np.std(rough_times, axis=0)

        ratios[n,:] = avg_rough_times/smooth_timescales

        all_Vrands[n,:] = Vrand + Vsmooth
        all_avg_rough_times[n,:] = avg_rough_times
        all_std_rough_times[n,:] = std_rough_times

    if not os.path.exists("double_plots"):
        os.mkdir("double_plots")
    os.chdir("double_plots")

    fig_temp = plt.figure(2)
    for n in range(ndEs):
        c_val = float(n)/ndEs
        plt.plot(ratios[n,:], 'o', ms=10, c=cm.spectral(c_val))
    plt.semilogy()
    plt.xlabel("index")
    plt.ylabel("Ratio of timescales $\\frac{t_i}{t_i^0}$")
    fig_temp.savefig("ti_norm_vs_index.png",bbox_inches="tight")
    fig_temp.savefig("ti_norm_vs_index.pdf",bbox_inches="tight")

    fig1 = plt.figure()
    for n in range(ndEs):
        c_val = float(n)/ndEs
        if n == 0:
            plt.plot(all_avg_rough_times[n,:], 'o', ms=10, c=cm.spectral(c_val),label="matrix")
        else:
            plt.plot(all_avg_rough_times[n,:], 'o', ms=10, c=cm.spectral(c_val))
    plt.plot(smooth_timescales, 'k')
    plt.ylabel("Implied timescales")
Beispiel #25
0
    # y_savg.append(silhouette_avg)


    y_lower = 10
    for i in range(cluster):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.spectral(float(i) / cluster)
        ax.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax.set_title("The silhouette plot for the various clusters.")
    ax.set_xlabel("The silhouette coefficient values")
    ax.set_ylabel("Cluster label")

    # The vertical line for average silhoutte score of all the values
# Global matplotlib rc settings for axis/tick widths and markers; they
# apply to every figure created afterwards.
################################################################
plt.rc('axes',linewidth=0.75)# axis border width
plt.rc('xtick.major',width=0.75)# tick widths, same value as the axis border
plt.rc('ytick.major',width=0.75)
plt.rc('xtick.minor',width=0.75)
plt.rc('ytick.minor',width=0.75)
plt.rc('lines',markersize=4,markeredgewidth=0.0)# marker size, no marker outline

# Build a table of num_colors equally spaced colours from the spectral
# colormap (positions i/num_colors, alpha fixed to 1).
import matplotlib.cm as cm
num_colors = 9
colors = np.zeros([num_colors,4])# one RGBA row (4 values) per colour
for i in np.arange(num_colors):
    c = cm.spectral(i/float(num_colors),1)
    colors[i,:]=c
# Then pass color=colors[x,:] to the plot routine.

# Available marker styles
points = ['o','v','s','p','*','h','^','D','+','>','H','d','x','<']


###########################################################
#Two subplotting options: call subplot or manually set axes
###########################################################

####Using Subplot####
# Default margin settings (can be tweaked as needed)
left  = 0.2  # the left side of the subplots of the figure
right = 0.95    # the right side of the subplots of the figure
Beispiel #27
0
    def bench_k_means(self, data, name, save=False, path='', plot=True):
        """Silhouette analysis over an increasing number of KMeans clusters.

        For ``n_clusters`` = 2 .. 49 the data is clustered with
        ``KMeans(random_state=10)`` and the average silhouette score
        (metric ``'sqeuclidean'``) is printed.  The scan stops at the first
        ``n_clusters`` whose average score is below the 0.69 threshold and
        returns that value; if no score drops below the threshold the last
        value tried is returned.

        When ``plot`` is true, every cluster count that passed the
        threshold produces a two-panel figure (silhouette plot plus a
        scatter of the first two columns of ``data``) that is written to
        ``path + '/' + name + '_c_' + n_clusters + '.png'`` and then closed.

            :param data: 2-D array handed to ``KMeans.fit_predict``
                         (described as the transposed dataset by the caller)
            :param name: component name (gravity or body), used in file names
            :param save: NOTE(review): currently ignored -- figures are
                         written whenever ``plot`` is true; confirm the
                         intended behaviour before wiring it in
            :param path: directory where the plots are written
            :param plot: build and write the figures when true
            :return Koptimal: number of clusters at which the scan stopped"""

        threshold = 0.69

        X = data
        cmin = 2
        cmax = 50

        for n_clusters in range(cmin, cmax):
            # Fixed random seed (10) for reproducibility.
            clusterer = KMeans(n_clusters=n_clusters, random_state=10)
            cluster_labels = clusterer.fit_predict(X)

            # Average silhouette over all samples: a summary of the density
            # and separation of the formed clusters.
            silhouette_avg = silhouette_score(X,
                                              cluster_labels,
                                              metric='sqeuclidean')
            print("For n_clusters =", n_clusters,
                  "The average silhouette_score is :", silhouette_avg)

            Koptimal = n_clusters
            # Decide before any figure is created so that the early return
            # does not leave an empty figure open.
            if silhouette_avg < threshold:
                return Koptimal

            if not plot:
                # The per-sample scores are only needed for drawing.
                continue

            # NOTE(review): no metric is passed here while the average above
            # uses 'sqeuclidean' -- confirm whether that is intended.
            sample_silhouette_values = silhouette_samples(X, cluster_labels)

            # One row, two columns: silhouette plot and cluster scatter.
            fig, (ax1, ax2) = plt.subplots(1, 2)
            fig.set_size_inches(18, 7)

            # The silhouette coefficient ranges over [-1, 1]; only
            # [-0.1, 1] is shown.
            ax1.set_xlim([-0.1, 1])
            # (n_clusters + 1) * 10 leaves blank space between the
            # silhouette bands of the individual clusters.
            ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

            y_lower = 10
            for i in range(n_clusters):
                # Sorted silhouette scores of the samples in cluster i.
                ith_cluster_silhouette_values = \
                    sample_silhouette_values[cluster_labels == i]
                ith_cluster_silhouette_values.sort()

                size_cluster_i = ith_cluster_silhouette_values.shape[0]
                y_upper = y_lower + size_cluster_i

                color = cm.spectral(float(i) / n_clusters)
                ax1.fill_betweenx(arange(y_lower, y_upper),
                                  0,
                                  ith_cluster_silhouette_values,
                                  facecolor=color,
                                  edgecolor=color,
                                  alpha=0.7)

                # Label the band with its cluster number at mid-height.
                ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

                # Leave a 10-row gap before the next band.
                y_lower = y_upper + 10

            ax1.set_title("The silhouette plot for the various clusters.")
            ax1.set_xlabel("The silhouette coefficient values")
            ax1.set_ylabel("Cluster label")

            # Vertical line at the average silhouette score.
            ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

            ax1.set_yticks([])  # Clear the yaxis labels / ticks
            ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

            # Second panel: the clusters themselves.
            colors = cm.spectral(cluster_labels.astype(float) / n_clusters)
            ax2.scatter(X[:, 0],
                        X[:, 1],
                        marker='.',
                        s=30,
                        lw=0,
                        alpha=0.7,
                        c=colors)

            centers = clusterer.cluster_centers_
            # White discs at the cluster centres, numbered on top.
            ax2.scatter(centers[:, 0],
                        centers[:, 1],
                        marker='o',
                        c="white",
                        alpha=1,
                        s=200)

            for i, c in enumerate(centers):
                ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50)

            ax2.set_title("The visualization of the clustered data.")
            ax2.set_xlabel("Feature space for the 1st feature")
            ax2.set_ylabel("Feature space for the 2nd feature")

            plt.suptitle((
                "Silhouette analysis for KMeans clustering on sample data "
                "with n_clusters = %d" % n_clusters),
                         fontsize=14,
                         fontweight='bold')
            plt.savefig(path + '/' + str(name) + "_c_" + str(n_clusters) +
                        '.png')
            # Release the figure; otherwise up to 48 figures stay open.
            plt.close(fig)

        return Koptimal
def plot_insertions_two_panels(fname, seqs, gene, domain, tax, id2name):
    """
    plot insertions wrt model positions
    2 panels:
        1. insertions with ORF
            - triangle if has intron
            - circle if no intron
        2. insertion does not have ORF
            - triangle if has intron
            - circle if no intron
    # seqs[id] = [gene, model, [[i-gene_pos, i-model_pos, i-length, orf, intron], ...]]

    Only sequences whose gene and model match ``gene`` / ``domain`` and
    that carry at least one insertion are drawn.  Marker edge colours
    encode the taxon looked up through ``tax[id2name[id]]`` ('n/a' when
    ``tax`` is False or the name is not in ``tax``).  The figure is written
    to '<fname without its last extension>.<domain>-<gene>rRNAgene-insertions.pdf'.
    """
    # import
    import matplotlib
    import matplotlib.pyplot as plt
    import numpy as np
    from matplotlib.backends.backend_pdf import PdfPages
    import matplotlib.cm as cm
    plt.rcParams['pdf.fonttype'] = 42  # illustrator pdf
    # axis
    max_insert = max_insertion(seqs, gene, domain)
    height = max_insert + max_insert * 0.10  # 10% headroom above the longest insertion
    xmax = model_length(gene, domain)
    f, axarr = plt.subplots(2, sharex=True, sharey=True)
    plt.axis([0, xmax, 0, height])
    plt.xticks(np.arange(0, xmax, 100), rotation=45)
    plt.yticks(np.arange(0, height, 100))
    # labels
    axarr[0].set_title('encodes ORF')
    axarr[1].set_title('does not encode ORF')
    plt.suptitle('%s %s rRNA gene insertions' % (domain, gene))
    plt.ylabel('insertion length (bp)')
    plt.xlabel('position on %s %s rRNA gene model' % (domain, gene))
    # colors: one evenly spaced colormap entry per taxon, 'n/a' always present
    if tax is False:
        taxa = ['n/a']
    else:
        taxa = sorted({
            tax[id2name[i]] for i, j in seqs.items()
            if j[2] != [] and id2name[i] in tax
        })
        if 'n/a' not in taxa:
            taxa.append('n/a')
    colors = cm.spectral(np.linspace(0, 1, len(taxa)))
    color2tax = dict(zip(taxa, colors))
    # plot
    for name, seq in seqs.items():
        g, d = seq[0], seq[1]
        if g != gene or d != domain or seq[2] == []:
            continue
        if tax is False or id2name[name] not in tax:
            t = 'n/a'
        else:
            t = tax[id2name[name]]
        c = color2tax[t]
        for ins in seq[2]:
            x, y = int(ins[1]), int(ins[2])
            # Explicit "== True" comparisons are kept on purpose: the flag
            # types are not visible here and truthiness could differ.
            marker = '^' if ins[4] == True else 'o'  # triangle marks an intron
            size = 30
            panel = axarr[0] if ins[3] == True else axarr[1]  # ORF -> top panel
            panel.scatter(x, y, marker=marker, s=size, facecolors='none',
                          clip_on=False, edgecolors=c, label=t)
    # legend: one coloured box per taxon
    boxes = [
        matplotlib.patches.Rectangle((0, 0), 1, 1, fc=color2tax[t])
        for t in taxa
    ]
    names = list(taxa)
    plt.legend(boxes,
               names,
               prop={'size': 10},
               loc='center left',
               bbox_to_anchor=(1, 0.5),
               scatterpoints=1)
    # save
    figure = plt.gcf()
    figure.set_size_inches(20, 12)
    # Strip only the last extension (rsplit with maxsplit=1), matching
    # plot_insertions; a bare rsplit('.') cut the name at its first dot.
    pdf_path = '%s.%s-%srRNAgene-insertions.pdf' % (
        fname.rsplit('.', 1)[0], domain, gene)
    # The context manager closes the PDF even if savefig raises.
    with PdfPages(pdf_path) as pdf:
        pdf.savefig()
    plt.close()
Beispiel #29
0
def run_silhouette_analysis(**kargs):
    """Run a KMeans silhouette analysis for several cluster counts.

    For every candidate ``n_clusters`` the data is clustered with
    ``KMeans(random_state=10)``, the average silhouette score is printed
    and a two-panel figure (silhouette plot plus a scatter of the first two
    columns) is saved as
    ``<outputdir>/silhouette_test-<identifier>-nC<n_clusters>.tif``.

    Keyword arguments:
        X: 2-D array with more than one row (required).
        range_n_clusters: iterable of cluster counts to try; defaults to
            ``range(2, max(2, N // 2), 5)`` with ``N = X.shape[0]``.
        n_clusters: extra cluster count that is added to the candidates
            when not already present.
        reduce_dimension: when true and the optional ``learn_manifold``
            module can be imported, ``X`` is first passed through
            ``learn_manifold.tsne``.
        identifier: tag used in output file names.
        outputdir: target directory; defaults to ``<cwd>/plot`` and is
            created when missing.

    Returns:
        The ``n_clusters`` of the first entry after sorting the
        ``(n_clusters, average score)`` pairs by ascending absolute score.
    """
    from sklearn.metrics import silhouette_samples, silhouette_score
    # import matplotlib.cm as cm
    # learn_manifold is optional; run without dimensionality reduction when
    # it cannot be loaded (but do not swallow KeyboardInterrupt/SystemExit).
    try:
        import learn_manifold
        tFoundManifoldMethod = True
    except Exception:
        tFoundManifoldMethod = False

    # [params] input
    X = kargs['X']
    assert X is not None and X.shape[0] > 1
    N = X.shape[0]
    # Floor division keeps this an int on Python 3, as range() requires.
    n_clusters_max = max(2, N // 2)

    # Copy into a list: a range object has no append(), and the caller's
    # own list must not be modified below.
    range_n_clusters = list(
        kargs.get('range_n_clusters', range(2, n_clusters_max, 5)))
    n_clusters_min, n_clusters_max = min(range_n_clusters), max(
        range_n_clusters)
    identifier = kargs.get('identifier',
                           'nCm%d_M%d' % (n_clusters_min, n_clusters_max))
    dim0 = X.shape[1]
    if kargs.get(
            'reduce_dimension', False
    ) and tFoundManifoldMethod:  # dimensionality reduction prior to the analysis
        Xp = learn_manifold.tsne(X,
                                 identifier=identifier)  # use t-SNE by default
        print('run_silhouette_analysis> dim of X from %d to %d' %
              (dim0, Xp.shape[1]))
    else:
        Xp = X

    # Use 'Xp' from this point onwards

    # [params]
    n_clusters_requested = kargs.get('n_clusters', None)
    if n_clusters_requested is not None:
        if n_clusters_requested not in range_n_clusters:
            range_n_clusters.append(n_clusters_requested)
    print('param> input n_clusters (requested): %s > range_n_clusters: %s' %
          (n_clusters_requested, range_n_clusters))

    # identifier (default recomputed now that the candidate list is final)
    identifier = kargs.get(
        'identifier',
        'nR%s-%s' % (min(range_n_clusters), max(range_n_clusters)))
    outputdir = kargs.get('outputdir', os.path.join(os.getcwd(), 'plot'))
    if not os.path.exists(outputdir):
        os.makedirs(outputdir)  # base directory

    ranked_scores = []
    for n_clusters in range_n_clusters:
        # A fresh figure per cluster count, closed again after saving.  The
        # former plt.clf() is gone: it cleared whichever figure happened to
        # be current, including one owned by the caller.
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        # The silhouette coefficient ranges over [-1, 1]; only [-0.1, 1]
        # is shown.
        ax1.set_xlim([-0.1, 1])
        # (n_clusters + 1) * 10 leaves blank space between the silhouette
        # bands of the individual clusters.
        ax1.set_ylim([0, len(Xp) + (n_clusters + 1) * 10])

        # Fixed random seed (10) for reproducibility.
        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        cluster_labels = clusterer.fit_predict(Xp)

        # Average silhouette over all samples.
        silhouette_avg = silhouette_score(Xp, cluster_labels)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)
        ranked_scores.append((n_clusters, silhouette_avg))

        # Silhouette score of every individual sample.
        sample_silhouette_values = silhouette_samples(Xp, cluster_labels)

        y_lower = 10
        for i in range(n_clusters):
            # Sorted silhouette scores of the samples in cluster i.
            ith_cluster_silhouette_values = \
                sample_silhouette_values[cluster_labels == i]
            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.spectral(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0,
                              ith_cluster_silhouette_values,
                              facecolor=color,
                              edgecolor=color,
                              alpha=0.7)

            # Label the band with its cluster number at mid-height.
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # Leave a 10-row gap before the next band.
            y_lower = y_upper + 10

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # Vertical line at the average silhouette score.
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # Second panel: the clusters themselves.
        colors = cm.spectral(cluster_labels.astype(float) / n_clusters)
        ax2.scatter(Xp[:, 0],
                    Xp[:, 1],
                    marker='.',
                    s=30,
                    lw=0,
                    alpha=0.7,
                    c=colors)

        centers = clusterer.cluster_centers_
        # White discs at the cluster centres, numbered on top.
        ax2.scatter(centers[:, 0],
                    centers[:, 1],
                    marker='o',
                    c="white",
                    alpha=1,
                    s=200)

        for i, c in enumerate(centers):
            ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50)

        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel("Feature space for the 2nd feature")

        plt.suptitle(
            ("Silhouette analysis for KMeans clustering on sample data "
             "with n_clusters = %d" % n_clusters),
            fontsize=14,
            fontweight='bold')

        # plt.show()
        graph_ext = 'tif'
        fpath = os.path.join(
            outputdir,
            'silhouette_test-%s-nC%s.%s' % (identifier, n_clusters, graph_ext))
        print('output> saving silhouette test result to %s' % fpath)
        plt.savefig(fpath)
        # Release the figure so they do not pile up across iterations.
        plt.close(fig)
    ### end range of n_clusters

    # NOTE(review): ascending order of |score| puts the score closest to 0
    # first, so the value returned below belongs to the *lowest* absolute
    # silhouette.  Kept as-is; confirm whether the best-scoring cluster
    # count was meant instead.
    ranked_scores = sorted(ranked_scores,
                           key=lambda x: abs(x[1]),
                           reverse=False)  # reverse=False => ascending
    print('output> ranked scores (n_clusters vs average score):\n%s\n' %
          ranked_scores)

    return ranked_scores[0][0]
Beispiel #30
0
def run_kmodes(syms, X, n, alpha):
    """Show the silhouette plot for a cached ``n``-cluster result.

    Looks for ``"<n>_CLUSTERS.pkl"`` in the current directory.  When the
    file exists, the pickled ``(X_ENC, clusters, centroids)`` triple is
    loaded, silhouette scores are computed with the ``simple_compare``
    metric, and the per-cluster silhouette plot is displayed with
    ``plt.show()``.  When the file does not exist nothing is done.

    ``syms``, ``X`` and ``alpha`` are not used by the code visible here.

    :param n: number of clusters; selects the cache file and the number of
        silhouette bands drawn.
    :return: ``(np.amax(clusters) + 1, sil_avg)`` when the cache file
        exists, otherwise ``None``.
    """
    cache_path = "%d_CLUSTERS.pkl" % (n, )
    if not os.path.isfile(cache_path):
        return None

    # The with-block closes the file handle, which was previously leaked.
    # NOTE(review): mode "r" is kept from the original; Python 3 needs "rb"
    # for pickle.load -- confirm how the cache file is written before
    # changing it.  pickle can execute code on load, so the cache file must
    # come from a trusted source.
    with open(cache_path, "r") as cache_file:
        X_ENC, clusters, _centroids = pickle.load(cache_file)

    # A single axes holding the silhouette plot.
    fig, ax1 = plt.subplots(1, 1)
    fig.set_size_inches(18, 7)

    # The silhouette coefficient ranges over [-1, 1]; only [-0.1, 1] is
    # shown.
    ax1.set_xlim([-0.1, 1])
    # (n + 1) * 10 leaves blank space between the silhouette bands of the
    # individual clusters.
    ax1.set_ylim([0, len(X_ENC) + (n + 1) * 10])

    sil_avg = silhouette_score(X_ENC, clusters, metric=simple_compare)
    print("For n_clusters =", n, "The average silhouette_score is :",
          sil_avg)
    sample_silhouette_values = silhouette_samples(X_ENC,
                                                  clusters,
                                                  metric=simple_compare)

    y_lower = 10
    for i in range(n):
        # Sorted silhouette scores of the samples in cluster i.
        ith_cluster_silhouette_values = sample_silhouette_values[clusters
                                                                 == i]
        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.spectral(float(i) / n)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0,
                          ith_cluster_silhouette_values,
                          facecolor=color,
                          edgecolor=color,
                          alpha=0.7)

        # Label the band with its cluster number at mid-height.
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Leave a 10-row gap before the next band.
        y_lower = y_upper + 10

    # Parenthesised single-argument form prints the same on Python 2 and 3.
    print(sample_silhouette_values)

    ax1.set_title("The silhouette plot for %d clusters." % (n, ))
    ax1.set_xlabel("The silhouette coefficient values (AVF = %f)" %
                   (sil_avg, ))
    ax1.set_ylabel("Cluster label")
    # Vertical line at the average silhouette score.
    ax1.axvline(x=sil_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # NOTE(review): window.state('zoomed') presumably only works with
    # specific GUI backends/platforms -- confirm before relying on it.
    mng = plt.get_current_fig_manager()
    mng.window.state('zoomed')

    plt.show()
    return (np.amax(clusters) + 1, sil_avg)
def plot_insertions(fname, seqs, gene, domain, tax, id2name):
    """
    plot insertions wrt model positions
    4 panels:
        1. insertion encodes ORF and intron
        2. insertion encodes ORF, no intron
        3. insertion encodes intron, no ORF
        4. no intron, no ORF
    # seqs[id] = [gene, model, [[i-gene_pos, i-model_pos, i-length, ..., orf, intron, families], ...]]
    # (orf, intron and the family dict are read from the last three fields)

    Only sequences whose gene and model match ``gene`` / ``domain`` and
    that carry at least one insertion are drawn.  Marker edge colours
    encode the taxon looked up through ``tax[id2name[id]]`` ('n/a' when
    ``tax`` is False or the name is not in ``tax``).  Insertions with an
    ORF use the marker returned by ``setup_markers`` for their family; all
    others are circles.  The figure is written to
    '<fname without its last extension>.<domain>-<gene>rRNAgene-insertions.pdf'.
    """
    # import
    import matplotlib
    import matplotlib.pyplot as plt
    import numpy as np
    from matplotlib.backends.backend_pdf import PdfPages
    import matplotlib.cm as cm
    plt.rcParams['pdf.fonttype'] = 42  # illustrator pdf
    # axis
    max_insert = max_insertion(seqs, gene, domain)
    height = max_insert + max_insert * 0.10  # 10% headroom above the longest insertion
    xmax = model_length(gene, domain)
    f, axarr = plt.subplots(4, sharex=True, sharey=True)
    plt.axis([0, xmax, 0, height])
    plt.xticks(np.arange(0, xmax, 100), rotation=45)
    plt.yticks(np.arange(0, height, 200))
    # labels
    axarr[0].set_title('encodes ORF and intron')
    axarr[1].set_title('encodes ORF, no intron')
    axarr[2].set_title('encodes intron, no ORF')
    axarr[3].set_title('no intron, no ORF')
    plt.suptitle('%s %s rRNA gene insertions' % (domain, gene))
    plt.ylabel('insertion length (bp)')
    plt.xlabel('position on %s %s rRNA gene model' % (domain, gene))
    # colors: one evenly spaced colormap entry per taxon, 'n/a' always present
    if tax is False:
        taxa = ['n/a']
    else:
        taxa = sorted({
            tax[id2name[i]] for i, j in seqs.items()
            if j[2] != [] and id2name[i] in tax
        })
        if 'n/a' not in taxa:
            taxa.append('n/a')
    colors = cm.spectral(np.linspace(0, 1, len(taxa)))
    # There is exactly one colour row per taxon, so a plain zip pairs them;
    # this avoids itertools.cycle, which was never imported in this function.
    color2tax = dict(zip(taxa, colors))
    # markers
    markers = setup_markers(seqs)
    # plot
    for name, seq in seqs.items():
        g, d = seq[0], seq[1]
        if g != gene or d != domain or seq[2] == []:
            continue
        if tax is False or id2name[name] not in tax:
            t = 'n/a'
        else:
            t = tax[id2name[name]]
        c = color2tax[t]
        for ins in seq[2]:
            # A family is only assigned when exactly one non-'n/a' value
            # is present in the insertion's last field.
            family = [i for i in ins[-1].values() if i != 'n/a']
            if len(family) != 1:
                family = 'n/a'
            else:
                family = family[0]
            x, y = int(ins[1]), int(ins[2])
            orf, intron = ins[-3], ins[-2]
            if orf is True:
                p = 0 if intron is True else 1
                marker, size = markers[family]
            else:
                p = 2 if intron is True else 3
                marker, size = 'o', 30
            axarr[p].scatter(x, y,
                             edgecolors=c, marker=marker, s=size, label=family,
                             facecolors='none', clip_on=False)
    # legend 1: one entry per distinct family label from the two ORF panels
    handles, labels = [], []
    for ax in axarr[0:2]:
        hs, ls = ax.get_legend_handles_labels()
        for h, l in zip(hs, ls):
            if l in labels:
                continue
            handles.append(h)
            labels.append(l)
    l1 = plt.legend(handles, labels, scatterpoints=1,
                    prop={'size': 10}, loc='upper left', bbox_to_anchor=(1, 0.5))
    # legend 2: one coloured box per taxon
    names = list(taxa)
    boxes = [
        matplotlib.patches.Rectangle((0, 0), 1, 1, fc=color2tax[t])
        for t in taxa
    ]
    plt.legend(boxes, names, scatterpoints=1,
               prop={'size': 10}, loc='lower left', bbox_to_anchor=(1, 0.5))
    plt.gca().add_artist(l1)  # add l1 as a separate legend
    # save
    figure = plt.gcf()
    figure.set_size_inches(12, 12)
    pdf_path = '%s.%s-%srRNAgene-insertions.pdf' % (
        fname.rsplit('.', 1)[0], domain, gene)
    # The context manager closes the PDF even if savefig raises.
    with PdfPages(pdf_path) as pdf:
        pdf.savefig()
    plt.close()
Beispiel #32
0
    def plot_silhouette(self, reduced_data, **kwargs):
        """Draw the silhouette analysis for ``self.clusterer``.

        ``self.clusterer.fit_predict`` is called on ``reduced_data`` (so the
        clusterer is refitted), the average silhouette score is printed and
        a two-panel figure is built: the per-cluster silhouette plot and a
        scatter of the first two columns of ``reduced_data`` with the
        cluster centres marked.

        Code adapted from
        http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html

        :param reduced_data: 2-D array of samples to cluster and plot.
        :param kwargs: ``no_display`` -- when true the figure is saved to
            ``kwargs['img_name']`` and closed instead of being shown;
            defaults to showing the figure when omitted.
        :raise KeyError: if ``no_display`` is true and ``img_name`` is
            missing.
        """
        # One row, two columns: silhouette plot and cluster scatter.
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        # The silhouette coefficient ranges over [-1, 1]; only [-0.1, 1]
        # is shown.
        ax1.set_xlim([-0.1, 1])
        # (n_clusters + 1) * 10 leaves blank space between the silhouette
        # bands of the individual clusters.
        ax1.set_ylim([0, len(reduced_data) + (self.n_clusters + 1) * 10])

        cluster_labels = self.clusterer.fit_predict(reduced_data)

        # Average silhouette over all samples.
        silhouette_avg = silhouette_score(reduced_data, cluster_labels)
        print("For n_clusters =", self.n_clusters,
              "The average silhouette_score is :", silhouette_avg)

        # Silhouette score of every individual sample.
        sample_silhouette_values = silhouette_samples(reduced_data,
                                                      cluster_labels)

        y_lower = 10
        for i in range(self.n_clusters):
            # Sorted silhouette scores of the samples in cluster i.
            ith_cluster_silhouette_values = \
                sample_silhouette_values[cluster_labels == i]
            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.spectral(float(i) / self.n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0,
                              ith_cluster_silhouette_values,
                              facecolor=color,
                              edgecolor=color,
                              alpha=0.7)

            # Label the band with its cluster number at mid-height.
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # Leave a 10-row gap before the next band.
            y_lower = y_upper + 10

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # Vertical line at the average silhouette score.
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # Second panel: the clusters themselves.
        colors = cm.spectral(cluster_labels.astype(float) / self.n_clusters)
        ax2.scatter(reduced_data[:, 0],
                    reduced_data[:, 1],
                    marker='.',
                    s=30,
                    lw=0,
                    alpha=0.7,
                    c=colors)

        centers = self.clusterer.cluster_centers_
        # White discs at the cluster centres, numbered on top.
        ax2.scatter(centers[:, 0],
                    centers[:, 1],
                    marker='o',
                    c="white",
                    alpha=1,
                    s=200)

        for i, c in enumerate(centers):
            ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50)

        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel("Feature space for the 2nd feature")

        plt.suptitle(
            ("Silhouette analysis for KMeans clustering on sample data "
             "with n_clusters = %d" % self.n_clusters),
            fontsize=14,
            fontweight='bold')

        # no_display is optional: omitting it used to raise KeyError.
        if kwargs.get('no_display', False):
            plt.savefig(kwargs['img_name'])
            # The figure is never shown in this mode, so release it.
            plt.close(fig)
        else:
            plt.show()
0
def compare_silhoutte_scores(dfi,
                             samples,
                             range_n_clusters,
                             cluster_dim='features'):
    """Plot a KMeans silhouette analysis for each requested cluster count.

    Source code obtained and modified from :-
    http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html

    For every entry of ``range_n_clusters`` a two-panel figure is shown:
    the sorted per-sample silhouette values grouped by cluster on the left,
    and a scatter of the first two columns of the clustered matrix, with the
    cluster centers marked, on the right. The average silhouette score is
    printed for each cluster count.

    Parameters
    ----------
    dfi : pandas dataframe
       Input dataframe with features as rows and samples as columns.
       Missing values are replaced by 0 before clustering.
    samples : list of str
       Names of samples (columns of ``dfi``) to use.
    range_n_clusters : list of int
       The list of cluster numbers for which the silhouette score is to be
       computed.
    cluster_dim : Optional[str]
       Dimension along which data is to be clustered. Default is along
       features. To cluster samples, set cluster_dim='samples'.

    Returns
    -------
    None
        Each figure is displayed with ``plt.show()``.
    """
    X = dfi.fillna(0)[samples].values
    if cluster_dim == 'samples':
        X = X.T

    for n_clusters in range_n_clusters:
        # One row, two columns: silhouette plot (ax1) and cluster scatter (ax2)
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        # The silhouette coefficient can range from -1 to 1; the x axis is
        # restricted to [-0.1, 1] as in the scikit-learn example.
        ax1.set_xlim([-0.1, 1])
        # The (n_clusters + 1) * 10 extra rows leave blank space between the
        # silhouette plots of individual clusters.
        ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

        # Fixed random_state for reproducibility.
        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        cluster_labels = clusterer.fit_predict(X)

        # Average silhouette value over all samples
        silhouette_avg = silhouette_score(X, cluster_labels)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)

        # Silhouette value of every individual sample
        sample_silhouette_values = silhouette_samples(X, cluster_labels)

        y_lower = 10
        for i in range(n_clusters):
            # Silhouette values of the samples in cluster i, sorted ascending
            ith_cluster_silhouette_values = \
                sample_silhouette_values[cluster_labels == i]
            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            # cm.spectral was removed in Matplotlib 2.2; nipy_spectral is the
            # same colormap under its current name.
            color = cm.nipy_spectral(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0,
                              ith_cluster_silhouette_values,
                              facecolor=color,
                              edgecolor=color,
                              alpha=0.7)

            # Label the silhouette plot with its cluster number at the middle
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # Leave a 10-row gap before the next cluster
            y_lower = y_upper + 10

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # Vertical line at the average silhouette score of all the values
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # 2nd plot: the clustered rows, using only their first two columns
        colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
        ax2.scatter(X[:, 0],
                    X[:, 1],
                    marker='.',
                    s=30,
                    lw=0,
                    alpha=0.7,
                    c=colors,
                    edgecolor='k')

        # White circles at the cluster centers ...
        centers = clusterer.cluster_centers_
        ax2.scatter(centers[:, 0],
                    centers[:, 1],
                    marker='o',
                    c="white",
                    alpha=1,
                    s=200,
                    edgecolor='k')

        # ... each overprinted with its cluster index
        for i, c in enumerate(centers):
            ax2.scatter(c[0],
                        c[1],
                        marker='$%d$' % i,
                        alpha=1,
                        s=50,
                        edgecolor='k')

        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel("Feature space for the 2nd feature")

        plt.suptitle(
            ("Silhouette analysis for KMeans clustering on sample data "
             "with n_clusters = %d" % n_clusters),
            fontsize=14,
            fontweight='bold')

        plt.show()
Beispiel #34
0
    print n_clusters, silhouette_avg

    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)
    ax1.set_xlim([-0.1, 1])
    ax1.set_ylim([0, len(all) + (n_clusters + 1) * 10])

    y_lower = 10
    for i in range(n_clusters):
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhoutte score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
Beispiel #35
0
    # Compute the silhouette value of every individual sample
    sample_silhouette_values = silhouette_samples(scaledX, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        # Silhouette values of the samples in cluster i, sorted ascending
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        # cm.spectral was removed in Matplotlib 2.2; nipy_spectral is the
        # same colormap under its current name.
        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0,
                          ith_cluster_silhouette_values,
                          facecolor=color,
                          edgecolor=color,
                          alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Leave a 10-row gap before the next cluster
        y_lower = y_upper + 10

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
Beispiel #36
0
def plot_silhouette(clf,
                    X,
                    title='Silhouette Analysis',
                    metric='euclidean',
                    copy=True,
                    ax=None):
    """Plots silhouette analysis of clusters using fit_predict.

    Args:
        clf: Clusterer instance that implements ``fit`` and ``fit_predict`` methods.

        X (array-like, shape (n_samples, n_features)):
            Data to cluster, where n_samples is the number of samples and
            n_features is the number of features.

        title (string, optional): Title of the generated plot. Defaults to "Silhouette Analysis"

        metric (string or callable, optional): The metric to use when calculating distance
            between instances in a feature array. If metric is a string, it must be one of
            the options allowed by sklearn.metrics.pairwise.pairwise_distances. If X is
            the distance array itself, use "precomputed" as the metric.

        copy (boolean, optional): Determines whether ``fit_predict`` is called on **clf**
            itself or on a clone of **clf** (the default, which leaves the
            passed-in instance unfitted).

        ax (:class:`matplotlib.axes.Axes`, optional): The axes upon which to plot
            the silhouette analysis. If None, the plot is drawn on a new set of axes.

    Returns:
        ax (:class:`matplotlib.axes.Axes`): The axes on which the plot was drawn.

    Example:
        >>> import scikitplot.plotters as skplt
        >>> kmeans = KMeans(n_clusters=4, random_state=1)
        >>> skplt.plot_silhouette(kmeans, X)
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fe967d64490>
        >>> plt.show()

        .. image:: _static/examples/plot_silhouette.png
           :align: center
           :alt: Silhouette Plot
    """
    if copy:
        clf = clone(clf)

    cluster_labels = clf.fit_predict(X)

    # The loop below looks clusters up as 0 .. n_clusters - 1, so labels are
    # expected to be consecutive integers starting at 0.
    n_clusters = len(set(cluster_labels))

    silhouette_avg = silhouette_score(X, cluster_labels, metric=metric)

    sample_silhouette_values = silhouette_samples(X,
                                                  cluster_labels,
                                                  metric=metric)

    if ax is None:
        fig, ax = plt.subplots(1, 1)

    ax.set_title(title)
    ax.set_xlim([-0.1, 1])

    # Room for every sample plus a 10-row gap around each cluster
    ax.set_ylim([0, len(X) + (n_clusters + 1) * 10 + 10])

    ax.set_xlabel('Silhouette coefficient values')
    ax.set_ylabel('Cluster label')

    y_lower = 10

    for i in range(n_clusters):
        # Silhouette values of the samples in cluster i, sorted ascending
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels
                                                                 == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        # cm.spectral was removed in Matplotlib 2.2; nipy_spectral is the
        # same colormap under its current name.
        color = cm.nipy_spectral(float(i) / n_clusters)

        ax.fill_betweenx(np.arange(y_lower, y_upper),
                         0,
                         ith_cluster_silhouette_values,
                         facecolor=color,
                         edgecolor=color,
                         alpha=0.7)

        # Cluster number at the vertical middle of its band
        ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        y_lower = y_upper + 10

    # Dashed line at the average score; its label feeds the legend below
    ax.axvline(x=silhouette_avg,
               color="red",
               linestyle="--",
               label='Silhouette score: {0:0.3f}'.format(silhouette_avg))

    ax.set_yticks([])  # Clear the y-axis labels / ticks
    ax.set_xticks(np.arange(-0.1, 1.0, 0.2))

    ax.legend(loc='best')

    return ax
Beispiel #37
0
# Keep the 'PEsx1' / 'SCsx1' arrays of the file loaded just before this point.
PEsx1_ic8 = datafile['PEsx1']
SCsx1_ic8 = datafile['SCsx1']

# Load the IC16 scan and keep the same two arrays.
fileheader = 'PE_SC_DPDoubPen_LsEq1_MsEq1_g9p81_tstep001_icscanIC16_embeddelay5_999_delays'
datafile = loadnpzfile(datadir + fileheader + npz)
PEsx1_ic16 = datafile['PEsx1']
SCsx1_ic16 = datafile['SCsx1']

# Time axis from 0 to tmax inclusive in steps of dt.
tmax, dt = 100, 0.001
t = np.arange(0, tmax + dt, dt)
# NOTE(review): 0.001 equals dt above; presumably this converts delay
# indices to time -- confirm against where delayindex is defined.
timeindex = delayindex * 0.001

import matplotlib.cm as cm
# 20 RGBA colours (alpha = 1) sampled evenly from the colormap.
# cm.spectral was removed in Matplotlib 2.2; nipy_spectral is the same
# colormap under its current name.
colors = np.zeros([20, 4])
for i in np.arange(20):
    c = cm.nipy_spectral(i / 20., 1)
    colors[i, :] = c
# Marker styles available for the plotted series.
points = ['o', 'v', 's', 'p', '*', 'h', '^', 'D', '+', '>', 'H', 'd', 'x', '<']

# Global Matplotlib styling: thick axes/ticks, borderless markers.
plt.rc('axes', linewidth=2.0)
plt.rc('xtick.major', width=2.0)
plt.rc('ytick.major', width=2.0)
plt.rc('xtick.minor', width=2.0)
plt.rc('ytick.minor', width=2.0)
plt.rc('lines', markersize=8, markeredgewidth=0.0, linewidth=2.0)
#plt.rcParams['ps.fonttype'] = 3
fig = plt.figure(num=1, figsize=(7, 6), dpi=600, facecolor='w', edgecolor='k')
left = 0.15  # the left side of the subplots of the figure
right = 0.94  # the right side of the subplots of the figure
bottom = 0.1  # the bottom of the subplots of the figure
top = 0.96  # the top of the subplots of the figure
def plot_scores_and_clusters_from_pca(X, corpus):
    """Plot silhouette scores and clusters for KMeans with 2 to 6 clusters.

    Adapted from http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html

    For each cluster count the function:

    * fits KMeans on ``X`` and writes the predicted labels next to
      ``corpus`` to ``rd-pca-labeled-clusters-<n>.csv``,
    * prints the average silhouette score,
    * draws the per-cluster silhouette plot and a scatter of the first two
      columns of ``X`` with the cluster centers marked,
    * shows the figure without blocking and saves it to
      ``rd-pca-clusters-<n>.png``.

    Args:
        X: 2-D array of samples to cluster; columns 0 and 1 are used for the
            scatter plot.
        corpus: Sequence stored in the ``text`` column of the CSV; it is
            paired with the labels, so presumably one entry per row of ``X``.

    Returns:
        None.
    """
    range_n_clusters = [2, 3, 4, 5, 6]
    for n_clusters in range_n_clusters:
        # One row, two columns: silhouette plot (ax1) and cluster scatter (ax2)
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)
        fig.subplots_adjust(wspace=0.3)  # Adjust width space between subplots
        # The silhouette coefficient can range from -1 to 1; the x axis is
        # restricted to [-0.2, 1] here.
        ax1.set_xlim([-0.2, 1])
        # The (n_clusters + 1) * 10 extra rows leave blank space between the
        # silhouette plots of individual clusters.
        ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])
        # No random_state is set, so results may differ between runs.
        clusterer = KMeans(n_clusters=n_clusters)
        cluster_labels = clusterer.fit_predict(X)

        # Save the data together with the predicted classes
        newdata = pd.DataFrame({'label' : cluster_labels, 'text' : corpus})
        newdata.to_csv('rd-pca-labeled-clusters-' + str(n_clusters) + '.csv', encoding='utf-8')

        # Average silhouette value over all samples
        silhouette_avg = silhouette_score(X, cluster_labels)
        print("PCA: For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)
        # Silhouette value of every individual sample
        sample_silhouette_values = silhouette_samples(X, cluster_labels)
        y_lower = 10
        for i in range(n_clusters):
            # Silhouette values of the samples in cluster i, sorted ascending
            ith_cluster_silhouette_values = \
                sample_silhouette_values[cluster_labels == i]
            ith_cluster_silhouette_values.sort()
            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i
            # cm.spectral was removed in Matplotlib 2.2; nipy_spectral is the
            # same colormap under its current name.
            color = cm.nipy_spectral(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0, ith_cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)
            # Label the silhouette plots with their cluster numbers at the middle
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
            # Leave a 10-row gap before the next cluster
            y_lower = y_upper + 10
        ax1.set_title("Silhouette plot for the various clusters", fontsize=11)
        ax1.set_xlabel("Silhouette coefficient values, Avg: " + str(round(silhouette_avg, 4)), fontsize=11)
        ax1.set_ylabel("Cluster label", fontsize=11)
        # The vertical line for average silhouette score of all the values
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([-0.2, 0, 0.2, 0.4, 0.6, 0.8, 1])
        # 2nd plot: the clustered rows, using only their first two columns
        colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
        ax2.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                    c=colors)
        # White circles at the cluster centers ...
        centers = clusterer.cluster_centers_
        ax2.scatter(centers[:, 0], centers[:, 1],
                    marker='o', c="white", alpha=1, s=200)
        # ... each overprinted with its cluster index
        for i, c in enumerate(centers):
            ax2.scatter(c[0], c[1], marker='$%  d$' % i, alpha=1, s=50)
        ax2.set_title("Visualization of the clustered data", fontsize=11)
        ax2.set_xlabel("Feature space for the 1st feature", fontsize=11)
        ax2.set_ylabel("Feature space for the 2nd feature", fontsize=11)
        plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                      "with n_clusters = %d" % n_clusters),
                     fontsize=12, fontweight='bold')
        # Non-blocking show keeps the figure current so it can still be saved
        plt.show(block=False)
        plt.savefig('rd-pca-clusters-' + str((n_clusters)) + '.png', dpi=600)
Beispiel #39
0
def plot_ellipses(ds, sts, x=0, y=1, ci=.95, labels=None,
                   cmap=None,scat=False,linestyle=None, nude=False,
                    axes='off', fig=None, **kwargs):

    """Draw one confidence ellipse per group of bootstrap samples.

    ``ds.samples`` must be a 3-D array indexed as
    ``[group, component, bootstrap sample]``. For every group the points
    formed by components ``x`` and ``y`` are summarised by an ellipse centred
    on their mean, with axes taken from the eigen-decomposition of their
    covariance and scaled by the chi-square quantile ``chi2.ppf(ci, 2)``.
    Two grey arrows through the origin serve as coordinate axes.

    Parameters
    ----------
    ds : dataset-like
        Provides ``samples`` (see above), ``targets`` (default labels) and
        ``sa`` (optionally holding a 'linestyle' entry).
        Presumably the bootstrapped factor scores -- confirm with caller.
    sts : object
        Provides ``eigv`` and ``inertia``, indexed by component, used for
        the text annotations.
    x, y : int
        Indices of the components drawn on the horizontal / vertical axis.
    ci : float
        Confidence level of the ellipses.
    labels : sequence, optional
        One label per group; defaults to ``list(ds.targets)``.
    cmap : sequence of colors, optional
        One color per group; defaults to evenly spaced nipy_spectral colors.
    scat : bool
        If true, also scatter the individual points of each group.
    linestyle : sequence, optional
        One line style per group; defaults to ``ds.sa['linestyle']`` when
        present, otherwise 'solid' for every group.
    nude : bool
        If true, omit all text annotations.
    axes : str
        Passed to ``plt.axis`` last (default 'off' hides the frame).
    fig : int or Figure, optional
        Passed to ``plt.figure`` to select or create the figure.
    **kwargs
        Forwarded to ``plt.plot`` for the ellipse outlines.

    Returns
    -------
    int
        The number of the figure that was drawn on.
    """

    f = plt.figure(fig)
    f.gca()  # the original also requested the figure's current axes here
    boot = ds.samples
    n_groups, _, _ = boot.shape  # also enforces a 3-D array

    # `is None`: comparing a user-supplied colour array with == is
    # elementwise and cannot be used as a condition.
    if cmap is None:
        # cm.spectral was removed in Matplotlib 2.2; nipy_spectral is the
        # same colormap under its current name.
        cmap = cm.nipy_spectral(np.linspace(.2, .85, n_groups))
    if linestyle is None:
        # `in` replaces dict.has_key, which no longer exists on Python 3.
        # NOTE(review): assumes ds.sa supports the mapping protocol -- confirm.
        if 'linestyle' in ds.sa:
            linestyle = list(ds.sa['linestyle'])
        else:
            linestyle = ['solid'] * n_groups
    if labels is None:
        labels = list(ds.targets)

    # Plot extent: largest absolute value over the two plotted components,
    # with 20% extra room horizontally.
    mx = np.max(abs(boot[:, [x, y], :]))
    xmx = mx * 1.2
    shaft_width = .01 * xmx
    head_width = .05 * xmx
    plt.arrow(-xmx, 0, 2 * xmx, 0, color='gray', alpha=.7, width=shaft_width,
              head_width=head_width, length_includes_head=True)
    plt.arrow(0, -mx, 0, mx * 2, color='gray', alpha=.7, width=shaft_width,
              head_width=head_width, length_includes_head=True)

    # Loop invariants: chi-square scale for the requested level and the unit
    # circle sampled at 128 angles.
    scale = chi2.ppf(ci, 2)
    theta = np.linspace(0, 2 * np.pi, 128)
    unit_circle = np.column_stack((np.cos(theta), np.sin(theta)))

    for g in range(n_groups):
        points = np.column_stack((boot[g, x, :], boot[g, y, :]))
        center = np.mean(points, 0)
        eigvals, rot = np.linalg.eigh(np.cov(points.T))

        # Semi-axis lengths corresponding to the confidence level
        a = np.sqrt(eigvals[0] * scale)
        b = np.sqrt(eigvals[1] * scale)

        # Scale the circle, rotate it into the eigenbasis, move it to the
        # centre, then repeat the first vertex to close the outline.
        # Plain ndarrays replace np.mat, which NumPy 2.0 removed.
        coords = (unit_circle * [a, b]).dot(rot.T) + center
        outline = np.vstack((coords, coords[:1]))

        plt.plot(outline[:, 0], outline[:, 1],
                 c=cmap[g], ls=linestyle[g], **kwargs)
        if scat:
            plt.scatter(points[:, 0], points[:, 1], c=cmap[g])

        if not nude:
            plt.annotate(labels[g], xy=(center[0], center[1]))
            # Global rcParams toggle kept exactly where the original had it;
            # it is switched off again in the annotation block below.
            mpl.rcParams['text.usetex'] = True

    if not nude:
        # Raw strings keep the LaTeX backslashes without invalid escapes.
        plt.text(-xmx, .05 * mx, r'$\lambda = %s$' % np.round(sts.eigv[x], 2))
        plt.text(xmx * .05, mx * .9, r'$\lambda = %s$' % np.round(sts.eigv[y], 2))
        tau = r'$\tau = $'
        perc = r'$\%$'
        mpl.rcParams['text.usetex'] = False
        plt.text(-xmx, -.1 * mx, '%s $%s$%s' %
                 (tau, np.round(100 * sts.inertia[x], 0), perc))
        plt.text(xmx * .05, mx * .8, '%s $%s$%s' %
                 (tau, np.round(100 * sts.inertia[y], 0), perc))
        # Component numbers (1-based) next to the axis arrows
        plt.text(-.15 * xmx, .8 * mx, '$%s$' % (y + 1), fontsize=20)
        plt.text(xmx * .85, -mx * .2, '$%s$' % (x + 1), fontsize=20)
    plt.axis('scaled')
    plt.axis([-xmx, xmx, -mx, mx])
    plt.axis(axes)
    return f.number
# Rescale the delay indices by 1e5 / 1e6 (i.e. a factor of 0.1).
timeindex = (delayindex*1e5)/(1e6)

# One row per entry of timestep_arr (at most 34), 500 values per row.
PEs_1 = np.zeros([34,500])
SCs_1 = np.zeros([34,500])
for file in np.arange(len(timestep_arr)):
    fileheader = 'PE_SC_IDdatabase_Type_1_data_3000_499_delays_3227orbits_'+str(timestep_arr[file])+'_timesteps'
    datafile = loadnpzfile(datadir+fileheader+npy)
    PEs_1[file,:]=datafile['PEs']
    SCs_1[file,:]=datafile['SCs']



# ncolors RGBA colours (alpha = 1) sampled evenly from the colormap.
# cm.spectral was removed in Matplotlib 2.2; nipy_spectral is the same
# colormap under its current name.
ncolors=6
colors = np.zeros([ncolors,4])
for i in np.arange(ncolors):
    c = cm.nipy_spectral(i/float(ncolors),1)
    colors[i,:]=c
# Marker styles available for the plotted series.
points = ['o','v','s','p','*','h','^','D','+','>','H','d','x','<']

# Global Matplotlib styling: thick axes/ticks, small borderless markers.
plt.rc('axes',linewidth=2.0)
plt.rc('xtick.major',width=2.0)
plt.rc('ytick.major',width=2.0)
plt.rc('xtick.minor',width=2.0)
plt.rc('ytick.minor',width=2.0)
# A second, identical markersize/markeredgewidth call followed here in the
# original; it changed nothing and has been folded into this one.
plt.rc('lines',markersize=2,markeredgewidth=0.0,linewidth=2.0)
#plt.rcParams['ps.fonttype'] = 42
#plt.rcParams['pdf.fonttype'] = 42

fig=plt.figure(num=1,figsize=(7,9),dpi=600,facecolor='w',edgecolor='k')
left  = 0.16  # the left side of the subplots of the figure
Beispiel #41
0
SCs600 = datafile['SCs']

fileheader = 'Data_sine700period_ranphasestart_249_delays'
datafile = loadnpzfile(datadir+fileheader+npy)
PEs700 = datafile['PEs']
SCs700 = datafile['SCs']

#fileheader = 'PE_SC_sinewave_249_delays'
#datafile = loadnpzfile(datadir+fileheader+npy)
#PEsin = datafile['PEs']
#SCsin = datafile['SCs']
"""

# 5 RGBA colours (alpha = 1) sampled evenly from the colormap.
# cm.spectral was removed in Matplotlib 2.2; nipy_spectral is the same
# colormap under its current name.
colors = np.zeros([5, 4])
for i in np.arange(5):
    c = cm.nipy_spectral(i / 5., 1)
    colors[i, :] = c
# Marker styles available for the plotted series.
points = ['o', 'v', 's', 'p', '*', 'h', '^', 'D', '+', '>', 'H', 'd', 'x', '<']

# Global Matplotlib styling: thick axes/ticks, small borderless markers.
plt.rc('axes', linewidth=2.0)
plt.rc('xtick.major', width=2.0)
plt.rc('ytick.major', width=2.0)
plt.rc('xtick.minor', width=2.0)
plt.rc('ytick.minor', width=2.0)
plt.rc('lines', markersize=2, markeredgewidth=0.0, linewidth=2.0)

fig = plt.figure(num=1, figsize=(7, 6), dpi=600, facecolor='w', edgecolor='k')
left = 0.15  # the left side of the subplots of the figure
right = 0.94  # the right side of the subplots of the figure
bottom = 0.1  # the bottom of the subplots of the figure
top = 0.96  # the top of the subplots of the figure
SCs850old = datafile['SCs']

fileheader = 'PE_SC_DavidData_Class_3_249_delays_900_timesteps'
datafile = loadnpzfile(datadir+fileheader+npy)
PEs900old = datafile['PEs']
SCs900old = datafile['SCs']

fileheader = 'PE_SC_DavidData_Class_3_249_delays_950_timesteps'
datafile = loadnpzfile(datadir+fileheader+npy)
PEs950old = datafile['PEs']
SCs950old = datafile['SCs']
"""

# 19 RGBA colours (alpha = 1) sampled evenly from the colormap.
# cm.spectral was removed in Matplotlib 2.2; nipy_spectral is the same
# colormap under its current name.
colors = np.zeros([19, 4])
for i in np.arange(19):
    c = cm.nipy_spectral(i / 19., 1)
    colors[i, :] = c
# Marker styles available for the plotted series.
points = ['o', 'v', 's', 'p', '*', 'h', '^', 'D', '+', '>', 'H', 'd', 'x', '<']

# Global Matplotlib styling: thin axes/ticks, small borderless markers.
plt.rc('axes', linewidth=0.75)
plt.rc('xtick.major', width=0.75)
plt.rc('ytick.major', width=0.75)
plt.rc('xtick.minor', width=0.75)
plt.rc('ytick.minor', width=0.75)
# The original set markersize=2 and immediately overrode it with 1.5;
# only the effective value is kept.
plt.rc('lines', markersize=1.5, markeredgewidth=0.0)

fig = plt.figure(num=1, figsize=(4, 3), dpi=300, facecolor='w', edgecolor='k')
left = 0.16  # the left side of the subplots of the figure
right = 0.94  # the right side of the subplots of the figure
bottom = 0.2  # the bottom of the subplots of the figure
Beispiel #43
0
def plot_clusters_silhouette(X,
                             cluster_labels,
                             n_clusters,
                             root='',
                             file_format='pdf'):
    """Plot the silhouette score for each cluster, given the distance matrix X.

    The per-sample silhouette values are computed with
    ``metric="precomputed"``, drawn as one sorted band per cluster with a
    dashed line at their mean, and the figure is saved to
    ``<root>/silhouette_analysis_<time>.<file_format>``.

    Parameters
    ----------
    X : array_like, shape [n_samples_a, n_samples_a]
        Distance matrix.
    cluster_labels : array_like
        List of integers which represents the cluster of the corresponding
        point in X. The size must be the same as a dimension of X. Bands are
        drawn for the labels ``0 .. n_clusters - 1``.
    n_clusters : int
        The number of clusters.
    root : str, optional
        The root path for the output creation
    file_format : ('pdf', 'png')
        Choose the extension for output images.
    """
    # The boolean mask `cluster_labels == i` below needs an ndarray; a plain
    # list would compare as a whole and yield a scalar.
    cluster_labels = np.asarray(cluster_labels)
    n_samples = len(X)

    # A single axes holding the silhouette plot
    fig, ax1 = plt.subplots(1, 1)
    fig.set_size_inches(20, 15)

    # The (n_clusters + 1) * 10 extra rows leave blank space between the
    # silhouette plots of individual clusters.
    ax1.set_ylim([0, n_samples + (n_clusters + 1) * 10])

    # Silhouette value of every sample; their mean is the overall score
    sample_silhouette_values = silhouette_samples(X,
                                                  cluster_labels,
                                                  metric="precomputed")
    silhouette_avg = np.mean(sample_silhouette_values)
    logging.info("Average silhouette_score: %.4f", silhouette_avg)

    y_lower = 10
    for i in range(n_clusters):
        # Silhouette values of the samples in cluster i, sorted ascending
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        # cm.spectral was removed in Matplotlib 2.2; nipy_spectral is the
        # same colormap under its current name.
        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0,
                          ith_cluster_silhouette_values,
                          facecolor=color,
                          edgecolor=color,
                          alpha=0.7)

        # Leave a 10-row gap before the next cluster
        y_lower = y_upper + 10

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("silhouette coefficient values")
    ax1.set_ylabel("cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.6, -0.4, -0.2, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # The title's opening parenthesis was never closed in the original.
    plt.suptitle(("Silhouette analysis (n_clusters {}, avg score {:.4f}, "
                  "tot Igs {})".format(n_clusters, silhouette_avg, n_samples)),
                 fontsize=14,
                 fontweight='bold')
    filename = os.path.join(
        root, 'silhouette_analysis_{}.{}'.format(extra.get_time(),
                                                 file_format))
    fig.savefig(filename)
    logging.info('Figured saved %s', filename)
Beispiel #44
0
    """
    cluster_id = 1
    n_points = m.shape[1]
    classifications = [UNCLASSIFIED] * n_points
    for point_id in range(0, n_points):
        point = m[:,point_id]
        if classifications[point_id] == UNCLASSIFIED:
            if _expand_cluster(m, classifications, point_id, cluster_id, eps, min_points):
                cluster_id = cluster_id + 1
    return classifications

# Points are stored as columns: row 0 holds x, row 1 holds y, matching
# dbscan's use of m.shape[1] as the number of points.
a = np.array([np.array(x),np.array(y)])
import matplotlib.pyplot as pyplot
o = dbscan(a,0.5,4)
# One label per point, in input order.
unique_labels = np.array(o)
# dbscan hands out cluster ids starting at 1, so the largest label is used
# as the cluster count.
# NOTE(review): this relies on the unclassified/noise markers being numeric
# and smaller than 1, and k == 0 would divide by zero below -- confirm.
n_clusters_ = np.max(o)
k = n_clusters_

# cm.spectral was removed in Matplotlib 2.2; nipy_spectral is the same
# colormap under its current name.
colors = cm.nipy_spectral(unique_labels.astype(float) / k)
# NOTE(review): X is not defined in the visible part of this script; it
# should be the (n_points, 2) counterpart of `a` -- confirm.
pyplot.scatter(X[:, 0], X[:, 1], marker='.', s=40, lw=0, alpha=0.7,c=colors)
pyplot.title("The visualization of the clustered data.")
pyplot.xlabel("X")
pyplot.ylabel("Y")

pyplot.suptitle(("DBSCAN clustering with n_clusters = %d" % k),fontsize=14, fontweight='bold')

print("The number of clusters are ",k)
pyplot.savefig("plot11.png")
    # Compute the silhouette value of every individual sample
    sample_silhouette_values = silhouette_samples(X, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        # Silhouette values of the samples in cluster i, sorted ascending
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        # cm.spectral was removed in Matplotlib 2.2; nipy_spectral is the
        # same colormap under its current name.
        color = cm.nipy_spectral(float(i) / n_clusters)
        # Draw on the grid cell selected by (k, t)
        axarr[k, t].fill_betweenx(np.arange(y_lower, y_upper),
                                  0,
                                  ith_cluster_silhouette_values,
                                  facecolor=color,
                                  edgecolor=color,
                                  alpha=0.7)

        # Leave a 10-row gap before the next cluster
        y_lower = y_upper + 10

    # `font` is passed positionally, i.e. as set_title's fontdict argument
    axarr[k, t].set_title("K = %d , AVGS = %f" % (n_clusters, silhouette_avg),
                          font)
            # The (k + 1) * 10 extra rows leave blank space between the
            # silhouette plots of individual clusters.
            ax1.set_ylim([0, len(X) + (k + 1) * 10])

            y_lower = 10
            for i in range(k):
                # Silhouette values of the samples in cluster i, sorted
                # ascending
                ith_cluster_silhouette_values = sample_silhouette_values[
                    cluster_labels == i]

                ith_cluster_silhouette_values.sort()

                size_cluster_i = ith_cluster_silhouette_values.shape[0]
                y_upper = y_lower + size_cluster_i

                # cm.spectral was removed in Matplotlib 2.2; nipy_spectral is
                # the same colormap under its current name.
                color = cm.nipy_spectral(float(i) / k)
                ax1.fill_betweenx(np.arange(y_lower, y_upper),
                                  0,
                                  ith_cluster_silhouette_values,
                                  facecolor=color,
                                  edgecolor=color,
                                  alpha=0.7)

                # Label the silhouette plots with their cluster numbers at the middle
                ax1.text(-0.1, y_lower + 0.5 * size_cluster_i, str(i))

                # Leave a 10-row gap before the next cluster
                y_lower = y_upper + 10

            ax1.set_title("Silhouette for k = " + str(k) +
                          "\n(average silhouette coefficient dashed red line)")
Beispiel #47
0
        np.savetxt(cfg['outName'] + '_Hz.csv', Hz)
        np.savetxt(cfg['outName'] + '_Sil.csv', Sils[base])
        np.savetxt(cfg['outName'] + '_Ns.csv', Ns[base])

    mis[base] = MI
    nmis[base] = MI / np.sqrt(Hz * Hgt)
    vMeasures[base] = 2. * MI / (Hz + Hgt)
#  print mis[base].shape, nmis[base].shape, vMeasures[base].shape
#  print mis[base]
#  print nmis[base]
#  print Ns[base]
#  print Sils[base]

print "done with the runs"

cl = cm.spectral(np.arange(255))
I = len(bases) + 1

if 'spkm' in bases and 'DpvMFmeans' in bases:
    indSpkm = np.ones(len(paramBase['spkm']), dtype=bool)
    indSpkm[Ns['spkm'].mean(axis=1) < Ns['DPvMFmeans'].min()] = False
    indSpkm[Ns['spkm'].mean(axis=1) > Ns['DPvMFmeans'].max()] = False

    paramBase['spkm'] = paramBase['spkm'][indSpkm]
    nmis['spkm'] = nmis['spkm'][indSpkm, :]
    mis['spkm'] = mis['spkm'][indSpkm, :]
    Ns['spkm'] = Ns['spkm'][indSpkm, :]
    Sils['spkm'] = Sils['spkm'][indSpkm, :]

if 'DirvMF' in bases:
    print "DirvMF NMI:        {} +- {}".format(nmis['DirvMF'].mean(),
Beispiel #48
0
def silhouette_test(X,
                    kmeans,
                    n_clusters,
                    numsegs,
                    segsize,
                    summaryonly,
                    display=False):
    """Compute silhouette statistics over consecutive segments of ``X``.

    ``X`` is labelled once with ``kmeans.predict`` and then cut into
    ``numsegs`` consecutive slices of ``segsize`` rows.  A segment is only
    scored when every one of the ``n_clusters`` labels occurs in it;
    otherwise its entries in both returned arrays stay zero.

    Parameters
    ----------
    X : sliceable collection of samples, passed to ``kmeans.predict``.
    kmeans : object exposing ``predict(X)`` that returns integer labels.
    n_clusters : number of distinct labels expected in each segment.
    numsegs : number of segments to evaluate.
    segsize : number of rows per segment.
    summaryonly : if true, only the per-segment average score is computed.
    display : if true (and ``summaryonly`` is false), show a silhouette
        plot for every scored segment.

    Returns
    -------
    thesilavgs : ndarray of shape ``(numsegs,)``
        Average silhouette score per segment.
    thesilclusterstats : ndarray of shape ``(numsegs, 4, n_clusters)``
        Per segment and cluster: mean, median, minimum and maximum of the
        per-sample silhouette values (filled only when ``summaryonly`` is
        false).
    """
    print('generating cluster labels')
    cluster_labels = kmeans.predict(X)
    thesilavgs = np.zeros(numsegs, dtype='float')
    thesilclusterstats = np.zeros((numsegs, 4, n_clusters), dtype='float')
    print('calculating silhouette stats')
    for segment in range(numsegs):
        seg = slice(segment * segsize, (segment + 1) * segsize)
        seg_X = X[seg]
        seg_cluster_labels = cluster_labels[seg]

        # Sanity check: the silhouette is only meaningful if every label is
        # represented in this segment.
        clusternums = np.bincount(seg_cluster_labels, minlength=n_clusters)
        if np.min(clusternums) <= 0:
            print('states are not fully populated - skipping stats')
            continue

        thesilavgs[segment] = metrics.silhouette_score(
            seg_X, seg_cluster_labels)
        print('average silhouette score for segment', segment, '=',
              thesilavgs[segment])

        if summaryonly:
            continue

        print('doing silhouette samples')
        sample_silhouette_values = metrics.silhouette_samples(
            seg_X, seg_cluster_labels)

        if display:
            # A single axes holding the silhouette plot
            fig, ax1 = plt.subplots(1, 1)
            fig.set_size_inches(8, 4.5)

            # The silhouette coefficient can range from -1 to 1; the plot
            # is cropped to [-0.3, 1].
            ax1.set_xlim([-0.3, 1])
            # The (n_clusters+1)*10 is for inserting blank space between
            # silhouette plots of individual clusters.
            ax1.set_ylim([0, len(seg_X) + (n_clusters + 1) * 10])

            y_lower = 10

        for i in range(n_clusters):
            # Sorted silhouette values of the samples assigned to cluster i
            ith_cluster_silhouette_values = np.sort(
                sample_silhouette_values[seg_cluster_labels == i])

            thesilclusterstats[segment, 0, i] = np.mean(
                ith_cluster_silhouette_values)
            thesilclusterstats[segment, 1, i] = np.median(
                ith_cluster_silhouette_values)
            # Sorted ascending, so first/last are the minimum/maximum
            thesilclusterstats[segment, 2, i] = \
                ith_cluster_silhouette_values[0]
            thesilclusterstats[segment, 3, i] = \
                ith_cluster_silhouette_values[-1]

            if not display:
                continue

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i
            # cm.spectral was removed from Matplotlib; nipy_spectral is the
            # same colormap under its current name.
            color = cm.nipy_spectral(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0,
                              ith_cluster_silhouette_values,
                              facecolor=color,
                              edgecolor=color,
                              alpha=0.7)

            # Label the silhouette plots with their cluster numbers at the
            # middle
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # Leave a gap of 10 rows before the next cluster's band
            y_lower = y_upper + 10

        if display:
            ax1.set_title("The silhouette plot for the various clusters.")
            ax1.set_xlabel("The silhouette coefficient values")
            ax1.set_ylabel("Cluster label")

            # The vertical line for average silhouette score of all the
            # values
            ax1.axvline(x=thesilavgs[segment], color="red", linestyle="--")

            ax1.set_yticks([])  # Clear the yaxis labels / ticks
            ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
            plt.suptitle((
                "Silhouette analysis for KMeans clustering on sample data "
                "with n_clusters = %d" % n_clusters),
                         fontsize=14,
                         fontweight='bold')

            plt.show()
    return thesilavgs, thesilclusterstats
def computeSilhouette(appMatrixFile):
    """Show KMeans silhouette plots for a pickled application matrix.

    The matrix stored in ``appMatrixFile`` is unpickled, converted to the
    matrix of pairwise Manhattan distances between its rows, and that
    distance matrix is clustered with KMeans for 2, 3, 4 and 5 clusters.
    For each cluster count a figure with the silhouette plot (left) and a
    scatter of the first two columns of the clustered matrix (right) is
    shown.

    Parameters
    ----------
    appMatrixFile : path of the pickle file holding the matrix.

    Returns
    -------
    None
    """
    # SECURITY: unpickling executes arbitrary code - only load files from a
    # trusted source.
    with open(appMatrixFile, 'rb') as handle:
        appMatrix = cPickle.load(handle)
    newAppMatrix = np.array(appMatrix)

    # Pairwise distance between the rows of the input.  Other metrics
    # accepted by sklearn.metrics.pairwise.pairwise_distances include
    # 'cityblock', 'cosine', 'euclidean', 'l1', 'l2' (sparse-capable) and
    # the scipy.spatial.distance metrics such as 'chebyshev', 'hamming',
    # 'jaccard', 'minkowski', 'sqeuclidean' (dense input only).
    # NOTE(review): the rows of this distance matrix are subsequently used
    # as feature vectors for KMeans and for a euclidean silhouette - confirm
    # this is intended rather than metric='precomputed'.
    X = pairwise_distances(newAppMatrix, metric='manhattan', n_jobs=4)

    startingNumberOfClusters = 2
    endingNumberOfClusters = 6

    for n_clusters in range(startingNumberOfClusters, endingNumberOfClusters):
        # Create a subplot with 1 row and 2 columns
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        # The 1st subplot is the silhouette plot.  The silhouette
        # coefficient can range from -1 to 1; the plot is cropped to
        # [-0.1, 1].
        ax1.set_xlim([-0.1, 1])
        # The (n_clusters+1)*10 is for inserting blank space between
        # silhouette plots of individual clusters.
        ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

        # Fixed random_state for reproducibility.
        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        cluster_labels = clusterer.fit_predict(X)

        # The silhouette_score gives the average value for all the samples.
        silhouette_avg = silhouette_score(X, cluster_labels,
                                          metric='euclidean')
        # Lazy %-formatting: the previous string concatenation raised
        # TypeError because the operands are numbers.
        logging.debug(
            'For n_clusters = %s The average silhouette_score is : %s',
            n_clusters, silhouette_avg)

        # Compute the silhouette scores for each sample
        sample_silhouette_values = silhouette_samples(X, cluster_labels,
                                                      metric='euclidean')

        y_lower = 10
        for i in range(n_clusters):
            # Sorted silhouette values of the samples in cluster i
            ith_cluster_silhouette_values = np.sort(
                sample_silhouette_values[cluster_labels == i])

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            # cm.spectral was removed from Matplotlib; nipy_spectral is the
            # same colormap under its current name.
            color = cm.nipy_spectral(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0, ith_cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)

            # Label the silhouette plots with their cluster numbers at the
            # middle
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # Leave a gap of 10 rows before the next cluster's band
            y_lower = y_upper + 10

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # The vertical line for average silhouette score of all the values
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # 2nd Plot showing the actual clusters formed
        colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
        ax2.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                    c=colors)

        # Labeling the clusters
        centers = clusterer.cluster_centers_
        # Draw white circles at cluster centers
        ax2.scatter(centers[:, 0], centers[:, 1],
                    marker='o', c="white", alpha=1, s=200)

        for i, c in enumerate(centers):
            ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50)

        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel("Feature space for the 2nd feature")

        plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                      "with n_clusters = %d" % n_clusters),
                     fontsize=14, fontweight='bold')

        plt.show()
Beispiel #50
0
import matplotlib.cm as cm
import numpy as np


# Load a headerless, whitespace-delimited table.  ``sep=r"\s+"`` is the
# supported spelling of the deprecated ``delim_whitespace=True``.
dataset = pd.read_csv("dataset2.txt", header=None, sep=r"\s+")

# NOTE(review): with header=None the column labels are the integers 0..n-1,
# so ``dataset.columns[0] - 1`` is -1 and the expression below is the ROW
# slice ``dataset[0:-1]``: it drops the last row and keeps every column.
# If the intent was "all columns except the last", this needs
# ``dataset.iloc[:, :-1]`` - confirm against the layout of dataset2.txt.
X = np.array(dataset[0:dataset.columns[0]-1])
# Last column of the table (not used further in this script).
y = np.array(dataset[dataset.columns[dataset.shape[1]-1]])

k = 3
# Fixed random_state so the clustering is reproducible.
clusterer = KMeans(n_clusters=k, random_state=10)
cluster_labels = clusterer.fit_predict(X)

# One colour per sample, chosen by its cluster label.  cm.spectral was
# removed from Matplotlib; nipy_spectral is the same colormap.
colors = cm.nipy_spectral(cluster_labels.astype(float) / k)
plt.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7, c=colors)

# White discs at the cluster centres, overlaid with their 1-based index.
centers = clusterer.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], marker='o', c="white", alpha=1, s=200)

for i, c in enumerate(centers):
    plt.scatter(c[0], c[1], marker='$%d$' % (i+1), alpha=1, s=50)

plt.title("The visualization of the clustered data.")
plt.xlabel("X")
plt.ylabel("Y")

plt.suptitle(("KMeans clustering with n_clusters = %d" % k), fontsize=14, fontweight='bold')

plt.savefig("plot2.png")
def visualize(df, cluster_labels, n_clusters, n_iterations):
    """Show a silhouette plot and a 2-D MDS embedding of clustered data.

    Parameters
    ----------
    df : data passed both to ``manifold.MDS.fit_transform`` (for the 2-D
        embedding) and to ``silhouette_samples``.
    cluster_labels : sequence of integer cluster labels, one per sample.
    n_clusters : number of clusters; labels ``0 .. n_clusters - 1`` are
        plotted.
    n_iterations : zero-based iteration counter; ``n_iterations + 1`` is
        shown in the figure title.

    Returns
    -------
    None
    """
    # Dimension for visualization
    target_dimension = 2
    cluster_labels = np.array(cluster_labels)
    mds = manifold.MDS(target_dimension, max_iter=100, n_init=1)
    X = mds.fit_transform(df)

    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The 1st subplot is the silhouette plot
    ax1.set_xlim([-0.1, 1])
    # (n_clusters + 1) * 10 reserves blank rows between the cluster bands
    ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(df, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        # Sorted silhouette values of the samples in cluster i, selected
        # with a boolean mask instead of a per-element Python loop.
        ith_cluster_silhouette_values = np.sort(
            sample_silhouette_values[cluster_labels == i])

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        # cm.spectral was removed from Matplotlib; nipy_spectral is the
        # same colormap under its current name.
        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0,
                          ith_cluster_silhouette_values,
                          facecolor=color,
                          edgecolor=color,
                          alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Leave a gap of 10 rows before the next cluster's band
        y_lower = y_upper + 10

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # 2nd Plot showing the actual clusters formed
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(X[:, 0],
                X[:, 1],
                marker='.',
                s=30,
                lw=0,
                alpha=0.7,
                c=colors,
                edgecolor='k')

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = {} and n_iterations = {}").format(
                      n_clusters, n_iterations + 1),
                 fontsize=14,
                 fontweight='bold')

    plt.show()
Beispiel #52
0
def Cluster(X):
    """Cluster ``X`` into two groups with KMeans and plot the silhouette.

    A figure with the silhouette plot (left) and a scatter of the first two
    columns of ``X`` coloured by cluster (right) is shown.

    Parameters
    ----------
    X : 2-D array of samples; columns 0 and 1 are used for the scatter.

    Returns
    -------
    ndarray of int
        The KMeans cluster label of every row of ``X``.
    """
    range_n_clusters = [2]

    for n_clusters in range_n_clusters:
        # Create a subplot with 1 row and 2 columns
        fig, (ax1, ax2) = plt.subplots(1, 2)

        # The 1st subplot is the silhouette plot.  The silhouette
        # coefficient can range from -1 to 1; the plot is cropped to
        # [-0.1, 1].
        ax1.set_xlim([-0.1, 1])
        # The (n_clusters+1)*10 is for inserting blank space between
        # silhouette plots of individual clusters.
        ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

        # Fixed random_state for reproducibility.
        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        cluster_labels = clusterer.fit_predict(X)

        # The silhouette_score gives the average value for all the samples.
        silhouette_avg = silhouette_score(X, cluster_labels)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)

        # Compute the silhouette scores for each sample
        sample_silhouette_values = silhouette_samples(X, cluster_labels)

        y_lower = 10
        for i in range(n_clusters):
            # Sorted silhouette values of the samples in cluster i
            ith_cluster_silhouette_values = np.sort(
                sample_silhouette_values[cluster_labels == i])

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            # cm.spectral was removed from Matplotlib; nipy_spectral is the
            # same colormap under its current name.
            color = cm.nipy_spectral(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0,
                              ith_cluster_silhouette_values,
                              facecolor=color,
                              edgecolor=color,
                              alpha=0.7)

            # Label the silhouette plots with their cluster numbers at the
            # middle
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # Leave a gap of 10 rows before the next cluster's band
            y_lower = y_upper + 10

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # The vertical line for average silhouette score of all the values
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # 2nd Plot showing the actual clusters formed
        colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
        ax2.scatter(X[:, 0],
                    X[:, 1],
                    marker='.',
                    s=30,
                    lw=0,
                    alpha=0.7,
                    c=colors)

        # Labeling the clusters
        centers = clusterer.cluster_centers_
        # Draw white circles at cluster centers
        ax2.scatter(centers[:, 0],
                    centers[:, 1],
                    marker='o',
                    c="white",
                    alpha=1,
                    s=200)

        for i, c in enumerate(centers):
            ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50)

        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("1st principal component")
        ax2.set_ylabel("2nd principal component")

        plt.suptitle(
            ("Silhouette analysis for KMeans clustering on pca scores "
             "with n_clusters = %d" % n_clusters),
            fontsize=14,
            fontweight='bold')

        plt.show()

    # range_n_clusters holds a single entry, so these are its labels.
    return cluster_labels.astype(int)
# All entries of the 'SCs' array except the first.
vz_SC = datafile['SCs'][1:]

#datadir = 'C:\\Users\\dschaffner\\OneDrive - brynmawr.edu\\Corrsin Wind Tunnel Data\\NPZ_files\\m50_5mm\\streamwise\\'
#fileheader = 'PE_SC_m50_5mm_embed6'
#datafile = loadnpzfile(datadir+fileheader+npy)
#PEs2 = datafile['PEs']
#SCs2 = datafile['SCs']

#PEs2 = np.mean(PEs2,axis=1)
#SCs2 = np.mean(SCs2,axis=1)

#taus = datafile['taus']

# Seven opaque RGBA colours spread evenly over the colormap.
# cm.spectral was removed from Matplotlib; nipy_spectral is the same
# colormap under its current name.
colors = np.zeros([7,4])
for i in np.arange(7):
    c = cm.nipy_spectral(i/7.,1)
    colors[i,:]=c
points = ['o','v','s','p','*','h','^','D','+','>','H','d','x','<']

# Thin axes and tick lines.
plt.rc('axes',linewidth=0.75)
plt.rc('xtick.major',width=0.75)
plt.rc('ytick.major',width=0.75)
plt.rc('xtick.minor',width=0.75)
plt.rc('ytick.minor',width=0.75)

# Small markers without an edge (an earlier duplicate of this call with
# markersize=2 was immediately overwritten by this one and has been dropped).
plt.rc('lines',markersize=1.5,markeredgewidth=0.0)
fig=plt.figure(num=1,figsize=(3.5,3.5),dpi=300,facecolor='w',edgecolor='k')
left  = 0.2  # the left side of the subplots of the figure
right = 0.94    # the right side of the subplots of the figure
bottom = 0.2  # the bottom of the subplots of the figure
def cluster_crimes_k_means(_data, _grid, _k, _n, _plot=True):
    """Cluster crime locations with KMeans seeded from grid-cell centroids.

    Parameters
    ----------
    _data : frame with columns 'X', 'Y' and a ``geometry`` attribute.  A
        'cluster' column holding the assigned label is written into it
        (the caller's object is modified).
    _grid : frame with columns 'centroid_x' and 'centroid_y'; must have
        exactly ``_k**2`` rows, which are used as initial cluster centres.
    _k : grid side length; ``_k**2`` clusters are formed.
    _n : value passed to KMeans as ``n_jobs``; capped at 128.
    _plot : if true, show the silhouette plot and a scatter of the
        clustered points.

    Returns
    -------
    _grid_clustered : GeoDataFrame with one convex hull per cluster.
    _labels : array with the cluster label of every row of ``_data``.

    Raises
    ------
    ValueError
        If ``_k**2`` differs from the number of rows of ``_grid``.
    """
    if _n > 128:
        _n = 128
        print("n was too big. Set to 128.")
    _n_clusters = _k**2
    if _n_clusters != len(_grid.index):
        raise ValueError('parameter k and number of cells in grid does not match')
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # equivalent that works on old and new pandas alike.
    # initialize crime scenes as matrix
    _points = _data[['X', 'Y']].values
    # initialize cell centroids as matrix
    _init_center = _grid[["centroid_x", "centroid_y"]].values

    print("K-Means started...")
    # NOTE(review): recent scikit-learn releases no longer accept the
    # n_jobs argument of KMeans - confirm the targeted scikit-learn version.
    _clusterer = KMeans(n_clusters=_n_clusters, init=_init_center,
                        random_state=101, n_jobs=_n)
    _labels = _clusterer.fit_predict(_points)

    _data['cluster'] = _labels
    # Convex hull around the member geometries of every cluster
    _grid_clustered_list = [
        MultiPoint(list(_data.geometry[_data.cluster == _i])).convex_hull
        for _i in range(_n_clusters)
    ]
    _grid_clustered = gpd.GeoDataFrame(
        _grid_clustered_list, columns=['geometry']).set_geometry('geometry')

    # The silhouette_score gives the average value for all the samples.
    _silhouette_avg = silhouette_score(_points, _labels)
    print("For k =", _k)
    print("The average silhouette_score is :", _silhouette_avg)

    if _plot:
        # Create a subplot with 1 row and 2 columns
        _fig, (_ax1, _ax2) = plt.subplots(1, 2)
        _fig.set_size_inches(36, 7)

        # The 1st subplot is the silhouette plot.  The silhouette
        # coefficient can range from -1 to 1; the plot is cropped to
        # [-0.1, 1].
        _ax1.set_xlim([-0.1, 1])
        # The (n_clusters+1)*10 is for inserting blank space between
        # silhouette plots of individual clusters.
        _ax1.set_ylim([0, len(_points) + (_n_clusters + 1) * 10])

        # Compute the silhouette scores for each sample
        _sample_silhouette_values = silhouette_samples(_points, _labels)
        _y_lower = 10
        for i in range(_n_clusters):
            # Sorted silhouette values of the samples in cluster i
            _ith_cluster_silhouette_values = np.sort(
                _sample_silhouette_values[_labels == i])

            _size_cluster_i = _ith_cluster_silhouette_values.shape[0]
            _y_upper = _y_lower + _size_cluster_i

            # cm.spectral was removed from Matplotlib; nipy_spectral is
            # the same colormap under its current name.
            _color = cm.nipy_spectral(float(i) / _n_clusters)
            _ax1.fill_betweenx(np.arange(_y_lower, _y_upper),
                               0, _ith_cluster_silhouette_values,
                               facecolor=_color, edgecolor=_color, alpha=0.7)

            # Label the silhouette plots with their cluster numbers at the
            # middle
            _ax1.text(-0.05, _y_lower + 0.5 * _size_cluster_i, str(i))

            # Leave a gap of 10 rows before the next cluster's band
            _y_lower = _y_upper + 10

        _ax1.set_title("The silhouette plot for {} clusters.".format(_n_clusters))
        _ax1.set_xlabel("The silhouette coefficient values")
        _ax1.set_ylabel("Cluster label")

        # The vertical line for average silhouette score of all the values
        _ax1.axvline(x=_silhouette_avg, color="red", linestyle="--")

        _ax1.set_yticks([])  # Clear the yaxis labels / ticks
        _ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # 2nd Plot showing the actual clusters formed
        _colors = cm.nipy_spectral(_labels.astype(float) / _n_clusters)
        _ax2.scatter(_points[:, 0], _points[:, 1], marker='.', s=30, lw=0,
                     alpha=0.7, c=_colors)

        # Labeling the clusters
        _centers = _clusterer.cluster_centers_
        # Draw white circles at cluster centers
        _ax2.scatter(_centers[:, 0], _centers[:, 1],
                     marker='o', c="white", alpha=1, s=200)

        for i, c in enumerate(_centers):
            _ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50)

        _ax2.set_title("The visualization of the clustered data.")
        _ax2.set_xlabel("X")
        _ax2.set_ylabel("Y")

        plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                      "with k = %d" % _k),
                     fontsize=14, fontweight='bold')

        plt.show()

    return _grid_clustered, _labels
def silhouette_analyze(dataframe, cluster_type='KMeans', n_clusters=None):
    """Plot a silhouette analysis of ``dataframe`` for several cluster counts.

    For every cluster count a model is obtained from
    ``utils.get_model_obj(cluster_type, n_clusters=...)``, fitted, and a
    figure with the silhouette plot (left) and a scatter of the first two
    columns coloured by cluster (right) is shown.  Finally the average
    silhouette score per cluster count is drawn with ``plotter.lineplot``.

    Parameters
    ----------
    dataframe : tabular data convertible with ``numpy.asarray``; columns 0
        and 1 are used for the scatter plot.
    cluster_type : model name forwarded to ``utils.get_model_obj``.
    n_clusters : iterable of cluster counts; defaults to
        ``range(2, 8, 2)`` when empty or ``None``.

    Raises
    ------
    AssertionError
        If ``n_clusters`` is not iterable.
    """
    # collections.Iterable was removed in Python 3.10; the ABC lives in
    # collections.abc.
    from collections.abc import Iterable

    if not n_clusters:
        n_clusters = range(2, 8, 2)
    # Validate before the heavy imports so a bad argument fails fast.  Kept
    # as an assert so callers see the same exception type as before.
    assert isinstance(n_clusters,
                      Iterable), "n_clusters must be an iterable object"

    from sklearn.metrics import silhouette_samples, silhouette_score

    import matplotlib.cm as cm
    import numpy as np

    # DataFrame.as_matrix() was removed in pandas 1.0.
    dataframe = np.asarray(dataframe)
    cluster_scores_df = pd.DataFrame(
        columns=['cluster_size', 'silhouette_score'])
    # Silhouette analysis --
    #       http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html
    #TODO: Add more clustering methods/types like say dbscan and others

    for j, cluster in enumerate(n_clusters):
        clusterer = utils.get_model_obj(cluster_type, n_clusters=cluster)
        # Create a subplot with 1 row and 2 columns
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        # The 1st subplot is the silhouette plot.  The silhouette
        # coefficient can range from -1 to 1; the plot is cropped to
        # [-0.1, 1].
        ax1.set_xlim([-0.1, 1])

        cluster_labels = clusterer.fit_predict(dataframe)

        # The silhouette_score gives the average value for all the samples.
        if len(cluster_labels) > 1:
            silhouette_avg = silhouette_score(dataframe, cluster_labels)
            cluster_scores_df.loc[j] = [cluster, silhouette_avg]
            print("For clusters =", cluster,
                  "The average silhouette_score is :", silhouette_avg)

            # Compute the silhouette scores for each sample
            sample_silhouette_values = silhouette_samples(
                dataframe, cluster_labels)

            y_lower = 10
            for i in range(cluster):
                # Sorted silhouette values of the samples in cluster i
                ith_cluster_silhouette_values = np.sort(
                    sample_silhouette_values[cluster_labels == i])

                size_cluster_i = ith_cluster_silhouette_values.shape[0]
                y_upper = y_lower + size_cluster_i

                # Normalise by the number of clusters of THIS run (not by
                # the number of candidate sizes) so the band colours match
                # the scatter colours below.  cm.spectral was removed from
                # Matplotlib; nipy_spectral is the same colormap.
                color = cm.nipy_spectral(float(i) / cluster)
                ax1.fill_betweenx(np.arange(y_lower, y_upper),
                                  0,
                                  ith_cluster_silhouette_values,
                                  facecolor=color,
                                  edgecolor=color,
                                  alpha=0.7)

                # Label the silhouette plots with their cluster numbers at
                # the middle
                ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

                # Leave a gap of 10 rows before the next cluster's band
                y_lower = y_upper + 10

            # The vertical line for average silhouette score of all the
            # values; only drawn when a score was actually computed for
            # this run (previously a NameError or a stale value).
            ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # 2nd Plot showing the actual clusters formed
        colors = cm.nipy_spectral(cluster_labels.astype(float) / cluster)
        ax2.scatter(dataframe[:, 0],
                    dataframe[:, 1],
                    marker='.',
                    s=30,
                    lw=0,
                    alpha=0.7,
                    c=colors)

        if hasattr(clusterer, 'cluster_centers_'):
            # Labeling the clusters
            centers = clusterer.cluster_centers_
            # Draw white circles at cluster centers
            ax2.scatter(centers[:, 0],
                        centers[:, 1],
                        marker='o',
                        c="white",
                        alpha=1,
                        s=200)

            for i, c in enumerate(centers):
                ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50)

        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel("Feature space for the 2nd feature")

        plt.suptitle(("Silhouette analysis for %s clustering on sample data "
                      "with clusters = %d" % (cluster_type, cluster)),
                     fontsize=14,
                     fontweight='bold')
        plt.show()

    plotter.lineplot(cluster_scores_df,
                     xcol='cluster_size',
                     ycol='silhouette_score')
Beispiel #56
0
def plot_kmeans(X_data, X_2d, two_d_transformer):
    """Show a silhouette plot and a 2-D scatter of clustered data.

    For each cluster count in the hard-coded list ``[20]`` a KMeans model
    and a Gaussian mixture model are fitted on ``X_data``.  The mixture
    model's labels drive the silhouette analysis and the scatter colours;
    the KMeans centres are projected with ``two_d_transformer`` and drawn
    as numbered markers on the scatter panel.

    Args:
        X_data: Samples in the original feature space; passed to both
            clusterers and to the silhouette functions.
        X_2d: Array whose columns 0 and 1 are used as scatter coordinates.
            It is coloured with the labels computed from ``X_data``, so it
            must be row-aligned with ``X_data``.
        two_d_transformer: Object whose ``transform`` method maps the KMeans
            centres into the coordinate system of ``X_2d``.

    Returns:
        None.  One figure per cluster count is displayed via ``plt.show()``.
    """
    from sklearn import mixture

    # ``cm.spectral`` was removed in matplotlib 2.2; ``nipy_spectral`` is the
    # same colormap under its new name.  Fall back for very old releases.
    spectral = getattr(cm, "nipy_spectral", None) or cm.spectral
    # ``mixture.GMM`` was removed in scikit-learn 0.20 in favour of
    # ``GaussianMixture``.  Fall back for very old releases.
    mixture_cls = getattr(mixture, "GaussianMixture", None) or mixture.GMM

    range_n_clusters = [20]

    for n_clusters in range_n_clusters:
        # One row, two columns: silhouette plot on the left, scatter on the
        # right.
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        # The silhouette coefficient can range from -1 to 1, but the plot is
        # restricted to [-0.1, 1].
        ax1.set_xlim([-0.1, 1])
        # The (n_clusters + 1) * 10 inserts blank space between the
        # silhouette plots of individual clusters, to demarcate them clearly.
        ax1.set_ylim([0, len(X_data) + (n_clusters + 1) * 10])

        # KMeans (seeded for reproducibility) is fitted only for its centres.
        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        clusterer.fit(X_data)

        # The labels actually analysed come from the mixture model.
        # ``covariance_type='diag'`` keeps the default of the removed ``GMM``
        # class (``GaussianMixture`` would otherwise default to 'full').
        # fit(...).predict(...) is available on both classes.
        # NOTE(review): labels come from the mixture model while the numbered
        # centres below come from KMeans, so the numbers drawn on the scatter
        # need not match the colours -- confirm this is intended.
        g = mixture_cls(n_components=n_clusters, covariance_type='diag')
        cluster_labels = g.fit(X_data).predict(X_data)

        # The silhouette_score gives the average value for all the samples.
        silhouette_avg = silhouette_score(X_data, cluster_labels)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)

        # Compute the silhouette scores for each sample
        sample_silhouette_values = silhouette_samples(X_data, cluster_labels)

        y_lower = 10
        for i in range(n_clusters):
            # Sorted silhouette scores of the samples assigned to cluster i.
            ith_cluster_silhouette_values = np.sort(
                sample_silhouette_values[cluster_labels == i])

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = spectral(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0, ith_cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)

            # Label the silhouette plots with their cluster numbers at the
            # middle.
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # Leave a gap of 10 before the next cluster's band.
            y_lower = y_upper + 10

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # The vertical line for average silhouette score of all the values
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # 2nd plot showing the actual clusters formed
        colors = spectral(cluster_labels.astype(float) / n_clusters)
        ax2.scatter(X_2d[:, 0], X_2d[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                    c=colors)

        # Project the KMeans centres into the 2-D space and draw them as
        # white circles with the centre index on top.
        centers = two_d_transformer.transform(clusterer.cluster_centers_)
        ax2.scatter(centers[:, 0], centers[:, 1],
                    marker='o', c="white", alpha=1, s=200)

        for i, c in enumerate(centers):
            ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50)

        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel("Feature space for the 2nd feature")

        plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                      "with n_clusters = %d" % n_clusters),
                     fontsize=14, fontweight='bold')

        plt.show()
Beispiel #57
0
    def silhouette(self, range_n_clusters, cluster_labelss):
        """Plot silhouette diagrams next to an MDS scatter of ``self.ndf``.

        For every cluster count a two-panel figure is shown: the left panel
        holds the per-cluster silhouette bands and the average score, the
        right panel a 2-D MDS embedding of ``self.ndf`` coloured by cluster.

        Args:
            range_n_clusters: Iterable of cluster counts to visualise.
            cluster_labelss: Indexable collection of per-sample label
                sequences; the labels for ``n`` clusters are read from
                index ``n - 2``.

        Returns:
            None.  Each figure is displayed via ``plt.show()``.

        Note:
            ``random_state`` is not a parameter of this method; it is looked
            up in the enclosing scope when the MDS model is built.
        """
        X = self.ndf

        # ``cm.spectral`` was removed in matplotlib 2.2; ``nipy_spectral`` is
        # the same colormap under its new name.  Fall back for old releases.
        spectral = getattr(cm, "nipy_spectral", None) or cm.spectral

        # The MDS layout depends only on X, not on the cluster count, so it
        # is computed once (lazily, on the first iteration) and reused.
        embedding = None

        for n_cluster in range_n_clusters:
            fig, (ax1, ax2) = plt.subplots(1, 2)
            fig.set_size_inches(12, 6)

            ax1.set_xlim([-0.1, 1])
            # (n_cluster + 1) * 10 leaves room for the gaps between bands.
            ax1.set_ylim([0, len(X) + (n_cluster + 1) * 10])

            # Coerce to an array so the ``== i`` masks below are element-wise
            # even when a plain list of labels is supplied.
            cluster_labels = np.asarray(cluster_labelss[n_cluster - 2])

            silhouette_avg = silhouette_score(X, cluster_labels)
            print("For n_clusters =", n_cluster,
                  "The average silhouette_score is :", silhouette_avg)

            # Compute the silhouette scores for each sample
            sample_silhouette_values = silhouette_samples(X, cluster_labels)

            y_lower = 10
            for i in range(n_cluster):
                # Sorted silhouette scores of the samples in cluster i.
                ith_cluster_silhouette_values = np.sort(
                    sample_silhouette_values[cluster_labels == i])

                size_cluster_i = ith_cluster_silhouette_values.shape[0]
                y_upper = y_lower + size_cluster_i

                color = spectral(float(i) / n_cluster)
                ax1.fill_betweenx(np.arange(y_lower, y_upper),
                                  0,
                                  ith_cluster_silhouette_values,
                                  facecolor=color,
                                  edgecolor=color,
                                  alpha=0.7)

                ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

                # Leave a gap of 10 before the next cluster's band.
                y_lower = y_upper + 10

            ax1.set_title("The silhouette plot for the various clusters.")
            ax1.set_xlabel("The silhouette coefficient values")
            ax1.set_ylabel("Cluster label")

            # The vertical line for average silhouette score of all the values
            ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

            ax1.set_yticks([])  # Clear the yaxis labels / ticks
            ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

            # 2-D MDS embedding of the pairwise Euclidean distances of X.
            if embedding is None:
                similarities = euclidean_distances(X)
                mds = manifold.MDS(n_components=2,
                                   max_iter=3000,
                                   eps=1e-9,
                                   random_state=random_state,
                                   dissimilarity="precomputed",
                                   n_jobs=1)
                embedding = mds.fit(similarities).embedding_

            for i in range(n_cluster):
                members = cluster_labels == i
                # A bare RGBA tuple passed as ``c`` is ambiguous (it can be
                # read as per-point values); a one-row list is unambiguous.
                ax2.scatter(embedding[members, 0],
                            embedding[members, 1],
                            c=[spectral(float(i) / n_cluster)])

            ax2.set_title("The visualization of the clustered data.")
            ax2.set_xlabel("Feature space for the 1st MDS feature")
            ax2.set_ylabel("Feature space for the 2nd MDS feature")

            plt.suptitle(
                ("Silhouette analysis for KMeans clustering on sample data "
                 "with n_clusters = %d" % n_cluster),
                fontsize=14,
                fontweight='bold')
            plt.show()
Beispiel #58
0
# Side-by-side comparison of KMeans and MiniBatchKMeans cluster assignments.
fig = pl.figure()
colors = ['#4EACC5', '#FF9C34', '#4E9A06']

# ``cm.spectral`` was removed in matplotlib 2.2; ``nipy_spectral`` is the same
# colormap under its new name.  Fall back to the old name on old releases.
spectral_cmap = getattr(cm, "nipy_spectral", None) or cm.spectral

# We want to have the same colors for the same cluster from the
# MiniBatchKMeans and the KMeans algorithm. Let's pair the cluster centers per
# closest one: order[k] is the index of the MiniBatchKMeans center nearest to
# KMeans center k (by squared Euclidean distance).

distance = euclidean_distances(k_means_cluster_centers,
                               mbk_means_cluster_centers,
                               squared=True)
order = distance.argmin(axis=1)

# KMeans: one color per cluster, members as dots, center as a larger
# black-edged circle.
ax = fig.add_subplot(1, 3, 1)
for k in range(n_clusters):
    col = spectral_cmap(float(k) / n_clusters, 1)
    my_members = k_means_labels == k
    cluster_center = k_means_cluster_centers[k]
    ax.plot(X[my_members, 0], X[my_members, 1], 'w',
            markerfacecolor=col, marker='.')
    ax.plot(cluster_center[0], cluster_center[1], 'o',
            markerfacecolor=col, markeredgecolor='k', markersize=6)
ax.set_title('KMeans')
pl.text(-3.5, 2.7, 'train time: %.2fs' % t_batch)

# MiniBatchKMeans: same drawing, but cluster k uses the matched center
# order[k] so that corresponding clusters share a color with the first panel.
ax = fig.add_subplot(1, 3, 2)
for k in range(n_clusters):
    col = spectral_cmap(float(k) / n_clusters, 1)
    my_members = mbk_means_labels == order[k]
    cluster_center = mbk_means_cluster_centers[order[k]]
    ax.plot(X[my_members, 0], X[my_members, 1], 'w',
            markerfacecolor=col, marker='.')
    ax.plot(cluster_center[0], cluster_center[1], 'o',
            markerfacecolor=col, markeredgecolor='k', markersize=6)
Beispiel #59
0
def run_tsne(features_file, colors_file, output_prefix,
             filter_sample=None,
             filter_cluster=None,
             lst=None,
             draw_per=1.0,
             iter=1000,
             perplexity=50,
             min_length=2000):
    """Embed feature rows with Barnes-Hut t-SNE and save a scatter plot as PDF.

    Both input files are header-less tables read with ``pd.read_table`` and
    joined on their first column (the row name).  The fourth ``_``-separated
    field of each row name is parsed as an integer length.

    Args:
        features_file: Table with the row name in column 0 followed by the
            feature columns.
        colors_file: Table with the row name in column 0 and a cluster id in
            column 1; the first match of ``extract_num`` in that cell is
            used as the integer cluster id.
        output_prefix: Path handed to ``PdfPages``; despite the name it is
            used as the complete output file name.
        filter_sample: Optional sequence of strings; when non-empty only
            rows whose name starts with ``"sample" + s + "-"`` for one of
            them are kept.
        filter_cluster: Optional sequence of cluster ids to label.  A legend
            is built only when fewer than five ids are given.
        lst: Optional sequence of row names to highlight with a pentagon
            marker.  When ``draw_per < 1`` the entries are treated as name
            prefixes and expanded to every matching row name.
        draw_per: Values below 1 enable sub-sampling through
            ``take_first_per``; the number itself is not forwarded by this
            function.
        iter: Maximum number of t-SNE iterations (name kept for backward
            compatibility although it shadows the builtin).
        perplexity: t-SNE perplexity.
        min_length: Rows are kept only when their parsed length is strictly
            greater than this value.

    Returns:
        None.  The figure is written to ``output_prefix``.
    """
    # Fresh lists per call instead of shared mutable defaults.
    filter_sample = [] if filter_sample is None else filter_sample
    fc = [] if filter_cluster is None else filter_cluster
    lst = [] if lst is None else lst

    # ``cm.spectral`` was removed in matplotlib 2.2; ``nipy_spectral`` is the
    # same colormap under its new name.  Fall back for very old releases.
    spectral = getattr(cm, "nipy_spectral", None) or cm.spectral

    # read data
    data_df = pd.read_table(features_file, header=None)
    cluster_colors = pd.read_table(colors_file, header=None)
    print(data_df.head())

    # make dataframe pretty: column 1 becomes an integer 'color' column
    cluster_colors = cluster_colors.rename(columns={1: 'color'})
    cluster_colors["color"] = [int(extract_num.findall(str(x))[0])
                               for x in cluster_colors["color"].tolist()]
    print(cluster_colors.head())

    # filter by samples
    if len(filter_sample) > 0:
        prefixes = tuple("sample" + it + "-" for it in filter_sample)
        filter1 = [x for x in cluster_colors[0].tolist()
                   if x.startswith(prefixes)]
        cluster_colors = cluster_colors[cluster_colors[0].isin(filter1)]

    # filter by percent
    if draw_per < 1:
        names = cluster_colors[0].tolist()
        clusters = divide_by_cluster(names, cluster_colors["color"].tolist())
        filter2 = take_first_per(clusters, lst)
        already_kept = set(filter2)
        lst_new = []
        # Expand every requested prefix to the full row names it matches and
        # make sure those rows survive the sub-sampling.
        for n in lst:
            for x in names:
                if x.startswith(n):
                    print(x)
                    lst_new.append(x)
                    if x not in already_kept:
                        filter2.append(x)
        lst = lst_new
        cluster_colors = cluster_colors[cluster_colors[0].isin(filter2)]

    # merge data
    mapped = pd.merge(cluster_colors, data_df, on=0)

    # filter by length; copy so the column assignments below act on an
    # independent frame rather than on a slice of the merged one
    mapped["length"] = [int(x.split("_")[3]) for x in mapped[0].tolist()]
    mapped = mapped[mapped["length"] > min_length].copy()
    print(mapped)

    # normalize like in CONCOCT
    # Feature columns sit between the leading (name, color) pair and the
    # trailing 'length' column.
    data = mapped.iloc[:, 2:-1].values

    # add a length-dependent pseudo-count, normalise columns, then rows, and
    # take the logarithm
    v = (1.0 / mapped["length"]).values[:, np.newaxis]
    data = data + v
    data = data / data.sum(axis=0)[None, :]
    data = data / data.sum(axis=1)[:, None]
    data = np.log(data)

    embedding_array = bhtsne.run_bh_tsne(data, initial_dims=data.shape[1],
                                         perplexity=perplexity, max_iter=iter)
    mapped["x"] = embedding_array[:, 0]
    mapped["y"] = embedding_array[:, 1]

    # draw result of TSNE on scatter plot
    pp = PdfPages(output_prefix)
    try:
        # filter clusters to show
        if len(fc) > 0:
            filtered = mapped[mapped["color"].isin(fc)]
        else:
            filtered = mapped

        fig = pyplot.figure()

        # draw scatter plot
        color = mapped["color"].tolist()
        # Guard against a zero maximum so the colour scaling cannot divide
        # by zero when every cluster id is 0.
        mx_color = max(color) or 1
        pyplot.scatter(mapped["x"].tolist(), mapped["y"].tolist(),
                       c=[spectral(float(i) / mx_color) for i in color])

        # make a legend for specific clusters
        # find cluster centers
        mp = divide_by_color(filtered["x"].tolist(),
                             filtered["y"].tolist(),
                             filtered["color"].tolist())
        points, names = find_cluster_centers(mp)

        patches = []
        if len(fc) < 5:
            # sorted() gives the legend a deterministic order
            for c in sorted(set(color)):
                if c in fc:
                    patches.append(
                        mpatches.Patch(color=spectral(float(c) / mx_color),
                                       label='C-' + str(c)))
        pyplot.legend(handles=patches)
        draw_points(points, names, fig)

        # mark specific points
        marked = mapped[mapped[0].isin(lst)]
        pyplot.scatter(marked["x"].tolist(), marked["y"].tolist(),
                       marker="p", edgecolors='black',
                       c=[spectral(float(i) / mx_color)
                          for i in marked["color"].tolist()])

        pyplot.title('Perp = ' + str(perplexity) + ' Iter = ' + str(iter))
        pp.savefig()
    finally:
        # Always release the PDF file handle, even if plotting failed.
        pp.close()