def quick_shift(data, tau, window_type, bandwidth, metric):
    """Perform medoid shiftclustering of data with corresponding parameters.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input points.

    tau : float
        Threshold parameter. Two points may be connected only if the distance
        between them does not exceed tau.

    window_type : string
        Type of window to compute the weights matrix. Can be
        "flat" or "normal".

    bandwidth : float
        Value of the bandwidth for the window.

    metric : string
        Metric used to compute distances. See the pairwise_distances
        documentation for all possible values.

    Returns
    -------
    cluster_centers : array, shape=[n_clusters, n_features]
        Coordinates of cluster centers.

    labels : array, shape=[n_samples]
        Cluster labels for each point.

    cluster_centers_idx : array, shape=[n_clusters]
        Index in data of cluster centers.
    """

    if tau is None:
        tau = estimate_bandwidth(data)
    if bandwidth is None:
        bandwidth = estimate_bandwidth(data)

    medoids, cluster_centers_idx = compute_stationary_medoids(data, tau,
                                                              window_type,
                                                              bandwidth,
                                                              metric)
    cluster_centers = data[cluster_centers_idx]
    labels = []
    labels_val = {}
    # Give each cluster center (stationary medoid) an integer label.
    lab = 0
    for i in cluster_centers_idx:
        labels_val[i] = lab
        lab += 1
    # Follow each point's chain of medoids until a cluster center is reached.
    for i in range(len(data)):
        next_med = medoids[i]
        while next_med not in cluster_centers_idx:
            next_med = medoids[next_med]
        labels.append(labels_val[next_med])
    return cluster_centers, np.asarray(labels), cluster_centers_idx
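For reference, a minimal usage sketch of quick_shift on synthetic data. It assumes the function above and its helpers (estimate_bandwidth, compute_stationary_medoids, numpy as np) are importable from the same module; the blob parameters are arbitrary and for illustration only.

from sklearn.datasets import make_blobs

# Three well-separated synthetic blobs (illustration only).
X, _ = make_blobs(n_samples=300, centers=3, cluster_std=0.5, random_state=0)

# With tau=None and bandwidth=None, quick_shift falls back to estimate_bandwidth(X).
centers, labels, centers_idx = quick_shift(X, tau=None, window_type="flat",
                                           bandwidth=None, metric="euclidean")
print("clusters found:", len(centers_idx))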
    def cluster_pixels_ms(self):
        # reshape
        """
        cluster point descriptors by mean shift
        :type self: ColorRemover
        """
        fg_pixels = self.img.fg_pixels.keys()
        descriptors = []
        for r, c in fg_pixels:
            descriptors.append(self.descriptor_map[r][c])
        descriptors = np.array(descriptors)
        descriptors = PCA(n_components=int(VECTOR_DIMENSION) // 2).fit_transform(descriptors)
        # descriptors = self.descriptor_map.reshape(descriptors_rows, 1, VECTOR_DIMENSION)
        bandwidth = estimate_bandwidth(descriptors, quantile=0.05)
        ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
        ms.fit(descriptors)
        labels = ms.labels_

        for i in range(len(labels)):
            xy = fg_pixels[i]
            label = labels[i]
            self.labels_map.itemset(xy, label)
        # save the indices and BGR values of each cluster as a dictionary with keys of label
        for label in range(K):
            self.pixels_of_hough_line_in_sphere[label] = map(tuple, np.argwhere((self.labels_map == label)))
            self.cluster_bgr[label] = map(tuple, self.img.bgr[self.labels_map == label])
Example #3
def cluster_data(df_story, algo='kmeans', params='{}'):
    print "[EDEN I/O -- cluster_data] algo: ", algo

    start = time.time()
    params = ast.literal_eval(params)

    if algo in ['gac', 'gactemporal']:
        model = algo_select(algo, params)
        model.fit(df_story)

    elif algo == 'meanshift':
        vsm = recon_vsm(df_story['vsm'])
        params['bandwidth'] = estimate_bandwidth(vsm, n_samples=200)
        model = algo_select(algo, params)
        model.fit(vsm)
    else:
        vsm = recon_vsm(df_story['vsm'])
        model = algo_select(algo, params)
        model.fit(vsm)

    # print "[EDEN I/O -- cluster_data.py] plot: cluster counts"
    # plot_cluster_counts(model, "Cluster counts using algorithm: " + str(algo))

    # print "[EDEN I/O -- cluster_data] model: ", model

    end = time.time()
    print "[EDEN I/O -- cluster_data.py] Total elapsed time: ", end - start

    return model
Example #4
    def _fit_mean_shift(self, x):
        for c in xrange(len(self.crange)):
            quant = 0.015 * (c + 1)
            for r in xrange(self.repeats):
                bandwidth = estimate_bandwidth(
                    x, quantile=quant, random_state=r)
                idx = c * self.repeats + r
                model = MeanShift(
                    bandwidth=bandwidth, bin_seeding=True)
                model.fit(x)
                self._labels[idx] = model.labels_
                self._parameters[idx] = model.cluster_centers_

                # build equivalent gmm
                k = model.cluster_centers_.shape[0]
                model_gmm = GMM(n_components=k, covariance_type=self.cvtype,
                                init_params='c', n_iter=0)
                model_gmm.means_ = model.cluster_centers_
                model_gmm.weights_ = sp.array(
                    [(model.labels_ == i).sum() for i in xrange(k)])
                model_gmm.fit(x)

                # evaluate goodness of fit
                self._ll[idx] = model_gmm.score(x).sum()
                if self.gof_type == 'aic':
                    self._gof[idx] = model_gmm.aic(x)
                if self.gof_type == 'bic':
                    self._gof[idx] = model_gmm.bic(x)

                print quant, k, self._gof[idx]
def ms_algo(X, bandwidth=None):
    if bandwidth is None:
        n_samples = X.shape[0]
        bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=n_samples)

    # Apply the MeanShift algorithm from the sklearn library
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)

    # collect from the meanshift algorithm the labels and the centers of the clusters
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_


    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique) #Number of clusters

    # Print section
    print("The number of clusters is: %d" % n_clusters_)

    print("The centers are:")
    for i in range(n_clusters_):
        print i,
        print cluster_centers[i]

    return cluster_centers    
Example #6
    def test_estimate_bandwidth(self):
        iris = datasets.load_iris()
        df = pdml.ModelFrame(iris)

        result = df.cluster.estimate_bandwidth(random_state=self.random_state)
        expected = cluster.estimate_bandwidth(iris.data, random_state=self.random_state)
        self.assertEqual(result, expected)
Example #7
def applyMeanShift(data,quantileValue=0.2,clusterall=False):
	result=[]
	n_samples=len(data)
	print "Nombre de points du dataset: %d" %n_samples
	
	bandwidth = estimate_bandwidth(data, quantile=quantileValue)
	ms = MeanShift(bandwidth=bandwidth,cluster_all=clusterall)
	# Apply MeanShift
	clustereddata=ms.fit(data)
	clusteredlabels= clustereddata.labels_
	barycenters=ms.cluster_centers_

	labels_unique = np.unique(clusteredlabels)
	nbOfClusters = len(labels_unique)

	print "number of estimated clusters : %d" % nbOfClusters

	for i in labels_unique:
		print "###Indices des points du cluster %d : ###" %i
		# print [indice[0] for indice in np.argwhere(clusteredlabels == i)]
		result.append([indice[0] for indice in np.argwhere(clusteredlabels == i)])
	# Add a zero-coordinates vector to take into account that the -1 "cluster" does not have a barycenter
	if -1 in labels_unique:
		barycenters= np.append([[0 for k in range(len(barycenters[0]))]],barycenters,axis=0)

	return [result,barycenters]
def meanShift(flat_image):
    # Estimate Bandwidth
    bandwidth = estimate_bandwidth(flat_image, quantile = 0.2, n_samples=500)
    ms = MeanShift(bandwidth, bin_seeding=True)
    ms.fit(flat_image)
    labels = ms.labels_
    return ms.labels_, ms.cluster_centers_
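A hedged sketch of how the meanShift helper above could drive a simple colour segmentation; the input path, the OpenCV loading step, and the reshape to an (n_pixels, 3) array are assumptions for illustration, not part of the original example.

import cv2
import numpy as np

img = cv2.imread("input.png")                      # hypothetical image path
flat_image = img.reshape(-1, 3).astype(np.float64)

labels, centers = meanShift(flat_image)
# Paint each pixel with the centre of the cluster it was assigned to.
segmented = centers[labels].reshape(img.shape).astype(np.uint8)
cv2.imwrite("segmented.png", segmented)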
Example #9
def clusterise_data(data_obj):
    """ Assigns a cluster label to each days present in the data received 
        using three different algorithms: MeanShift, Affinity Propagation, 
        or KMeans. 
        @param data_obj: List of dictionaries
    """
    L = len(data_obj)
    
    #Simply converts data_obj to a 2D list for computation
    List2D = [[None for _ in range(4)] for _ in range(L-1)]
    for i in range(L-1): #don't include current day
        #wake_up and sleep_duration are the most important factors
        List2D[i][0] = 5 * data_obj[i]["wake_up"]
        List2D[i][1] = 1 * data_obj[i]["sleep"]
        List2D[i][2] = 5 * data_obj[i]["sleep_duration"]
        List2D[i][3] = 0.5 * data_obj[i]["activity"]
    points = NumpyArray(List2D) #converts 2D list to numpyarray
        
    if ALGO == "Affinity Propagation":
        labels = AffinityPropagation().fit_predict(points)
    elif ALGO == "KMeans":
        labels = KMeans(init='k-means++', n_clusters=5, n_init=10).fit_predict(points)
    elif ALGO == "MeanShift":
        bandwidth = estimate_bandwidth(points, quantile=0.2, n_samples=20)
        labels = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit_predict(points)
    else:
        raise Exception("Algorithm not defined: "+str(ALGO))
        
    for i in range(L-1):
        data_obj[i]["cluster"] = labels[i]
    for unique_label in remove_duplicates(labels):
        debug_print(ALGO+": Cluster "+str(unique_label)+" contains "+str(labels.tolist().count(unique_label))+" data points")
    debug_print(ALGO+": Silhouette coefficient"+ str(metrics.silhouette_score(points, labels, metric='euclidean')*100)+"%")
Example #10
def call_kmean(num_cluster, data, update_flag):
    X = StandardScaler().fit_transform(data)
    bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)
    two_means =  MiniBatchKMeans( n_clusters=num_cluster)
    labels = two_means.fit(X).labels_.astype(np.int)

    # if user upload files
    if update_flag:
        return labels


    label_dict = {}
    label_dict_count = 0
    for label in labels:
       label_dict[str(label_dict_count)] = float(label)
       label_dict_count = label_dict_count + 1
    print label_dict

    unique_dict = {}
    unique_dict_count = 0
    for uniq in np.unique(labels):
       print uniq
       unique_dict[str(unique_dict_count)] = float(uniq)
       unique_dict_count = unique_dict_count + 1
    print unique_dict

    return label_dict, unique_dict
def do_meanshift(s_path, band1, band2, band3, band4, colour1, colour2,
                 make_plot):
    '''Meanshift clustering to determine the number of clusters in the
        data, which is passed to KMEANS function'''
    # Truncate data
    X = np.vstack([colour1, colour2]).T
    '''Compute clustering with MeanShift'''
    # Scale data because meanshift generates circular clusters
    X_scaled = preprocessing.scale(X)
    # The following bandwidth can be automatically detected using
    # the routine estimate_bandwidth(X). Bandwidth can also be set manually.
    bandwidth = estimate_bandwidth(X)
    #bandwidth = 0.65
    # Meanshift clustering
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, cluster_all=False)
    ms.fit(X_scaled)
    labels_unique = np.unique(ms.labels_)

    objects = ms.labels_[ms.labels_ >= 0]
    n_clusters = len(labels_unique[labels_unique >= 0])
    # Make plot
    if "meanshift" in make_plot:
        make_ms_plots(s_path, colour1, colour2, n_clusters, X, ms,
                      band1, band2, band3, band4, objects)
    return(n_clusters, bandwidth)
Example #12
def BA_meanshift_cluster(mark, chrom):
    '''
    @param:
    @return:
    perform mean shift cluster on 2D data:
        ((chromStart+chromEnd)*0.5, chromEnd-chromStart)
    '''
    path = os.path.join(get_data_dir(), "tmp", mark,"{0}-{1}.csv".format(chrom, mark))
    DF = pd.read_csv(path, sep='\t')
    S_x = 0.5*(DF.loc[:, 'chromEnd'].values+DF.loc[:, 'chromStart'].values)
    S_y = DF.loc[:, 'chromEnd'].values-DF.loc[:, 'chromStart'].values
    X = np.hstack((np.atleast_2d(S_x[7000:8000]).T, np.atleast_2d(S_y[7000:8000]).T))
    print X
    bandwidth = estimate_bandwidth(X, quantile=0.1, n_samples=1000)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)
    labels = ms.labels_
    print list(set(labels))
    import matplotlib.pyplot as plt
    from itertools import cycle
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(len(list(set(labels)))), colors):
        my_members = labels == k
        plt.plot(X[my_members, 0], X[my_members, 1], col + '.')
    plt.title('Estimated number of clusters: %d' % len(list(set(labels))))
    plt.show()
def mean_shift_cluster_analysis(x,y,quantile=0.2,n_samples=1000):
    # ADAPTED FROM:
    # http://scikit-learn.org/stable/auto_examples/cluster/plot_mean_shift.html#example-cluster-plot-mean-shift-py
    # The following bandwidth can be automatically detected using estimate_bandwidth.
    X = np.hstack((x.reshape((x.shape[0],1)),y.reshape((y.shape[0],1))))
    bandwidth = estimate_bandwidth(X, quantile=quantile, n_samples=n_samples)
    
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    
    #print("number of estimated clusters : %d" % n_clusters_)
    colors = 'bgrcmykbgrcmykbgrcmykbgrcmykbgrcmykbgrcmykbgrcmyk' #cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for i in xrange(len(np.unique(labels))):
        my_members = labels == i
        cluster_center = cluster_centers[i]
        plt.scatter(X[my_members, 0], X[my_members, 1],s=90,c=colors[i],alpha=0.7)
        plt.scatter(cluster_center[0], cluster_center[1],marker='+',s=280,c=colors[i])
    tolx = (X[:,0].max()-X[:,0].min())*0.03
    toly = (X[:,1].max()-X[:,1].min())*0.03
    plt.xlim(X[:,0].min()-tolx,X[:,0].max()+tolx)
    plt.ylim(X[:,1].min()-toly,X[:,1].max()+toly)
    plt.show()
    return labels
Example #14
def mean(X, save_fig=False, params_labels=None, prefix='clusters'):
    '''
    Compute clustering with MeanShift
    '''
    logger.debug('Calculating MeanShift clusters using %d parameters'%len(X[0]))
    
    X = np.array( X )
    
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        bandwidth = estimate_bandwidth(X, quantile=0.2)
    
        ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
        ms.fit(X)
        
    labels = ms.labels_
    
    if save_fig:
        plotClusters(X, ms, method='mean', prefix=prefix,
                     params=params_labels)
    
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    
    logger.debug('Found %d clusters with MeanShift algorithm'%n_clusters_)
    
    return labels
Example #15
def clustering(matrix,lst,blst):
    dblabel=cluster.DBSCAN(eps=8E-4).fit_predict(matrix)
    dblabel=select(dblabel)
    print("DBScan finished.")
    kmlabel=cluster.KMeans(n_clusters=300).fit_predict(matrix)
    kmlabel=select(kmlabel)
    print("KMeans finished.")
    bw=cluster.estimate_bandwidth(matrix,quantile=0.01,n_samples=1000)
    ms=cluster.MeanShift(bandwidth=bw)
    mslabel=ms.fit_predict(matrix)
    mslabel=select(mslabel)
    print("MeanShift finished.")
    bc=cluster.Birch(threshold=0.01)
    bmat=matrix.tolist()
    bclabel=bc.fit_predict(bmat)
    bclabel=select(bclabel)
    print("Birch finished.")
    intesec=[]
    suspct=[]
    c=0
    for i in range(len(matrix)):
        #if bclabel[i]:
            #c+=1
        #if mslabel[i]:
        if dblabel[i] and kmlabel[i] and mslabel[i] and bclabel[i]:
            intesec.append(lst[i])
        if dblabel[i] or kmlabel[i] or mslabel[i] or bclabel[i]:
            suspct.append(lst[i])
    print(str(c))
    return intesec,suspct
Example #16
def meanshift(raw_data, t):
   # Compute clustering with MeanShift
    # The following bandwidth can be automatically detected using estimate_bandwidth.
    #data = [ [(raw_data[i, 1]+raw_data[i, 5]), (raw_data[i, 2]+raw_data[i,6])] for i in range(raw_data.shape[0]) ]
    data = np.zeros((raw_data.shape[0],2))
    X = raw_data[:,1] + raw_data[:,5]
    Y = raw_data[:,2] + raw_data[:,6]
    #X = raw_data[:,1] ; Y = raw_data[:,2];
    data = np.transpose(np.concatenate((np.mat(X),np.mat(Y)), axis=0))
    bandwidth = estimate_bandwidth(data, quantile=0.2, n_samples=500)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(data)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    print("number of estimated clusters : %d" % n_clusters_) 
    # Plot result
    plt.figure(t)
    plt.clf()
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(n_clusters_), colors):
        my_members = labels == k
        cluster_center = cluster_centers[k]
        plt.plot(data[my_members, 0], data[my_members, 1], col + '.')
        plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=14)
    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.axis('equal')
    plt.show()    
Example #17
def mean_shift(X):
    bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=1000)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, cluster_all=False)
    ms.fit(X)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    return labels, cluster_centers
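A small self-contained check of the mean_shift wrapper above on synthetic blobs, assuming numpy, MeanShift and estimate_bandwidth are imported as in the surrounding examples. Because the wrapper sets cluster_all=False, orphaned points receive the label -1 and are excluded from the cluster count.

from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=500, centers=4, cluster_std=0.6, random_state=42)
labels, centers = mean_shift(X)
# Count only real clusters; -1 marks points that fell outside every kernel.
print("estimated clusters:", len(np.unique(labels[labels >= 0])))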
Example #18
def train(trainingData, pklFile, clusteringAll, numberOfClusters=None):
	# ========================================================================= #
	# =============== STEP 1. DEFINE OUTPUT LEARNT MODEL FILE ================= #
	# ========================================================================= #
	if (pklFile == ''):
		os.system('rm -rf learntModel & mkdir learntModel')
		pklFile = 'learntModel/learntModel.pkl'
	
	# ========================================================================= #
	# =============== STEP 2. PERFORM CLUSTERING TO THE DATA ================== #
	# ========================================================================= #
	if numberOfClusters is None:
		print "Running MeanShift Model..."
		bandwidth = estimate_bandwidth(trainingData)
		ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=clusteringAll)
		ms.fit(trainingData)
		joblib.dump(ms, pklFile)
		return {"numberOfClusters":len(ms.cluster_centers_), "labels": ms.labels_, "clusterCenters":ms.cluster_centers_}
	
	elif numberOfClusters is not None:
		print "Running K-Means Model..."
		kMeans = KMeans(init='k-means++', n_clusters=numberOfClusters)
		kMeans.fit(trainingData)
		joblib.dump(kMeans, pklFile)
		return {"numberOfClusters":len(kMeans.cluster_centers_), "labels": kMeans.labels_, "clusterCenters":kMeans.cluster_centers_}
Example #19
def simplify_data1(x):
	X = np.array(zip(x,np.zeros(len(x))), dtype=np.float)
	bandwidth = estimate_bandwidth(X, quantile=0.2)
	ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
	ms.fit(X)
	labels = ms.labels_
	cluster_centers = ms.cluster_centers_
	labels_unique = np.unique(labels)
	n_clusters_ = len(labels_unique)
	#print n_clusters_
	#exit()
	start=0
	value=0
	print x
	for k in range(n_clusters_):
	    my_members = labels == k
	    print "cluster {0}: {1}".format(k, X[my_members, 0]),np.average(X[my_members, 0])
	    value=np.average(X[my_members, 0])
	    val2=0
	    for i in xrange(start,start+len(X[my_members, 0])):
		val2+=X[i][0]
		print val2,X[i][0],i
		X[i][0]=value
	    print "FINAL",val2/len(X[my_members, 0])
	    start+=len(X[my_members, 0])
	return X[:,0]
    def meanshift_for_hough_line(self):
        # init mean shift
        pixels_of_label = {}
        points_of_label = {}
        for hough_line in self.points_of_hough_line:
            pixels = self.pixels_of_hough_line[hough_line]
            pixels = np.array(pixels)
            bandwidth = estimate_bandwidth(pixels, quantile=QUANTILE, n_samples=500)
            if bandwidth == 0:
                bandwidth = 2
            ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
            ms.fit(pixels)
            labels = ms.labels_
            labels_unique = np.unique(labels)
            n_clusters_ = len(labels_unique)
            for k in range(n_clusters_):
                label = list(hough_line)
                label.append(k)
                pixels_of_label[tuple(label)] = map(tuple, pixels[labels==k])
        for label in pixels_of_label:
            pixels = pixels_of_label[label]
            points = map(self.img.get_bgr_value, pixels)
            points_of_label[label] = points
        self.pixels_of_hough_line = pixels_of_label
        self.points_of_hough_line = points_of_label
def make(filename, precision):
    with open('test.geojson') as f:
        data = json.load(f)

    features = data['features']
    points = [
        geo['geometry']["coordinates"]
        for geo in features if pred(geo)
    ]
    print points
    ar_points = array(points).reshape(len(points) * 2, 2)
    print ar_points
    bandwidth = estimate_bandwidth(ar_points) / precision
    cluster = MeanShift(bandwidth=bandwidth)
    cluster.fit(ar_points)
    labels = cluster.labels_
    cluster_centers = cluster.cluster_centers_
    print 'clusters:', len(unique(labels))

    for i, geo in enumerate(filter(pred, features)):
        geo['geometry']["coordinates"] = [
            list(cluster_centers[labels[i*2 + j]])
            for j in range(2)
        ]

    with open(filename, 'w') as f:
        json.dump(data, f)
Example #22
def Mean_Shift(path):
    # import the data
    data=pandas.read_csv(filepath_or_buffer=path,delimiter=',',encoding='utf-8')  
    data.drop_duplicates()
    print (data)
    # read the data
    values=data[['latitude', 'longitude']].values
    print("printing values")
    print (values)
    #Mean shift
    print ("Clustering data Meanshift algorithm")
    bandwidth = estimate_bandwidth(values, quantile=0.003, n_samples=None)
    #ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, min_bin_freq=20, cluster_all=False)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True,min_bin_freq=25,cluster_all=False)
    ms.fit(values)
    data['cluster'] = ms.labels_
    data = data.sort(columns='cluster')
    data = data[(data['cluster'] != -1)]
    print (data['cluster'])
    data['cluster'] = data['cluster'].apply(lambda x:"cluster" +str(x))
    labels_unique = np.unique(ms.labels_).tolist()
    del labels_unique[0]
    # Filtering clusters centers according to data filter
    cluster_centers = DataFrame(ms.cluster_centers_, columns=['latitude', 'longitude'])
    cluster_centers['cluster'] = labels_unique
    print (cluster_centers)
    n_centers_ = len(cluster_centers)
    print("number of clusters is :%d" % n_centers_)
    # print ("Exporting clusters to {}...'.format(clusters_file)")
    data.to_csv(path_or_buf="output/points.csv", cols=['user','latitude','longitude','cluster','picture','datetaken'], encoding='utf-8')
    #print ("Exporting clusters centers to {}...'.format(centers_file)")
    cluster_centers['cluster'] = cluster_centers['cluster'].apply(lambda x:"cluster" +str(x))
    cluster_centers.to_csv(path_or_buf="output/centers.csv", cols=['latitude', 'longitude','cluster'], encoding='utf-8')
    plot_meanshift(data, cluster_centers, n_centers_)
    return 0
Example #23
def perform_mean_shift(data):
	X = np.c_[data]
	(n_samples, n_features) = X.shape
	bandwidth = cluster.estimate_bandwidth(X, n_samples=n_samples)
	ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True, cluster_all=False)
	ms.fit(X)
	import pdb
	pdb.set_trace()
def meanShiftClustering(centers_df,subject):
    # Estimate the bandwidth to use with the mean shift algorithm. The quantile controls the
    # distance between box centers that defines a cluster: a smaller quantile means a smaller
    # distance between points that end up in the same cluster (see the short sketch after this function).
    centers_df=centers_df.reset_index()
    bandwidth=estimate_bandwidth(centers_df[['center_x','center_y']].as_matrix(), quantile=0.0055)
    #instantiate the mean shift algorithm
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    #fit the algorithm on the box center coordinates
    ms.fit(centers_df[['center_x','center_y']])
    # get the resulting cluster labels
    labels = ms.labels_
    #get the resulting centers of each *cluster*
    cluster_centers = ms.cluster_centers_

    labels_unique = np.unique(labels)
    #calculate the number of clusters by using the length of the list that contains all the unique labels
    n_clusters_ = len(labels_unique)

    #concatenate the centers data frame (which contains all the box coordinates, their dimensions, and their centers) with the clustering labels generated by the clustering
    boxes_df = pd.concat([centers_df,pd.DataFrame(labels,columns=['cluster_label'])],axis=1)

    #the aggregate function in the groupby, includes two functions: count and median
    f = {'Number of boxes in a cluster': ['count'],'Median': ['median']}
    #group by the label of each cluster and aggregate the boxes' top left coordinates and dimensions by applying the median
    aggregated_df = boxes_df.groupby('cluster_label')['cluster_label','tl_x','tl_y','width','height'].agg(f).reset_index()
    #change column names for a more descriptive name
    aggregated_df.columns = ['cluster_label','median_cluster_label','agg_tl_x','agg_tl_y','agg_width','agg_height','boxes_in_cluster','count_tl_x','count_tl_y','count_width','count_height']
    #leave out the unnecessary columns
    aggregated_df = aggregated_df[['cluster_label','agg_tl_x','agg_tl_y','agg_width','agg_height','boxes_in_cluster']]
    
    #Look at the output of the plotBoxes function (svg file) and determine at which THRESHOLD value there is a desired number of clusters (appears at the top of the plot) and that it visually matches the actual grid
    THRESHOLD = 5

    #filter out all the clusters that have less than a certain number of boxes in each cluster
    #use the old-weather-aggregator-with-plot.py script to check what the best threshold is
    aggregated_df = aggregated_df.loc[aggregated_df.boxes_in_cluster>THRESHOLD,:]
    good_clusters = np.unique(aggregated_df.cluster_label.values)

    print "for subject_id:"+str(subject)

    print "number of estimated clusters overall: %d" % n_clusters_

    print "number of estimated clusters, after small clusters were filtered out: %d" % len(good_clusters)

    print "clusters with more than %d boxes per cluster:" % THRESHOLD
    print aggregated_df.columns
    print aggregated_df.head()

    #save the aggregated boxes and their clusters into a csv file, separate file for each subject
    print "Saving the output/aggregated_df_%s.csv file..." % str(subject)
    aggregated_df.to_csv("output/aggregated_df_"+str(subject)+".csv",index=False)

    # make sure that only the boxes that belong to the good_clusters (have more boxes than the threshold) remain in the boxes_df dataframe and then save the dataframe
    boxes_df = boxes_df.loc[boxes_df['cluster_label'].isin(good_clusters),:]
    print "Saving the output/clustered_df_%s.csv file..." % str(subject)
    boxes_df.to_csv("output/clustered_df_"+str(subject)+".csv",index=False)

    plotBoxes(aggregated_df,boxes_df,cluster_centers)
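To make the quantile remark in the comment above concrete, here is a tiny hedged sketch on random data (the quantile values are arbitrary): estimate_bandwidth returns a larger bandwidth as the quantile grows, so clusters merge more readily.

import numpy as np
from sklearn.cluster import estimate_bandwidth

rng = np.random.RandomState(0)
pts = rng.rand(200, 2)                             # arbitrary 2-D points, illustration only
for q in (0.01, 0.1, 0.5):
    print(q, estimate_bandwidth(pts, quantile=q))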
Example #25
def mean_shift(data, bandwidth=None, n_samples=500, quantile=0.3):
    # Fall back to an estimated bandwidth when none is supplied.
    if bandwidth is None:
        bandwidth = skcluster.estimate_bandwidth(data,
                                                 quantile=quantile,
                                                 n_samples=n_samples)

    ms = skcluster.MeanShift(bandwidth=bandwidth).fit(data)
    labels = ms.labels_
    return labels
Example #26
def do_meanshift (band1, band2, band3, band4, colour1, colour2, make_plots):
    '''Does meanshift clustering to determine a number of clusters in the 
        data, which is passed to KMEANS function'''

    data = np.loadtxt(inputdata)

    #Input Checking
    #if band1 == band2 or band3 == band4: 
        #print "Not a good idea to use the same band in one colour, try again"
        #return
    #for band in [band1, band2, band3, band4]:
        #if band not in band_names.keys():
            #print "Can't find %s in band_name list" %band
            #return
        
    #Import 4 different wavelengths
    #Colour 1: 05_mag
    wave1 = data[:, band_names[band1]]
    wave2 = data[:, band_names[band2]]
    
    #Colour 2: 05_mag
    wave3 = data[:, band_names[band3]]
    wave4 = data[:, band_names[band4]]
    
    gooddata1 = np.logical_and(np.logical_and(wave1!=badval, wave2!=badval), np.logical_and(wave3!=badval, wave4!=badval)) # Remove data pieces with no value 
    gooddata2 = np.logical_and(np.logical_and(wave1<maglim, wave2<maglim), np.logical_and(wave3<maglim, wave4<maglim))
    greatdata = np.logical_and(gooddata1, gooddata2)
    
    colour1 = wave1[greatdata] - wave2[greatdata]
    colour2 = wave3[greatdata] - wave4[greatdata]
    
      
    #Truncate data
    X = np.vstack([colour1, colour2]).T

    #Scale data because meanshift generates circular clusters 
    X_scaled = preprocessing.scale(X)

    # The following bandwidth can be automatically detected using
    # the routine estimate_bandwidth(). Bandwidth can also be set
    # as a value.

    bandwidth = estimate_bandwidth(X)

    # Meanshift clustering 
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, cluster_all=False)
    ms.fit(X_scaled)

    labels_unique = np.unique(ms.labels_)
    n_clusters = len(labels_unique[labels_unique >= 0])
    
    #Make plot of clusters if needed
    
    if "MSplot" in make_plot: 
        make_ms_plots(colour1, colour2, n_clusters, X, ms, band1, band2, band3, band4)
    
    return(n_clusters)
Example #27
def checkForClustering(catalog):
    debug("Checking for data clustering")
    Xfull = catalog.view(np.float64).reshape(catalog.shape + (-1,))[:,1:]
    X = Xfull[:,2:]
    
    
    debug("Using DBSCAN")
    db = DBSCAN(eps=0.3, min_samples=10).fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    n_clusters_DBSCAN = len(set(labels)) - (1 if -1 in labels else 0)
    debug('Estimated number of clusters with DBSCAN: %d' % n_clusters_DBSCAN)
        
    unique_labelsDBSCAN = set(labels)
    colorsDBSCAN = plt.cm.rainbow(np.linspace(0, 1, len(unique_labelsDBSCAN)))
    
    debug("Estimating clusters using MeanShift")
    bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=500)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)
    labelsMS = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_uniqueMS = np.unique(labelsMS)
    n_clusters_MS = len(labels_uniqueMS)
    debug("Estimated number of clusters with MeanShift: %d" % n_clusters_MS)
    
    # Plot result
    fig = plt.figure(figsize=(12,12))
    ax0 = fig.add_subplot(2,2,1)
    ax1 = fig.add_subplot(2,2,2)
    ax2 = fig.add_subplot(2,2,3)
    ax3 = fig.add_subplot(2,2,4)
    for k, col in zip(unique_labelsDBSCAN, colorsDBSCAN):
        if k == -1:
            col = 'k'
        class_member_mask = (labels == k)
        mask = class_member_mask & core_samples_mask
        xy = Xfull[mask]
        ax0.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=5)
        ax2.plot(catalog['MAG_APER(1)'][mask], catalog['CLASS_STAR'][mask], 'o', markerfacecolor=col, markeredgecolor='k', markersize=5)
        xy = Xfull[class_member_mask & ~core_samples_mask]
        ax0.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=5)
        ax2.plot(catalog['MAG_APER(1)'][class_member_mask & ~core_samples_mask], catalog['CLASS_STAR'][class_member_mask & ~core_samples_mask], 'o', markerfacecolor=col, markeredgecolor='k', markersize=5)

        ax0.set_title('DBSCAN: # clusters: %d' % n_clusters_DBSCAN)
        
        
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(n_clusters_MS), colors):
        my_members = labelsMS == k
        cluster_center = cluster_centers[k]
        ax1.plot(Xfull[my_members, 0], Xfull[my_members, 1], col + '.')
        ax3.plot(catalog['MAG_APER(1)'][my_members], catalog['CLASS_STAR'][my_members], col + '.')
        #ax1.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=14)
    ax1.set_title('MeanShift: # clusters: %d' % n_clusters_MS)
    plt.show()
Example #28
def MSclusterer(X):
	X = X.toarray()
	bandwidth = estimate_bandwidth(X, quantile=0.04, n_samples=500)
	ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=False)
	ms.fit(X)
	labels = ms.labels_
	labels_unique = np.unique(labels)
	n_clusters_ = len(labels_unique)
	print(n_clusters_)
	return ms.labels_
Example #29
def mean_shift_clustering(features):
    bandwidth = estimate_bandwidth(features, quantile=0.2, n_samples=500)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(features)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters = len(labels_unique)
    print("-- # of clusters: %d" % n_clusters)
    return labels
def fit_mean_shift_object(x,y,quantile=.005):
    '''
    Given x, y lists of the x and y coordinates of points,
    fit a MeanShift cluster object to these points.
    '''
    points = make_coordinates(x=x,y=y)
    bandwidth=estimate_bandwidth(points, quantile=quantile)
    ms = MeanShift(bandwidth=bandwidth,bin_seeding=True)
    ms.fit(points)
    return ms
Example #31
    def zoom_neutrophils(self):

        img_size = min(self.detector.img.shape[0:2])

        neutrophils_objects = []

        # get neutrophils objects
        for i, cell_coordinates in enumerate(self.detector.overlays):
            prediction = self.convnet.predict(self.detector.cells_images[i])
            if prediction == 2:
                neutrophils_objects.append(cell_coordinates)

        if len(neutrophils_objects) == 0:
            return None, None

        quantile = 0.4 / (2**min(floor(img_size / 1000.0), 3))
        bandwidth = estimate_bandwidth(neutrophils_objects, quantile=quantile)
        if bandwidth <= 0:
            return None, None
        else:
            ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
            ms.fit(neutrophils_objects)

            labels = ms.predict(neutrophils_objects)
            clusters = []
            for label in np.unique(labels):
                clusters.append([
                    neutrophils_objects[i] for i, lbl in enumerate(labels)
                    if lbl == label
                ])

            def density_function(x, objects, ms_density, convnet_density):
                current_index = objects.index(x)
                bias_density = int(convnet_density.img_size / 2)
                center_density = (
                    int(ms_density.cluster_centers_[current_index][0]) +
                    bias_density,
                    int(ms_density.cluster_centers_[current_index][1]) +
                    bias_density)
                radius_density = int(
                    max([
                        np.linalg.norm(
                            np.array(coord) - np.array(center_density))
                        for coord in x
                    ]))
                area = pi * radius_density**2
                density = len(x) / area if len(x) > 1 else 0

                return density

            object_in_max_clusters = max(
                clusters,
                key=lambda x: density_function(x, clusters, ms, self.convnet))
            index_of_maximum = clusters.index(object_in_max_clusters)
            bias = int(self.convnet.img_size / 2)

            # get optimal center and radius of cluster
            center = (int(ms.cluster_centers_[index_of_maximum][0]) + bias,
                      int(ms.cluster_centers_[index_of_maximum][1]) + bias)
            radius = int(
                max([
                    np.linalg.norm(np.array(coord) - np.array(center))
                    for coord in object_in_max_clusters
                ]))
            return center, radius + bias
Example #32
    # print(predict)
    print('%-9s\t%.2fs\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f' % (
        name,
        (time() - t0),
        metrics.homogeneity_score(labels, predict),
        metrics.completeness_score(labels, predict),
        metrics.v_measure_score(labels, predict),
        metrics.adjusted_rand_score(labels, predict),
        metrics.adjusted_mutual_info_score(
            labels, predict, average_method='arithmetic'),
        # metrics.silhouette_score(data, predict,
        #                          metric='euclidean',
        #                          sample_size=n_samples)
    ))
    del estimator


AllAlgorithm(AgglomerativeClustering(n_clusters=n_digits, linkage='ward'),
             name="AC",
             data=data)
bandwidth = estimate_bandwidth(data, quantile=0.3, n_samples=sample_size)
print(39 * '_' + 'PCA' + 40 * '_')

reduced_data = PCA(n_components=10).fit_transform(data)
bandwidth = estimate_bandwidth(data, quantile=0.3, n_samples=sample_size)
AllAlgorithm(AgglomerativeClustering(n_clusters=n_digits, linkage='ward'),
             name="AC",
             data=reduced_data)

print(82 * '_')
Example #33
from sklearn.cluster import MeanShift
import sklearn.cluster as cluster

from sklearn.decomposition import PCA

n_cluster_num = 3
clusterer = KMeans(n_clusters=n_cluster_num, random_state=10)
cluster_labels_Kmeans = clusterer.fit_predict(df_feature)

clusterer = DBSCAN(eps=0.45)
cluster_label_DBScan = clusterer.fit_predict(df_feature)

clusterer = Birch(n_clusters=n_cluster_num)
cluster_label_Birch = clusterer.fit_predict(df_feature)

bandwidth = cluster.estimate_bandwidth(df_feature, quantile=0.15)
clusterer = MeanShift(bin_seeding=True, bandwidth=bandwidth)
cluster_label_MeanShift = clusterer.fit_predict(df_feature)

a_pca = PCA(n_components=3)
data_pca = a_pca.fit_transform(df_feature)

Y = wine.target

# Start a new figure (plt.figure)!
plt.figure(figsize=(20, 5))
# Plot using plt.scatter!
# Hint: X_pca[:, 0], X_pca[:, 1], c=Y
plt.subplot(151)
plt.xlabel("Kmeans")
plt.scatter(data_pca[:, 0], data_pca[:, 1], c=cluster_labels_Kmeans)
# 1.
# iris_data = pd.read_csv('iris_data.csv')
iris_data = pd.read_excel('iris_data.xlsx')
# print(iris_data)

# 2.
iris_data = pd.get_dummies(iris_data, columns=['Species'])
# print(iris_data)

# 3.
virginica = iris_data.loc[iris_data['Species_I. virginica'] == 1]
versicolor = iris_data.loc[iris_data['Species_I. versicolor'] == 1]
setosa = iris_data.loc[iris_data['Species_I. setosa'] == 1]

# virginica.plot.scatter(x=0, y=1, c='r')
# versicolor.plot.scatter(x=0, y=1, c='b')
# setosa.plot.scatter(x=0, y=1, c='g')

plt.scatter(x=virginica['Sepal length'], y=virginica['Sepal width'], color='r')
plt.scatter(x=versicolor['Sepal length'],
            y=versicolor['Sepal width'],
            color='g')
plt.scatter(x=setosa['Sepal length'], y=setosa['Sepal width'], color='b')
# plt.show()

# 4.
print(estimate_bandwidth(virginica, quantile=0.2))
print(estimate_bandwidth(versicolor, quantile=0.2))
print(estimate_bandwidth(setosa, quantile=0.2))
Example #35
def color_quantization_click_and_select(path):
	# Recreate image
	def recreate_image(codebook, labels, w, h):
		"""Recreate the (compressed) image from the code book & labels"""
		d = codebook.shape[1]
		image = np.zeros((w, h, d))
		label_idx = 0
		for i in range(w):
			for j in range(h):
				image[i][j] = codebook[labels[label_idx]]
				label_idx += 1
		return image
		
	# Read image by cv2
	path = '/home/yizi/Documents/phd/historical_map_project/image_generator/BHdV_PL_ATL20Ardt_1929_0003/image_batches/_05_06.tiff'
	image = cv2.imread(path)
	
	# Gaussian blur to smooth noise pixels
	image = cv2.GaussianBlur(image, (3, 3), cv2.BORDER_DEFAULT)
	image_file_name = os.path.basename(path).split('.')[0]
	
	# Change color space from BGR to RGB to HLS
	image = cv2.cvtColor(image, cv2.COLOR_BGR2HLS)
	image = np.array(image)  # Change image objects into array

	plt.imshow(image)
	plt.show()
	# The following bandwidth can be automatically detected using estimate_bandwidth.
	image_reshape = image.reshape((image.shape[0] * image.shape[1], image.shape[2]))
	bandwidth = estimate_bandwidth(image_reshape, quantile=0.1, n_samples=500)

	ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, n_jobs=10)
	ms.fit(image_reshape)
	labels = ms.predict(image_reshape)
	cluster_centers = ms.cluster_centers_.astype(np.uint8)

	labels_unique = np.unique(labels)
	n_clusters_ = len(labels_unique)

	image = recreate_image(cluster_centers, labels, image.shape[0], image.shape[1]).astype(np.uint8)
	
	print("Reduce color through mean-shift: %d" % n_clusters_)

	select_color = color_picker(image)
	print(select_color)
	
	@nb.njit
	def euc(a, b):
		return ((b - a) ** 2).sum(axis=0) ** 0.5
	
	segmentation_image = {key: None for key in list(select_color.keys())}
	for key, value in select_color.items():
		image_copy = image.copy().astype(np.uint8)
		if list(value) == list(np.array([0, 0, 0])):
			segmentation_image[key] = np.zeros((image.shape[0], image.shape[1], image.shape[2]))
		
		# Define the color you're looking for
		pattern = np.array(value).astype(np.uint8)
		
		# Make a mask to use with where
		mask = (image_copy == pattern).all(axis=2)
		newshape = mask.shape + (1,)
		mask = mask.reshape(newshape)
		image_copy = np.where(mask, [255, 255, 255], [0, 0, 0])
		image_copy = image_copy.astype(np.uint8)
		if key == 'red_legend' or key == 'black_text':
			image_copy = remove_noise_pixels(image_copy, threshold=15)
		else:
			image_copy = cv2.bitwise_not(image_copy)
			image_copy = remove_noise_pixels(image_copy, threshold=15)
		segmentation_image[key] = image_copy
		
	fig = plt.figure(figsize=(25, 25))
	for index, (layer, seg_image) in enumerate(segmentation_image.items()):
		ax = fig.add_subplot(2, 2, index + 1)
		ax.imshow(seg_image)
		ax.axis('off')
		ax.set_title(' %s _layer' % layer)
	plt.show()
	
	for nl in range(len(segmentation_image)):
		current_dir = os.getcwd()
		file_name = path.split('/')[-3]
		image_quantization_result_dir = str(Path(current_dir).parent) + '/image_generator/' + file_name + \
										'/color_quantization_result_batches/' + str(nl) + '_layer/'

		if not os.path.exists(image_quantization_result_dir):
			os.makedirs(image_quantization_result_dir)
			print("Directory ", image_quantization_result_dir, " Created ")
		else:
			print("Directory ", image_quantization_result_dir, " already exists")

		save_path = image_quantization_result_dir + image_file_name + '.p'

		# Save image into a pickle file to save memory
		with open(save_path, 'wb') as handle:
			pickle.dump(segmentation_image[list(segmentation_image.keys())[nl]],
						handle, protocol=pickle.HIGHEST_PROTOCOL)
Example #36
def plot_results(sc=0,
                 sr=0,
                 sv=1,
                 sv2=0,
                 srs=0,
                 mode=[],
                 w_positions=False,
                 scale=False,
                 algo_params=[],
                 rng=10,
                 cinds=[0, 2, 3, 4, 7, 8, 9, 10],
                 return_vs=0,
                 return_bars=0,
                 dinds=[0, 1, 2, 3, 4],
                 save=False,
                 title='misc',
                 red=[],
                 out=True,
                 return_AIC=False,
                 fake=True,
                 **kwargs):
    '''
    sc: bool
    show clusters plot
    sr: bool
    show reconstruction plot
    sv: bool
    show vscores
    mode: string
    [] = pca, 'all'= all, 'de' = dual energy,'se' = integrating detector 
    W_position: bool
    Add positions to the vectors
    scale: bool
    Rescale the input, this is important if you add the position data
    default_base: dict
    Parameters for the clustering algorithms
    rng: int
    seed
    cinds: list
    which clustering methods to try
    red: string
    Which dimensional reduction to use: 'ica', 'nmf' or 'tsne'
    '''

    np.random.seed(rng)

    # ============
    # Set up cluster parameters
    # ============
    if sc:
        plt.figure(1, figsize=(9 * 2 + 3, 12.5))
        plt.subplots_adjust(left=.02,
                            right=.98,
                            bottom=.001,
                            top=.96,
                            wspace=.05,
                            hspace=.01)
    if sr:
        plt.figure(2, figsize=(9 * 2 + 3, 12.5))
        plt.subplots_adjust(left=.02,
                            right=.98,
                            bottom=.001,
                            top=.96,
                            wspace=.05,
                            hspace=.01)
    if srs:
        plt.figure(3, figsize=(9 * 2 + 3, 12.5))
        plt.subplots_adjust(left=.02,
                            right=.98,
                            bottom=.001,
                            top=.96,
                            wspace=.05,
                            hspace=.01)

    plot_num = 1

    # This dictionary defines the colormap
    cdict = {
        'red': (
            (0.0, 0.0, 0.0),  # no red at 0
            (0.5, 1.0, 1.0),  # all channels set to 1.0 at 0.5 to create white
            (1.0, 0.8, 0.8)),  # set to 0.8 so its not too bright at 1
        'green': (
            (0.0, 0.8, 0.8),  # set to 0.8 so its not too bright at 0
            (0.5, 1.0, 1.0),  # all channels set to 1.0 at 0.5 to create white
            (1.0, 0.0, 0.0)),  # no green at 1
        'blue': (
            (0.0, 0.0, 0.0),  # no blue at 0
            (0.5, 1.0, 1.0),  # all channels set to 1.0 at 0.5 to create white
            (1.0, 0.0, 0.0))  # no blue at 1
    }

    # Create the colormap using the dictionary
    P = color.LinearSegmentedColormap('GnRd', cdict)

    homo, comp, vs, idata, ialgo = [], [], [], [], []

    if fake:
        data = (('glass', 'Glass'), ('pp', 'Poly'), ('bb', 'Bluebelt'),
                ('ptfe', 'PTFE'), ('steel', 'Steel'))
        datasets2 = [data[j] for j in dinds]
    else:
        datasets2 = (('chick_glass', 'Glass'), ('chick_bluebelt', 'Bluebelt'))

    default_base = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -60,
        'n_neighbors': 2,
        'n_clusters': 2,
        'linkage': 'ward',
        'affinity': "nearest_neighbors",
        'assign_labels': 'kmeans',
        'min_samples': 10,
        'ct': 'spherical',
        'branching': 19,
        'threshold': 0.0001,
        'metric': 'minkowski',
        'asc': False,
        'p': 2,
        'mcs': 10,
        'nc': 2
    }

    aic = []

    for i_dataset, (dataset, dat_name) in enumerate(datasets2):

        # update parameters with dataset-specific values
        params = default_base.copy()
        params.update(algo_params)

        #         if outlier_detect:
        #             X = loadmat('2'+dataset)['Z']

        if mode == 'de':
            X = loadmat('all' + dataset)['Z'][:, 1:]
            X = np.column_stack((np.sum(X[:, 1:3], 1), np.sum(X[:, 4:], 1)))
        elif mode == 'se':
            X = loadmat('all' + dataset)['Z'][:, 0]
            X = np.reshape(X, (20 * 68, 1), order="F")
        elif mode == 'all':
            X = loadmat('all' + dataset)['Z'][:, 1:]
        elif mode == 'small':
            X = loadmat('small_' + dataset)['Z']
        elif mode == 'gauss':
            X = loadmat('all' + dataset)['Z'][:, 1:]
            for jj in range(0, 5):
                r2 = np.reshape(X[:, jj], (20, 68), order="F")
                X[:, jj] = np.reshape(gaussian_filter1d(r2, sigma=1),
                                      20 * 68,
                                      order="F")
            X = PCA(n_components=2).fit_transform(X)
        else:
            X = loadmat('2' + dataset)['Z']

        label_true = loadmat('2' + dataset + '_mask')['BW']

        if not fake:
            X = X[400:, :].copy()
            label_true = label_true[400:].copy()

        if w_positions:
            xx, yy = np.meshgrid(range(68), range(20))
            x = np.reshape(xx, (20 * 68, 1), order="F")
            y = np.reshape(yy, (20 * 68, 1), order="F")
            X = np.concatenate((X, x, y), axis=1)

        if scale:
            X = StandardScaler().fit_transform(X)

        if red == 'ica':
            X = FastICA(n_components=params['nc'],
                        whiten=True).fit_transform(X)
        if red == 'icapca':
            X = FastICA(n_components=5).fit_transform(X)
            X = PCA(n_components=params['nc']).fit_transform(X)
        if red == 'tsne':
            X = TSNE(n_components=params['nc']).fit_transform(X)
        if red == 'nmf':
            X = NMF(n_components=params['nc']).fit_transform(X)

        if red == 'pca':
            X = PCA(n_components=params['nc']).fit_transform(X)
        if red == 'spec':
            X = SpectralEmbedding(n_components=3).fit_transform(X)

        # estimate bandwidth for mean shift
#         if mode == 'se':
#             bandwidth = None
#         else:
        bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile'])

        #         if mode != 'se':
        # connectivity matrix for structured Ward
        connectivity = kneighbors_graph(X,
                                        n_neighbors=params['n_neighbors'],
                                        include_self=False)
        connectivity = 0.5 * (connectivity + connectivity.T)
        #         else:
        #             connectivity = None

        # ============
        # Create cluster objects
        # ============
        ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
        two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
        ward = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'],
                                               linkage=params['linkage'],
                                               connectivity=connectivity)
        spectral = cluster.SpectralClustering(
            n_clusters=params['n_clusters'],
            eigen_solver='arpack',
            affinity=params['affinity'],
            assign_labels=params['assign_labels'])
        dbscan = cluster.DBSCAN(eps=params['eps'],
                                min_samples=params['min_samples'],
                                metric=params['metric'],
                                p=params['p'])
        affinity_propagation = cluster.AffinityPropagation(
            damping=params['damping'], preference=params['preference'])
        average_linkage = cluster.AgglomerativeClustering(
            linkage="complete",
            affinity="cityblock",
            n_clusters=params['n_clusters'],
            connectivity=connectivity)
        birch = cluster.Birch(n_clusters=params['n_clusters'],
                              branching_factor=params['branching'],
                              threshold=params['threshold'])
        gmm = mixture.GaussianMixture(n_components=params['n_clusters'],
                                      covariance_type=params['ct'])
        bgmm = mixture.BayesianGaussianMixture(
            n_components=params['n_clusters'], covariance_type=params['ct'])
        hdb = hdbscan.HDBSCAN(min_samples=params['min_samples'],
                              min_cluster_size=params['mcs'],
                              metric=params['metric'],
                              allow_single_cluster=params['asc'],
                              p=params['p'],
                              **kwargs)

        cinds_all = (('KMeans', two_means), ('AffinityPropagation',
                                             affinity_propagation),
                     ('MeanShift', ms), ('SpectralClustering', spectral),
                     ('Ward', ward), ('AgglomerativeClustering',
                                      average_linkage), ('DBSCAN', dbscan),
                     ('Birch', birch), ('HDBSCAN', hdb),
                     ('GaussianMixture', gmm), ('BGaussianMixture', bgmm))

        clustering_algorithms = [cinds_all[j] for j in cinds]

        for i_algorithm, (name, algorithm) in enumerate(clustering_algorithms):

            t0 = time.time()

            # catch warnings related to kneighbors_graph
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore",
                    message="the number of connected components of the " +
                    "connectivity matrix is [0-9]{1,2}" +
                    " > 1. Completing it to avoid stopping the tree early.",
                    category=UserWarning)
                warnings.filterwarnings(
                    "ignore",
                    message="Graph is not fully connected, spectral embedding"
                    + " may not work as expected.",
                    category=UserWarning)
                if hasattr(algorithm, 'condensed_tree_'):
                    pass
                else:
                    algorithm.fit(X)

            t1 = time.time()
            if hasattr(algorithm, 'labels_'):
                y_pred = algorithm.labels_.astype(np.int)
            elif hasattr(algorithm, 'condensed_tree_'):
                y_pred = algorithm.fit_predict(X)
            else:
                y_pred = algorithm.predict(X)

            homo1, comp1, vs1 = homogeneity_completeness_v_measure(
                label_true.squeeze(), y_pred)

            if return_AIC:
                if hasattr(algorithm, 'aic'):
                    aic.append(algorithm.aic(X))

            if sc:
                plt.figure(1)

                plt.rcParams['axes.facecolor'] = P(1 - vs1, alpha=0.5)

                plt.subplot(len(datasets2), len(clustering_algorithms),
                            plot_num)
                if i_dataset == 0:
                    plt.title(name, size=18)
                if i_algorithm == 0:
                    plt.ylabel(dat_name, size=18)

                colors2 = np.array(
                    list(
                        islice(
                            cycle([
                                'r', 'b', '#4daf4a', '#f781bf', '#a65628',
                                '#984ea3', '#999999', '#e41a1c', '#dede00'
                            ]), int(max(y_pred) + 1))))

                # add black color for outliers (if any)
                colors2 = np.append(colors2, ["#000000"])
                #import ipdb; ipdb.set_trace()
                X2 = X[label_true.squeeze() == 0, :]
                plt.scatter(X2[:, 0],
                            X2[:, 1],
                            s=40,
                            color=colors2[y_pred[label_true.squeeze() == 0]])

                X2 = X[label_true.squeeze() == 1, :]
                plt.scatter(X2[:, 0],
                            X2[:, 1],
                            s=40,
                            color=colors2[y_pred[label_true.squeeze() == 1]],
                            marker='x')

                plt.xticks(())
                plt.yticks(())
                plt.text(.99,
                         .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
                         transform=plt.gca().transAxes,
                         size=15,
                         horizontalalignment='right')
                plt.text(.99,
                         .89, ('%.2f' % vs1).lstrip('0'),
                         transform=plt.gca().transAxes,
                         size=15,
                         horizontalalignment='right')


#             if sr | srs:
#                 aa = y_pred[1320]

#                 if aa != 0:
#                     y_pred[y_pred == y_pred.min()] = y_pred.max() + 1
#                     y_pred[y_pred == aa] = y_pred.min()
            if sr:
                plt.figure(2)
                if fake:
                    r = np.reshape(y_pred, (20, 48), order="F")
                else:
                    r = np.reshape(y_pred, (20, 48), order="F")
                plt.subplot(len(datasets2), len(clustering_algorithms),
                            plot_num)
                if i_dataset == 0:
                    plt.title(name, size=18)
                plt.imshow(r)
                plt.set_cmap('bwr')
                plt.xticks([])
                plt.yticks([])
                if i_algorithm == 0:
                    plt.ylabel(dat_name, size=18)
            if srs:
                plt.figure(3)
                r = np.reshape(y_pred, (20, 68), order="F")
                p = np.reshape(label_true, (20, 68), order="F")
                plt.subplot(len(datasets2), len(clustering_algorithms),
                            plot_num)
                # Bin all of the bins to one and zero

                r[r == r.min()] = 0
                r[r > r.min()] = 1

                if i_dataset == 0:
                    plt.title(name, size=18)
                plt.imshow(abs(p - r))
                plt.set_cmap('gray')
                plt.xticks([])
                plt.yticks([])
                if i_algorithm == 0:
                    plt.ylabel(dat_name, size=18)

            homo.append(homo1)
            comp.append(comp1)
            vs.append(vs1)

            idata.append(i_dataset)
            ialgo.append(i_algorithm)

            plot_num += 1

    if save:
        plt.figure(1)
        plt.savefig('scatter_' + title + '.png')
        plt.figure(2)
        plt.savefig('recon_' + title + '.png')

    plt.show()

    if sv:
        plt.rcParams['axes.facecolor'] = (0, 0, 0)
        vs = np.asarray(vs)
        bars = []
        n_components_range = range(len(clustering_algorithms))
        cv_types = [item[1] for item in datasets2]

        color_iter = cycle(
            ['navy', 'turquoise', 'cornflowerblue', 'darkorange', 'k'])
        # Plot the BIC scores
        plt.figure(figsize=(8, 6))
        spl = plt.subplot(1, 1, 1)

        for i, (cv_type, pcolor) in enumerate(zip(cv_types, color_iter)):
            xpos = np.array(n_components_range) + .166 * (i - 2)
            bars.append(
                plt.bar(xpos,
                        vs[i * len(n_components_range):(i + 1) *
                           len(n_components_range)],
                        width=.166,
                        color=pcolor))
        plt.xticks(n_components_range,
                   [item[0] for item in clustering_algorithms])
        plt.xticks()
        plt.ylim([0, 1])
        plt.title('V score per model')
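        # mark the highest V-score with an asterisk; its x position is derived
        # from the flat argmax index over all (dataset, algorithm) scores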
        xpos = np.mod(vs.argmax(), len(n_components_range)) + .65 +\
            .16 * np.floor(vs.argmax() / len(n_components_range))
        plt.text(xpos, vs.min() * 0.97 + .03 * vs.max(), '*', fontsize=14)
        spl.set_xlabel('Algorithm')
        spl.legend([b[0] for b in bars], cv_types)
        plt.tight_layout()

    if sv2:

        vs = np.asarray(vs)
        bars = []
        colors = []
        n_components_range = range(len(clustering_algorithms))
        cv_types = [item[1] for item in datasets2]

        color_iter = cycle(
            ['navy', 'turquoise', 'cornflowerblue', 'darkorange', 'k'])
        # Plot the BIC scores
        if out:
            plt.figure(figsize=(8, 6))
            plt.rcParams['axes.facecolor'] = (1, 1, 1)
            spl = plt.subplot(1, 1, 1)

        for i, (cv_type, pcolor) in enumerate(zip(cv_types, color_iter)):
            xpos = np.array(n_components_range) + 0.166 * (i - 2)
            bars.append(vs[i * len(n_components_range):(i + 1) *
                           len(n_components_range)])

        # import ipdb; ipdb.set_trace()
        colors = [colourblind(col) for col in n_components_range]

        bars2 = np.mean(np.asarray(bars), axis=0)
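        # bars2 holds each algorithm's V-measure averaged over all datasets; sorting it
        # (and reordering clustering_algorithms to match) ranks the algorithms for plotting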

        indices = np.argsort(bars2)
        bars2.sort()
        clustering_algorithms = [clustering_algorithms[i] for i in indices]

        if out:
            plt.bar(n_components_range, bars2, color=colors)
            plt.xticks(n_components_range,
                       [item[0] for item in clustering_algorithms],
                       rotation=45,
                       ha="right")
            plt.xticks()
            plt.ylim([0, 1])
            plt.title('V Averaged over all Materials')
            xpos = np.mod(vs.argmax(), len(n_components_range)) + .65 +\
                .16 * np.floor(vs.argmax() / len(n_components_range))
            plt.text(xpos, vs.min() * 0.97 + .03 * vs.max(), '*', fontsize=14)
            spl.set_xlabel('Algorithm')
            spl.set_ylabel('V-score')
            #spl.legend([b[0] for b in bars], cv_types)
            plt.tight_layout()

        if return_bars:
            return bars
    if return_vs:
        return vs
    if return_AIC:
        return aic
def main():
    print("# Start Spectral Clustering")

    print("# Load Fashion MNIST dataset")

    np.random.seed(0)

    # ============
    # Generate datasets. We choose the size big enough to see the scalability
    # of the algorithms, but not too big to avoid too long running times
    # ============
    n_samples = 1500
    noisy_circles = datasets.make_circles(n_samples=n_samples,
                                          factor=.5,
                                          noise=.05)
    noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05)
    blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
    no_structure = np.random.rand(n_samples, 2), None

    # Anisotropicly distributed data
    random_state = 170
    X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
    transformation = [[0.6, -0.6], [-0.4, 0.8]]
    X_aniso = np.dot(X, transformation)
    aniso = (X_aniso, y)

    # blobs with varied variances
    varied = datasets.make_blobs(n_samples=n_samples,
                                 cluster_std=[1.0, 2.5, 0.5],
                                 random_state=random_state)

    # ============
    # Set up cluster parameters
    # ============
    plt.figure(figsize=(9 * 2 + 3, 12.5))
    plt.subplots_adjust(left=.02,
                        right=.98,
                        bottom=.001,
                        top=.96,
                        wspace=.05,
                        hspace=.01)

    plot_num = 1

    default_base = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3
    }

    _datasets = [(noisy_circles, {
        'damping': .77,
        'preference': -240,
        'quantile': .2,
        'n_clusters': 2
    }), (noisy_moons, {
        'damping': .75,
        'preference': -220,
        'n_clusters': 2
    }), (varied, {
        'eps': .18,
        'n_neighbors': 2
    }), (aniso, {
        'eps': .15,
        'n_neighbors': 2
    }), (blobs, {}), (no_structure, {})]

    for i_dataset, (dataset, algo_params) in enumerate(_datasets):
        # update parameters with dataset-specific values
        params = default_base.copy()
        params.update(algo_params)

        X, y = dataset

        # normalize dataset for easier parameter selection
        X = StandardScaler().fit_transform(X)

        # estimate bandwidth for mean shift
        bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile'])

        # connectivity matrix for structured Ward
        connectivity = kneighbors_graph(X,
                                        n_neighbors=params['n_neighbors'],
                                        include_self=False)
        # make connectivity symmetric
        connectivity = 0.5 * (connectivity + connectivity.T)

        # ============
        # Create cluster objects
        # ============
        ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
        two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
        ward = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'],
                                               linkage='ward',
                                               connectivity=connectivity)
        spectral = cluster.SpectralClustering(n_clusters=params['n_clusters'],
                                              eigen_solver='arpack',
                                              affinity="nearest_neighbors")
        dbscan = cluster.DBSCAN(eps=params['eps'])
        affinity_propagation = cluster.AffinityPropagation(
            damping=params['damping'], preference=params['preference'])
        average_linkage = cluster.AgglomerativeClustering(
            linkage="average",
            affinity="cityblock",
            n_clusters=params['n_clusters'],
            connectivity=connectivity)
        birch = cluster.Birch(n_clusters=params['n_clusters'])
        gmm = mixture.GaussianMixture(n_components=params['n_clusters'],
                                      covariance_type='full')

        clustering_algorithms = (('MiniBatchKMeans', two_means),
                                 ('AffinityPropagation', affinity_propagation),
                                 ('MeanShift', ms), ('SpectralClustering',
                                                     spectral), ('Ward', ward),
                                 ('AgglomerativeClustering',
                                  average_linkage), ('DBSCAN', dbscan),
                                 ('Birch', birch), ('GaussianMixture', gmm))

        for name, algorithm in clustering_algorithms:
            t0 = time.time()

            # catch warnings related to kneighbors_graph
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore",
                    message="the number of connected components of the " +
                    "connectivity matrix is [0-9]{1,2}" +
                    " > 1. Completing it to avoid stopping the tree early.",
                    category=UserWarning)
                warnings.filterwarnings(
                    "ignore",
                    message="Graph is not fully connected, spectral embedding"
                    + " may not work as expected.",
                    category=UserWarning)
                algorithm.fit(X)

            t1 = time.time()
            if hasattr(algorithm, 'labels_'):
                y_pred = algorithm.labels_.astype(int)
            else:
                y_pred = algorithm.predict(X)

            plt.subplot(len(_datasets), len(clustering_algorithms), plot_num)
            if i_dataset == 0:
                plt.title(name, size=18)

            colors = np.array(
                list(
                    islice(
                        cycle([
                            '#377eb8', '#ff7f00', '#4daf4a', '#f781bf',
                            '#a65628', '#984ea3', '#999999', '#e41a1c',
                            '#dede00'
                        ]), int(max(y_pred) + 1))))
            # add black color for outliers (if any)
            colors = np.append(colors, ["#000000"])
            plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])

            plt.xlim(-2.5, 2.5)
            plt.ylim(-2.5, 2.5)
            plt.xticks(())
            plt.yticks(())
            plt.text(.99,
                     .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
                     transform=plt.gca().transAxes,
                     size=15,
                     horizontalalignment='right')
            plot_num += 1

    #plt.show()
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    plt.savefig(os.path.join(save_dir, "clustering_methods_result.png"))
plt.subplots_adjust(top=.96, wspace=.05, hspace=.05)

plot_num = 1

count = 0

datasets = [actors, producers, writers, directors]
for i_dataset, dataset in enumerate(datasets):
    X, y = dataset
    # normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(X)

    # estimate bandwidth for mean shift
    bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)

    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)

    # create clustering estimators
    ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    two_means = cluster.MiniBatchKMeans(n_clusters=2)
    ward = cluster.AgglomerativeClustering(n_clusters=2,
                                           linkage='ward',
                                           connectivity=connectivity)
    spectral = cluster.SpectralClustering(n_clusters=2,
                                          eigen_solver='arpack',
                                          affinity="nearest_neighbors")
def do_clustering(csps_data):
    global db_time, feature_set, global_args, themes
    # n_clusters = cluster_options['n_clusters']
    # id measure response  year organisation group  score
    # print(csps_data.head())

    # First get data fram into the right shape
    if (feature_set == 'demographics'):
        #print(csps_data.head())
        #csps_data = csps_data.set_index(['organisation', 'org', 'year'])
        csps_data = pd.pivot_table(csps_data,
                                   values='score',
                                   index=[
                                       'organisation', 'year', 'headcount',
                                       'org', 'gender_offset',
                                       'ethnic_percentage',
                                       'disabled_percentage', 'headcount_delta'
                                   ],
                                   columns=['measure'],
                                   aggfunc=np.sum)
        #csps_data = csps_data.dropna(axis=1)
        #print(csps_data.head())
        #'organisation', 'year', 'headcount', 'org', 'gender_offset',  'ethnic_percentage', 'disabled_percentage', 'headcount_delta', 'measure', 'score'
    else:
        # csps_data2 = csps_data.pivot(index='org', columns='measure', values='score')
        csps_data = pd.pivot_table(
            csps_data,
            values='score',
            index=['organisation', 'year', 'headcount', 'org', 'par'],
            columns=['measure'],
            aggfunc=np.sum)
        csps_data = csps_data.dropna(axis=1)
        #csps_data = pd.pivot_table(csps_data, values='score', index=['organisation', 'year', 'headcount', 'org', 'par'], columns=['measure'], aggfunc=np.sum, fill_value=-1)
        #csps_data = pd.pivot_table(csps_data, values='score', index=['organisation', 'year', 'org', 'par'], columns=['measure', 'headcount'], aggfunc=np.sum)

    #print(feature_set)
    #print(csps_data2.head())
    #print('*' * 50)
    #print(csps_data2.head())

    # The EEI column is needed later but must be excluded from the question and
    # demographic features, so keep a copy here and drop it from the frame below

    eei = csps_data['EEI']
    #print(eei.tolist())

    #     if (feature_set != 'zzzzthemes'):
    #         csps_data = csps_data.drop('EEI', 1)

    #print(csps_data.head())
    #print(list(csps_data.columns.values))
    column_names = csps_data.columns.values.tolist()

    inx = intersect(column_names, themes)
    #     print(column_names)
    #     print(themes)
    #     print(inx)

    df_parts = []
    #df_parts.append(csps_data['EEI'])

    ######################################################################################
    # TODO - need to store the columns that will be dropped before removing them for clustering
    ######################################################################################
    for m in inx:
        df_parts.append(csps_data[m])

    csps_data = csps_data.drop('EEI', 1)
    # don't drop theme columns from theme feature set!
    # also demographics feature doesn't contain themes anyway
    # so just do it for 'questions', 'ew_questions', NOT 'themes', 'demographics'
    if ((feature_set == 'questions') or (feature_set == 'ew_questions')):
        csps_data = csps_data.drop(inx, 1)

    #print( '*' * 80 )
    #print(df_parts)
    #print( '*' * 80 )
    #print(csps_data.head())

    # The data should always be a 2D array, shape (n_samples, n_features)
    # print(csps_data.head())

    # To get the boolean mask where values are nan
    # cpvnm: CSPS data, pivoted, null mask
    # csps_data = pd.isnull(csps_data)
    # print(csps_data.head())
    '''
    if (feature_set == 'themes'):
        
        dist_test1 = csps_data['EEI'].tolist()
        dist_test2 = csps_data['MW'].tolist()
#         print(dist_test.head())
#         dist_test.reset_index(True)
#         print(dist_test.head())
        print(dist_test1)
        print(dist_test2)
        zz = zip(dist_test1, dist_test2)
        
        print(map(list, zz))
        
        from sklearn.metrics.pairwise import euclidean_distances
        X_pairs = [[0, 1], [1, 1]]
        # distance between rows of X
        print(euclidean_distances(dist_test1, dist_test2))
#         print(euclidean_distances(X_pairs, X_pairs))
        # array([[ 0.,  1.], [ 1.,  0.]])
        # get distance to origin
#         print(euclidean_distances(X_pairs, [[0, 0]]))
        # array([[ 1.        ], [ 1.41421356]])
    '''

    #print(csps_data.columns)

    # Filling missing data: CSPS data, pivoted, no-null
    # csps_data = csps_data.fillna(value=0)
    # print(csps_data.head())

    #'KMeans', 'MiniBatchKMeans', 'AffinityPropagation', 'MeanShift', 'SpectralClustering', 'AgglomerativeClustering', 'DBSCAN', 'Birch'

    # normalize dataset for easier parameter selection
    try:
        X = StandardScaler().fit_transform(csps_data)
    except Exception as exc:
        print("ERROR: could not standardize the pivoted data:", exc)
        print(csps_data)

    start_cluster_time = timer()
    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)
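    # choose the clustering estimator according to `algorithm` (set elsewhere in the
    # script); anything unrecognised falls back to plain KMeans below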

    if (algorithm == 'KMeans'):
        clustered = cluster.KMeans(n_clusters=cluster_options['n_clusters'])

    elif (algorithm == 'MiniBatchKMeans'):
        clustered = cluster.MiniBatchKMeans(
            n_clusters=cluster_options['n_clusters'])

    elif (algorithm == 'AffinityPropagation'):
        clustered = cluster.AffinityPropagation()

    elif (algorithm == 'MeanShift'):
        bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)
        clustered = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)

    elif (algorithm == 'SpectralClustering'):
        clustered = cluster.SpectralClustering(
            n_clusters=cluster_options['n_clusters'],
            eigen_solver='arpack',
            affinity="nearest_neighbors")

    elif (algorithm == 'AffinityPropagation'):
        clustered = cluster.AffinityPropagation(damping=.9, preference=-200)

    elif (algorithm == 'AgglomerativeClustering'):
        clustered = cluster.AgglomerativeClustering(
            linkage='ward',
            n_clusters=cluster_options['n_clusters'],
            connectivity=connectivity)

    elif (algorithm == 'AC_average_linkage'):
        clustered = cluster.AgglomerativeClustering(
            linkage="average",
            affinity="cityblock",
            n_clusters=cluster_options['n_clusters'],
            connectivity=connectivity)

    elif (algorithm == 'DBSCAN'):
        clustered = cluster.DBSCAN(eps=.5, algorithm='auto', leaf_size=40)

    elif (algorithm == 'Birch'):
        clustered = cluster.Birch(n_clusters=cluster_options['n_clusters'])

    else:
        clustered = cluster.KMeans(n_clusters=cluster_options['n_clusters'])

    clustered.fit(X)

    if (algorithm == 'MeanShift' or algorithm == 'DBSCAN'):
        silhouette_score = -1
    else:
        silhouette_score = metrics.silhouette_score(X,
                                                    clustered.labels_,
                                                    metric='euclidean')


#     neigh = NearestNeighbors(2, 0.4)
#     neigh.fit(X)
#     NearestNeighbors(algorithm='auto', leaf_size=30)
#     nbrs = neigh.radius_neighbors([[0, 0, 1.3]], 0.4, return_distance=False)
#     rng = neigh.radius_neighbors([X[1]])
#     print('NearestNeighbors')
#     print(X.shape[1])
#     print(np.asarray(rng[0][0]))

    end_cluster_time = timer()
    # this works, but isn't useful any more
    # csps_data['cluster_id'] = clustered.labels_
    if (feature_set != 'demographics'):
        #         org_year = zip(*csps_data.index.values)  #['organisation', 'year', 'org', 'par']
        #         orgs = pd.Series(org_year[0])
        #         years = pd.Series(org_year[1])
        #         org_acronym = pd.Series(org_year[2])
        #         par_acronym = pd.Series(org_year[3])
        #         clusters = pd.Series(clustered.labels_.tolist())
        # index levels: ['organisation', 'year', 'headcount', 'org', 'par']
        org_year = list(zip(*csps_data.index.values))
        orgs = pd.Series(org_year[0])
        years = pd.Series(org_year[1])
        headcount = pd.Series(org_year[2])
        org_acronym = pd.Series(org_year[3])
        par_acronym = pd.Series(org_year[4])
        clusters = pd.Series(clustered.labels_.tolist())
    else:
        # index levels: ['organisation', 'org', 'year']
        org_year = list(zip(*csps_data.index.values))
        orgs = pd.Series(org_year[0])
        years = pd.Series(org_year[2])
        org_acronym = pd.Series(org_year[1])
        clusters = pd.Series(clustered.labels_.tolist())

    org_year.append(clustered.labels_.tolist())

    #1 - organisation
    df = pd.DataFrame(orgs)  #, 'organisation'
    df.columns = ['organisation']

    #2 - year
    df['year'] = years

    #3 - headcount
    if (feature_set != 'demographics'):
        df['headcount'] = headcount
    else:
        df['headcount'] = np.array([0] * len(df))  #csps_data['headcount']

    #4 - cluster id
    df['cluster'] = clusters

    #5 - acronym
    df['org'] = org_acronym

    #6 - parent
    if (feature_set != 'demographics'):
        df['parent'] = par_acronym
    else:
        df['parent'] = np.array(['x'] * len(df))

    #7 - EEI
    df['EEI'] = eei.tolist()
    #     if (feature_set != 'themes'):
    #         df['EEI'] = eei.tolist()
    #     else:
    #         df['EEI'] = csps_data['EEI']

    for i, m in enumerate(inx):
        df[m] = df_parts[i].tolist()

    #category_labels = ['EEI', 'headcount', 'year']
    #category_labels.extend(inx)
    category_labels = df.columns.values.tolist()

    # descriptive statistics for each cluster
    #df[df.A > 0]
    #df.groupby('cluster')
    #cluster_info = df.groupby(['cluster']).get_group(1)
    #grouped = df(['EEI', 'headcount', 'cluster']).groupby('cluster')
    grouped = df.groupby('cluster')
    cluster_info = grouped.describe().fillna('missing')
    #     for name, group in grouped:
    #         print(name)
    #         print(group)

    #df = df.sort_values(by='cluster')

    # use describe to show quick summary statistics of the data
    #df.describe();
    end_time = timer()
    cluster_time = (end_cluster_time - start_cluster_time)
    total_time = (end_time - start_time)

    if (algorithm == 'AffinityPropagation'):
        other_output = json.dumps([{
            'silhouette_score': silhouette_score,
            'db_time': db_time,
            'cluster_time': cluster_time,
            'total_time': total_time,
            'feature_set': feature_set,
            'cluster_info': cluster_info.values.tolist(),
            'category_labels': category_labels
        },
                                   clustered.cluster_centers_indices_.tolist()
                                   ])

    output = json.dumps([{
        'silhouette_score': silhouette_score,
        'db_time': db_time,
        'cluster_time': cluster_time,
        'total_time': total_time,
        'feature_set': feature_set,
        'cluster_info': cluster_info.values.tolist(),
        'category_labels': category_labels
    },
                         df.values.tolist()])
    #     output = json.dumps(other_output)

    return output
Example #40
def do():
    ai = AI()
    ai.load()
    # ai.learn()
    params = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3
    }
    bandwidth = cluster.estimate_bandwidth(ai.x, quantile=params['quantile'])
    connectivity = kneighbors_graph(ai.x,
                                    n_neighbors=params['n_neighbors'],
                                    include_self=False)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)
    ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
    ward = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'],
                                           linkage='ward',
                                           connectivity=connectivity)
    spectral = cluster.SpectralClustering(n_clusters=params['n_clusters'],
                                          eigen_solver='arpack',
                                          affinity="nearest_neighbors")
    dbscan = cluster.DBSCAN(eps=params['eps'])
    affinity_propagation = cluster.AffinityPropagation(
        damping=params['damping'], preference=params['preference'])
    average_linkage = cluster.AgglomerativeClustering(
        linkage="average",
        affinity="cityblock",
        n_clusters=params['n_clusters'],
        connectivity=connectivity)
    birch = cluster.Birch(n_clusters=params['n_clusters'])
    gmm = mixture.GaussianMixture(n_components=params['n_clusters'],
                                  covariance_type='full')
    clustering_algorithms = (('MiniBatchKMeans', two_means),
                             ('AffinityPropagation', affinity_propagation),
                             ('MeanShift', ms), ('SpectralClustering',
                                                 spectral), ('Ward', ward),
                             ('AgglomerativeClustering',
                              average_linkage), ('DBSCAN', dbscan),
                             ('Birch', birch), ('GaussianMixture', gmm))

    for name, algorithm in clustering_algorithms:
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore",
                message="the number of connected components of the " +
                "connectivity matrix is [0-9]{1,2}" +
                " > 1. Completing it to avoid stopping the tree early.",
                category=UserWarning)
            warnings.filterwarnings(
                "ignore",
                message="Graph is not fully connected, spectral embedding" +
                " may not work as expected.",
                category=UserWarning)
            try:
                algorithm.fit(ai.x)
            except:
                continue

        if hasattr(algorithm, 'labels_'):
            y_pred = algorithm.labels_.astype(int)
        else:
            y_pred = algorithm.predict(ai.x)
        if max(y_pred) > 3:
            continue
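        # cross-tabulate the known groups (ai.y) against the predicted clusters by
        # counting how many samples each (known, guessed) pair has in common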
        known_groups = {}
        for i, group in enumerate(ai.y):
            group = int(group)
            if group not in known_groups:
                known_groups[group] = []
            known_groups[group].append(i)
        guessed_groups = {}
        for i, group in enumerate(y_pred):
            if group not in guessed_groups:
                guessed_groups[group] = []
            guessed_groups[group].append(i)
        for k in known_groups:
            for g in guessed_groups:
                print(
                    k, g,
                    len(set(known_groups[k]).intersection(guessed_groups[g])))
Example #41
    def clusters(self, G):
        '''
        Finds the clusters
        '''
        # imports from the machine learning package scikit-learn
        from sklearn.cluster import MeanShift, estimate_bandwidth

        # creates an array with the 2D coordinates of each node
        X = [[node[0], node[1]] for node in G.nodes()]
        # estimates the typical size of a single cluster
        bandwidth = estimate_bandwidth(X,
                                       quantile=0.1,
                                       random_state=0,
                                       n_jobs=1)
        # find clusters
        ms = MeanShift(bandwidth=bandwidth)
        ms.fit(X)

        # labels is an array indicating, for each node, the cluster number
        labels = {node: ms.labels_[i] for i, node in enumerate(G.nodes())}

        # --- ADDUCTION ---
        '''
        # add cluster centers to the graph
        for node in cluster_centers:
            attribute = {'label': 'water tower', 'pos': node}
            G.add_node(node, attribute)
            attribute = {'type' : 'sink'}
            self.sinksource_graph.add_node(node, attribute)
        '''
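        # adduction network: connect the cluster centres to one another, first as a
        # complete graph and then reduced to a distance-weighted mesh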
        adduction = nx.Graph()
        cluster_centers = [(node[0], node[1]) for node in ms.cluster_centers_]
        for node in cluster_centers:
            adduction.add_node(node)
        self.complete_graph(adduction)
        adduction = self.mesh_graph(adduction, weight='dist')

        print(len(adduction.edges()))

        nx.draw_networkx(adduction)
        # coord = {elem[0]: [elem[0][0], elem[0][1]] for elem in adduction.nodes(data=True)}
        # nx.draw_networkx(adduction, pos=coord, label=False)
        self.write2shp(adduction, "adduction_network")
        self.acqueduct.add_edges_from(adduction.edges())

        # --- DISTRIBUTION ---
        # add label info to the graph
        nx.set_node_attributes(G, labels, 'label')
        # initialize distribution graphs
        distribution = [nx.Graph() for cluster in cluster_centers]
        for node in labels:
            cluster = labels[node]
            distribution[cluster].add_node(node)
        '''
        # connect each node with his the cluster center
        node_list = []
        for index, node in enumerate(G):
            node_list.append(node)
            labels = nx.get_node_attributes(G, 'label')
            label = labels[node]
            if label is not 'water tower':
                G.add_edge(node, cluster_centers[label])
        '''
        for dist_graph in distribution:
            self.complete_graph(dist_graph)
            dist_graph = nx.minimum_spanning_tree(dist_graph, weight='dist')
            self.acqueduct.add_edges_from(dist_graph.edges())
Example #42
# coding=utf-8
import numpy as np
import sklearn.cluster as sc
import matplotlib.pyplot as mp

x = np.loadtxt("../ml_data/multiple3.txt", delimiter=",")

bw = sc.estimate_bandwidth(x, n_samples=len(x), quantile=0.2)
model = sc.MeanShift(bandwidth=bw, bin_seeding=True)
model.fit(x)
pred_y = model.predict(x)
# get the cluster centers
centers = model.cluster_centers_
print(centers)
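# build a dense n x n grid covering the data range and predict a cluster for every
# grid point, so the cluster regions can later be drawn as a background image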
n = 500
l, r = x[:, 0].min() - 1, x[:, 0].max() + 1
t, d = x[:, 1].min() - 1, x[:, 1].max() + 1

grid_x, grid_y = np.meshgrid(np.linspace(l, r, n), np.linspace(t, d, n))

grid_xy = np.column_stack((grid_x.ravel(), grid_y.ravel()))

grid_z = model.predict(grid_xy)

# grid_z=np.vstack((grid_x.flatten(),grid_y.flatten()))
#
# grid_z=model.predict(grid_z.T)

grid_z = grid_z.reshape(grid_x.shape)

mp.figure("Kmeans", facecolor="lightgray")
    print(data_frame.columns.values.tolist())  # ['V1', 'V2', 'labels']
    data = data_frame.values
    X = data[:, :2]
    y = data[:, 2]
    # shuffle
    shuffle_indexes = np.random.permutation(len(X))
    X, y = X[shuffle_indexes], y[shuffle_indexes]
    return X, y


if __name__ == '__main__':
    X, y = load_data()
    print(X.shape, y.shape)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

    mean_shift = MeanShift(bandwidth=estimate_bandwidth(X))
    mean_shift.fit(X_train)

    y_predict = mean_shift.predict(X_test)
    # keep label 1 unchanged; swap labels 0 and 2
    y_predict[y_predict == 1] = 1
    y_predict[y_predict == 0] = 3
    y_predict[y_predict == 2] = 0
    y_predict[y_predict == 3] = 2

    score = accuracy_score(y_true=y_test, y_pred=y_predict)
    print(score)  # 0.9966666666666667

    pass
Example #44
def color_quantization(path, exe_median_cut=True, plot=True):
	# Read image by cv2
	image = cv2.imread(path)
	image_file_name = os.path.basename(path).split('.')[0]
	
	# Convert the color space from BGR to HLS
	image = cv2.cvtColor(image, cv2.COLOR_BGR2HLS)
	image = np.array(image)  # Change image objects into array
	
	# Estimate the mean-shift bandwidth automatically from the flattened pixel data
	image_reshape = image.reshape((image.shape[0] * image.shape[1], image.shape[2]))
	bandwidth = estimate_bandwidth(image_reshape, quantile=0.2, n_samples=500)
	
	ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, n_jobs=10)
	ms.fit(image_reshape)
	labels = ms.predict(image_reshape)
	
	# Normalized color
	label_norm = deepcopy(labels)
	norm = colors.Normalize(vmin=-1., vmax=1.)
	norm.autoscale(label_norm)
	label_norm = norm(label_norm).tolist()

	cluster_centers = ms.cluster_centers_.astype(np.uint8)
	
	labels_unique = np.unique(labels)
	n_clusters_ = len(labels_unique)
	
	image_mean_shift = recreate_image(cluster_centers, labels, image.shape[0], image.shape[1]).astype(np.uint8)
	print("Reduce color through mean-shift: %d" % n_clusters_)
	
	h_o, s_o, v_o = cv2.split(image)
	
	# Normalize color space
	pixel_colors = image.reshape((np.shape(image)[0] * np.shape(image)[1], 3))
	norm = colors.Normalize(vmin=-1., vmax=1.)
	norm.autoscale(pixel_colors)
	pixel_colors = norm(pixel_colors).tolist()
	
	if plot:
		fig = plt.figure(figsize=(25, 25))
		
		ax = fig.add_subplot(2, 3, 1)
		ax.imshow(image)
		ax.axis('off')
		ax.set_title('Original image')
		
		ax = fig.add_subplot(2, 3, 2)
		ax.imshow(image_mean_shift)
		ax.axis('off')
		ax.set_title('Image after mean shift')
		
		ax = fig.add_subplot(2, 3, 4, projection="3d")
		ax.scatter(h_o.flatten(), s_o.flatten(), v_o.flatten(), facecolors=pixel_colors, marker=".")
		ax.set_xlabel("Hue")
		ax.set_ylabel("Saturation")
		ax.set_zlabel("Value")
		ax.set_title('Color space of original images')
		
		ax = fig.add_subplot(2, 3, 5, projection='3d')
		ax.scatter(h_o.flatten(), s_o.flatten(), v_o.flatten(), c=label_norm)
		ax.set_xlabel("Hue")
		ax.set_ylabel("Saturation")
		ax.set_zlabel("Value")
		ax.set_title('Color space after mean-shift')

	# Median cut
	# There is no need to do median cut if there are not too many colors
	if exe_median_cut:
		image_array = Image.fromarray(image_mean_shift.astype(np.uint8))
		image_median_cut_label = np.array(image_array.quantize(colors=10, method=0, kmeans=0, palette=None))
		
		# Calculate the cluster centers
		pixel_class = {index: [] for index in list(set(image_median_cut_label.flatten()))}
		image_median_cut = image_median_cut_label.flatten()
		image_mean_shift_shape = image_mean_shift.reshape(image_mean_shift.shape[0]*image_mean_shift.shape[1],
														  image_mean_shift.shape[2])
		
		for i, j in zip(image_median_cut, image_mean_shift_shape):
			pixel_class[i].append(j)
		
		clustered_center_median_cut = np.array(list({key: tuple(np.average(np.array(value), axis=0).astype(np.uint8))
													 for key, value in pixel_class.items()}.values()))
		
		image_median_cut = recreate_image(clustered_center_median_cut, image_median_cut_label.flatten(),
										  image_mean_shift.shape[0], image_mean_shift.shape[1]).astype(np.uint8)
		
		image_median_cut_h, image_median_cut_l, image_median_cut_s = cv2.split(image_median_cut)

		print("Reduce color through median cut: %d" % len(list(set(image_median_cut_label.flatten()))))
		image_mean_shift = image_median_cut
		
	# K-Means
	image_kmeans = image_mean_shift.reshape((image_mean_shift.shape[0] * image_mean_shift.shape[1], image_mean_shift.shape[2]))
	kmeans = KMeans(n_clusters=3, random_state=0, n_jobs=10).fit(image_kmeans)
	labels = kmeans.predict(image_kmeans)
	print("Reduce color through K-means: %d" % len(list(set(labels.flatten()))))
	
	cluster_centers = kmeans.cluster_centers_.astype(np.uint8)
	image_mean_shift = image_mean_shift.astype(np.uint8)
	k_means_image = recreate_image(cluster_centers, labels, image_mean_shift.shape[0], image_mean_shift.shape[1]).astype(np.uint8)
	h_km, s_km, v_km = cv2.split(k_means_image)

	if plot:
		ax = fig.add_subplot(2, 3, 3)
		ax.imshow(k_means_image)
		ax.axis('off')
		ax.set_title('Image after K-means')

		ax = fig.add_subplot(2, 3, 6, projection='3d')
		ax.scatter(h_km.flatten(), s_km.flatten(), v_km.flatten(), c=labels)
		ax.set_xlabel("Hue")
		ax.set_ylabel("Saturation")
		ax.set_zlabel("Value")

		fig.tight_layout()
		plt.show()
	
	segmentation_image = seperate_layers(k_means_image)
	
	for nl in range(1, len(segmentation_image)):
		current_dir = os.getcwd()
		file_name = path.split('/')[-3]
		image_quantization_result_dir = str(Path(current_dir).parent) + '/image_generator/' + file_name + \
										'/color_quantization_result_batches/' + str(nl) + '_layer/'
		
		if not os.path.exists(image_quantization_result_dir):
			os.makedirs(image_quantization_result_dir)
			print("Directory ", image_quantization_result_dir, " Created ")
		else:
			print("Directory ", image_quantization_result_dir, " already exists")
		
		save_path = image_quantization_result_dir + image_file_name + '.p'
		
		# Save the image to a pickle file to save memory
		with open(save_path, 'wb') as handle:
			pickle.dump(segmentation_image[list(segmentation_image.keys())[nl]], handle, protocol=pickle.HIGHEST_PROTOCOL)
	return segmentation_image
Example #45
# stopping criteria
criteria = (cv.TERM_CRITERIA_EPS+cv.TERM_CRITERIA_MAX_ITER,100,0.2)
fig, axs = plt.subplots(1,5,sharey=True,figsize=(10,8))
ctr,k = 0,2
for ax in axs:
    _, labels, (centers) = cv.kmeans(pixel_values, k, None,criteria, 10, cv.KMEANS_RANDOM_CENTERS)
    centers = np.uint8(centers)
    labels = labels.flatten()
    segmented_img = centers[labels.flatten()]
    segmented_img = segmented_img.reshape(img.shape)
    ax.imshow(segmented_img,interpolation='none')
    ax.set_title("Clusters = {}".format(k), fontsize=8)
    ctr+=1
    k+=2
plt.show()
#%% Mean Shift Clustering Based Segmentation
originImg = cv.imread('SunnyLake.bmp')
originShape = originImg.shape   
flatImg=np.reshape(originImg, [-1, 3])    
bandwidth = estimate_bandwidth(flatImg, quantile=0.1, n_samples=100)
ms = MeanShift(bandwidth = bandwidth, bin_seeding=True) 
ms.fit(flatImg)  
labels=ms.labels_   
cluster_centers = ms.cluster_centers_      
labels_unique = np.unique(labels)    
n_clusters_ = len(labels_unique)    
print("number of estimated clusters : %d" % n_clusters_)
segmentedImg = cluster_centers[np.reshape(labels, originShape[:2])]
cv.imshow("Segmented Image",segmentedImg.astype(np.uint8))
cv.waitKey(0)
cv.destroyAllWindows()
Example #46
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.datasets import make_blobs

centers = [[1, 1], [-1, -1], [1, -1]]
X,_ = make_blobs(n_samples=10000, centers=centers, cluster_std=0.6)


bandwidth = estimate_bandwidth(X, n_samples=500)
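# the bandwidth is the kernel radius mean shift uses when moving points towards
# density modes; estimating it from a 500-point subsample keeps the cost down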

ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_

labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

print("number of estimated clusters : %d" % n_clusters_)


# Plot result
import matplotlib.pyplot as plt

for k in range(n_clusters_):
    my_members = labels == k
    cluster_center = cluster_centers[k]
    plt.scatter(X[my_members, 0], X[my_members, 1])
    plt.plot(cluster_center[0], cluster_center[1], 'o',
             markeredgecolor='b', markersize=14)
plt.title('Estimated number of clusters: %d' % n_clusters_)
# iris_data = pd.read_csv('iris_data.csv')
iris_data = pd.read_excel('iris_data.xlsx')
# print(iris_data)

# 2.
iris_data = pd.get_dummies(iris_data, columns=['Species'])
# print(iris_data)

# 3.
virginica = iris_data.loc[iris_data['Species_I. virginica'] == 1]
versicolor = iris_data.loc[iris_data['Species_I. versicolor'] == 1]
setosa = iris_data.loc[iris_data['Species_I. setosa'] == 1]

# virginica.plot.scatter(x=0, y=1, c='r')
# versicolor.plot.scatter(x=0, y=1, c='b')
# setosa.plot.scatter(x=0, y=1, c='g')

plt.scatter(x=virginica['Sepal length'], y=virginica['Sepal width'], color='r')
plt.scatter(x=versicolor['Sepal length'],
            y=versicolor['Sepal width'],
            color='g')
plt.scatter(x=setosa['Sepal length'], y=setosa['Sepal width'], color='b')
# plt.show()

# 4.
print(estimate_bandwidth(iris_data, quantile=0.2))
analyzer = MeanShift(bandwidth=1)
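# note: the bandwidth printed above could be passed in here instead of the hard-coded
# value, e.g. MeanShift(bandwidth=estimate_bandwidth(iris_data, quantile=0.2))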
# print(estimate_bandwidth(virginica, quantile=0.2))
# print(estimate_bandwidth(versicolor, quantile=0.2))
# print(estimate_bandwidth(setosa, quantile=0.2))
Example #48
def extract_profiles(global_data,
                     genotype,
                     seq_idx,
                     ref_labels,
                     RG_info,
                     ID_col,
                     subset_col,
                     Names,
                     ref_lib={},
                     n_comps=4,
                     repn=100,
                     code={},
                     others='admx',
                     Sn=500,
                     same=True,
                     clean=False):
    '''
    Extract KDE profiles for specific accessions (global_data) from reference groups in PCA space.
    Dimensionality reduction and KDE are computed at the seq_idx positions of the genotype array.
    Reference accessions from the ref_labels groups are permuted; samp_sample() is used to
    sample accessions using RG_info and the Sn parameter.
    '''
    ## estimate the bandwith

    pca2 = PCA(n_components=n_comps, whiten=False, svd_solver='randomized')

    cluster_profiles = {x: [] for x in ref_labels}

    ## perform KDE.
    combine = {}
    tkeys = ref_labels
    var_comp_store = []

    for rp in range(repn):
        print(rp)
        if same:
            Names_idx, kde_class_labels, kde_label_dict, Nsample = samp_same_v2c2(
                ref_lib, Names)
        else:
            Names_idx, kde_class_labels, kde_label_dict, Nsample = samp_sample(
                genotype,
                RG_info,
                ID_col,
                subset_col,
                Names,
                code=code,
                others=others,
                Sn=Sn)

        dat_foc = genotype[:, seq_idx]
        dat_foc = dat_foc[global_data]

        Sequences = genotype[:, seq_idx]
        Sequences = Sequences[Names_idx]

        if Sequences.shape[1] <= 3:
            # too few variant sites at this window to reduce; skip this replicate
            continue

        pca2.fit(Sequences)
        data = pca2.transform(Sequences)
        data_ref = pca2.transform(dat_foc)

        local_pcvar = list(pca2.explained_variance_ratio_)

        #local_pcvar= [local_pcvar]

        var_comp_store.append(local_pcvar)

        params = {'bandwidth': np.linspace(np.min(data), np.max(data), 15)}
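        # select the KDE bandwidth by cross-validated grid search over that range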
        grid = GridSearchCV(KernelDensity(algorithm="ball_tree",
                                          breadth_first=False),
                            params,
                            cv=3,
                            iid=False,
                            verbose=0)

        ref_q = []

        for bull in tkeys:

            Quanted_set = data[kde_label_dict[bull], :]
            grid.fit(Quanted_set)
            kde = grid.best_estimator_

            P_dist = kde.score_samples(Quanted_set)
            Fist = kde.score_samples(data_ref)

            if clean:
                pdat = kde.score_samples(data)
                pdat = scipy.stats.norm(np.mean(P_dist),
                                        np.std(P_dist)).cdf(pdat)
                ref_q.append(pdat)

            ## Normalizing log-likelihood estimates by those of the reference set and extracting their cdf.
            Fist = scipy.stats.norm(np.mean(P_dist), np.std(P_dist)).cdf(Fist)

            if not clean:
                cluster_profiles[bull].append(Fist)

        if clean:
            ref_q = np.array(ref_q)
            sidx = ref_q.argsort(axis=0)
            sidx = ref_q[sidx, np.arange(sidx.shape[1])]

            diffs = sidx[-1] - sidx[-2]
            diffs = diffs

            for idx in range(len(tkeys)):
                bull = tkeys[idx]
                qt_idx = np.array(kde_label_dict[bull])
                qdiff = diffs[qt_idx]
                qmax = sidx[-1][qt_idx]

                bandwidth = estimate_bandwidth(qdiff.reshape(-1, 1),
                                               quantile=0.2)

                if not bandwidth:
                    continue

                ms = MeanShift(bandwidth=bandwidth,
                               cluster_all=True,
                               min_bin_freq=20,
                               bin_seeding=False)
                ms.fit(qdiff.reshape(-1, 1))
                labels = ms.labels_
                cluster_centers = ms.cluster_centers_
                clust_keep = np.argmax(cluster_centers)

                clust_keep = [
                    qt_idx[x] for x in range(len(qt_idx))
                    if qmax[x] > .01 and labels[x] == clust_keep
                ]

                if len(clust_keep) >= 5:
                    kde_label_dict[bull] = clust_keep

                print(len(qt_idx) - len(clust_keep))

            for bull in tkeys:
                Quanted_set = data[kde_label_dict[bull], :]
                grid.fit(Quanted_set)
                kde = grid.best_estimator_

                P_dist = kde.score_samples(Quanted_set)
                Fist = kde.score_samples(data_ref)

                ## Normalizing log-likelihood estimates by those of the reference set and extracting their cdf.
                Fist = scipy.stats.norm(np.mean(P_dist),
                                        np.std(P_dist)).cdf(Fist)

                cluster_profiles[bull].append(Fist)

    cluster_profiles = {x: np.array(g) for x, g in cluster_profiles.items()}

    cluster_profiles = {
        x: np.median(g, axis=0)
        for x, g in cluster_profiles.items()
    }

    var_comp_store = np.array(var_comp_store)
    var_comp_store = np.median(var_comp_store, axis=0)

    return cluster_profiles, var_comp_store
Example #49
def Distance_analysis(SequenceStore,target,refs_lib,DIMr = 'PCA',
                                            Bandwidth_split= 30,
                                            ncomp_local= 4,
                                            clsize= 15):
    Clover= []
    Coordinates= []
    Clusters_coords= []
    PC_var= recursively_default_dict()

    Dist_vectors= []

    Distances= []
    center_distances= []

    Ref_stats= []
    Ref_stats_lib= recursively_default_dict()
    

    for CHR in SequenceStore.keys():
        print('going on CHR: '+ str(CHR))
        for bl in SequenceStore[CHR].keys():

            print('data set: {}'.format(bl))
            ### PCA and MeanShift of information from each window copied from *FM36_Galaxy.py.
            Sequences = SequenceStore[CHR][bl]

            Sequences= np.nan_to_num(Sequences)

            print(Sequences.shape)

            #### Dimensionality reduction

            if DIMr == 'PCA':
                pca = PCA(n_components=ncomp_local, whiten=False,svd_solver='randomized').fit(Sequences)
                data = pca.transform(Sequences)
                PC_var[CHR][bl]= [x for x in pca.explained_variance_]

            if DIMr == 'NMF':
                from sklearn.decomposition import NMF
                data = NMF(n_components=ncomp_local, init='random', random_state=0).fit_transform(Sequences)

            Accurate = []

            params = {'bandwidth': np.linspace(np.min(data), np.max(data),Bandwidth_split)}
            grid = GridSearchCV(KernelDensity(algorithm = "ball_tree",breadth_first = False), params,verbose=0)

            ######################################
            ####### TEST global Likelihood #######
            ######################################
            Focus_labels = [z for z in it.chain(*refs_lib.values())]

            Who= refs_lib[target]
            Whose_parents= range(sum([len(x) for x in refs_lib.values()]))

            #Refs_local= [x for x in Whose_parents if x not in Who]
            Who_feat= data[Who,:]
            Ref_feat= data[Whose_parents,:]

            #### Normalize by distance between local centroids (to compensate for bias in sampling number).
            #### identify these clusters using MS.
            #### use reference accessions NOT in the cluster identified.
            Dpool= data[[x for x in Whose_parents if x not in Who],:]
            Pdistances= []

            bandwidth = estimate_bandwidth(Dpool, quantile=0.15)
            if bandwidth <= 0:
                bandwidth= .1
            params = {'bandwidth': np.linspace(np.min(Dpool), np.max(Dpool),30)}
            grid = GridSearchCV(KernelDensity(algorithm = "ball_tree",breadth_first = False), params,verbose=0)

            ## perform MeanShift clustering.
            ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=False, min_bin_freq=25)
            ms.fit(Dpool)
            labels1 = ms.labels_
            label_select = {y:[x for x in range(len(labels1)) if labels1[x] == y] for y in sorted(list(set(labels1))) if y != -1}

            centers= [np.mean(Dpool[label_select[z],:],axis= 0) for z in label_select.keys()]

            #### Data set of evenly sampled data. ##
            ## We'll generate 50 new observations from each cluster identified locally. ##
            N= 50
            Proxy_data= []
            label_select_labels= [z for z in it.chain(*[[x] * len(label_select[x]) for x in label_select.keys()])]
            Center_store= {}
            Proxy_indexes= {}
            distance_vecs= []

            for lab in label_select.keys():
                if len(label_select[lab]) < 3:
                    continue

                Quanted_set= Dpool[label_select[lab],:]

                if np.max(pairwise_distances(Quanted_set,metric= 'euclidean')) <= 1e-3:
                    Extract= Quanted_set[np.random.choice(Quanted_set.shape[0],N),:]
                else:
                    grid.fit(Quanted_set)
                    kde = grid.best_estimator_
                    Extract= kde.sample(N)

                center= np.mean(Extract,axis= 0)
                Center_store[lab]= center
                Proxy_indexes[lab]= [x for x in range((len(Center_store) - 1) * N, len(Center_store) * N)]

                Proxy_data.extend(Extract)

            Proxy_data= np.array(Proxy_data)
            ##### Get pairwise distances between centroids.

            for pair in it.combinations(label_select.keys(),2):
                coordinates= [np.mean(Dpool[label_select[z],:],axis= 0) for z in pair]
                coordinates= np.array(coordinates)
                iu_control= np.triu_indices(2,1)
                MS_pair_dist= pairwise_distances(coordinates,metric= 'euclidean')
                MS_pair_dist= MS_pair_dist[iu_control][0]
                Pdistances.append(MS_pair_dist)
            ## 

            reference_centroid= np.mean(centers,axis= 0)
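            # centroid of the local MeanShift clusters; distances of the proxy samples,
            # the reference panel and the target accessions to it are computed below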

            proxy_distances= pairwise_distances(reference_centroid.reshape(1,-1), Proxy_data,metric= 'euclidean')
            distances_to_center= pairwise_distances(reference_centroid.reshape(1,-1), Ref_feat,metric= 'euclidean')[0]
            self_distances= pairwise_distances(reference_centroid.reshape(1,-1), Who_feat, metric= 'euclidean')

            centroid= np.mean(Who_feat,axis= 0)
            distances_pairwise= pairwise_distances(centroid.reshape(1,-1), Ref_feat,metric= 'euclidean')[0]

            Distances.append(distances_pairwise)
            distances_pairwise= [(x - np.mean(proxy_distances)) / np.std(proxy_distances) for x in distances_pairwise]
            Clover.append(distances_pairwise)
            print(np.array(Clover).shape)

            FC_stats= [np.mean(proxy_distances),np.std(proxy_distances), np.mean(self_distances), np.std(self_distances)]
            Coord= [[CHR,bl,x] for x in Who]

            Ref_stats.append(FC_stats)
            Ref_stats_lib[CHR][bl]= FC_stats

            center_distances.append(distances_to_center)

            Coordinates.extend(Coord)
            Clusters_coords.append([CHR,bl])
            
            clear_output()

    return Distances, Clover, Ref_stats_lib, Ref_stats, center_distances, Coordinates, Clusters_coords
Example #50
def conspicuity_int_glcm(im,
                         mask=None,
                         use_sigmoid=False,
                         morph_proc=True,
                         type='hypo',
                         a=3):
    # im = tools.resize_ND(im, scale=0.5)
    # mask = tools.resize_ND(mask, scale=0.5)
    im = im.copy()
    if mask is None:
        mask = np.ones_like(im)

    if im.max() <= 1:
        im = skiexp.rescale_intensity(im, (0, 1), (0, 255)).astype(int)

    glcm = tools.graycomatrix_3D(im, mask=mask)

    min_num = 2 * glcm.mean()
    glcm = np.where(glcm < min_num, 0, glcm)
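    # keep only co-occurrence counts within a band of +/- k gray levels around the
    # diagonal, i.e. pairs of pixels with similar intensities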
    diag = np.ones(glcm.shape)
    k = 20
    tu = np.triu(diag, -k)
    tl = np.tril(diag, k)
    diag = tu * tl
    glcm *= diag.astype(glcm.dtype)

    # print 'data from glcm ...',
    data = tools.data_from_glcm(glcm)
    quantiles = [0.2, 0.1, 0.4]
    n_clusters_ = 0
    for q in quantiles:
        # print 'estimating bandwidth ...',
        bandwidth = estimate_bandwidth(data, quantile=q, n_samples=2000)
        if bandwidth == 0:
            continue
        # bandwidth = estimate_bandwidth(data, quantile=0.1, n_samples=2000)
        # print 'meanshift ...',
        ms = MeanShift(bandwidth=bandwidth,
                       bin_seeding=True)  #, min_bin_freq=1000)
        ms.fit(data)
        labels = ms.labels_
        cluster_centers = ms.cluster_centers_
        n_clusters_ = len(np.unique(labels))
        # print q, n_clusters_

        if n_clusters_ > 1:
            break

    # n_clusters_ = 0
    if n_clusters_ > 1:
        # print 'number of estimated clusters : %d' % n_clusters_
        # print 'cluster centers: {}'.format(cluster_centers)
        lab_im = (1 + ms.predict(
            np.array(np.vstack(
                (im.flatten(), im.flatten()))).T).reshape(im.shape)) * mask
    else:  # if MeanShift finds only one mode, fall back to a different approach
        rvs = tools.analyze_glcm(glcm)
        rvs = sorted(rvs, key=lambda rv: rv.mean())
        lab_im = rvs[0].pdf(im)
        n_clusters_ = len(rvs)

    mean_v = im[np.nonzero(mask)].mean()
    labs = np.unique(lab_im)[1:]
    res = np.zeros_like(lab_im)
    for l in labs:
        tmp = lab_im == l
        mv = im[np.nonzero(tmp)].mean()
        if mv < mean_v:
            res = np.where(tmp, 1, res)

    # plt.figure()
    # plt.subplot(121), plt.imshow(glcm, 'jet')
    # for c in cluster_centers:
    #     plt.plot(c[0], c[1], 'o', markerfacecolor='w', markeredgecolor='k', markersize=8)
    # plt.axis('image')
    # plt.axis('off')
    # plt.subplot(122), plt.imshow(glcm, 'jet')
    # colors = itertools.cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    # for k, col in zip(range(n_clusters_), colors):
    #     my_members = labels == k
    #     cluster_center = cluster_centers[k]
    #     plt.plot(data[my_members, 0], data[my_members, 1], col + '.')
    #     plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor='w', markeredgecolor='k', markersize=8)
    # plt.title('Estimated number of clusters: %d' % n_clusters_)
    # plt.axis('image')
    # plt.axis('off')
    # plt.show()

    # plt.figure()
    # plt.subplot(131), plt.imshow(im, 'gray', interpolation='nearest')
    # plt.subplot(132), plt.imshow(lab_im, 'jet', interpolation='nearest')
    # plt.subplot(133), plt.imshow(res, 'gray', interpolation='nearest')
    # plt.show()

    # thresholding graycomatrix (GCM)
    # c_t = 5
    # thresh = c_t * np.mean(glcm)
    # glcm_t = glcm > thresh
    # glcm_to = skimor.binary_opening(glcm_t, selem=skimor.disk(3))
    #
    # rvs = tools.analyze_glcm(glcm_to)

    # filtering
    # rvs = sorted(rvs, key=lambda rv: rv.mean())
    # im_int = rvs[0].pdf(im)
    # mean_v = rvs[0].mean()

    im_int = res

    a = 20
    c = mean_v / 255
    im_res = conspicuity_processing(im_int,
                                    mask,
                                    use_sigmoid=use_sigmoid,
                                    a=a,
                                    c=c,
                                    sigm_t=0.2,
                                    use_morph=morph_proc,
                                    radius=3)

    return im_res
Example #51
from sklearn.datasets import make_blobs
from sklearn.cluster import MeanShift, estimate_bandwidth
import numpy as np
import matplotlib.pyplot as plt

print(__doc__)

# #############################################################################
# Generate sample data
centers = [[1, 1], [-1, -1], [1, -1]]
X, _ = make_blobs(n_samples=10000, centers=centers, cluster_std=0.6)

# #############################################################################
# Compute clustering with MeanShift

# The following bandwidth can be automatically detected using
bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=500)

ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_

labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

print("number of estimated clusters : %d" % n_clusters_)

# #############################################################################
# Plot result

plt.figure(1)
Example #52
def window_analysis(Windows,
                    ref_labels,
                    labels1,
                    Chr=1,
                    ncomp=4,
                    amova=True,
                    supervised=True,
                    include_who=[],
                    range_sample=[130, 600],
                    rand_sample=0,
                    clsize=15,
                    cl_freqs=5,
                    Bandwidth_split=20):

    kde_class_labels = labels1
    kde_label_dict = {
        z:
        [x for x in range(len(kde_class_labels)) if kde_class_labels[x] == z]
        for z in list(set(kde_class_labels))
    }

    if include_who:
        include = [
            x for x in range(len(kde_class_labels))
            if kde_class_labels[x] in include_who
        ]
        ref_labels = include_who
        kde_class_labels = [kde_class_labels[x] for x in include]

        kde_label_dict = {
            z: [
                x for x in range(len(kde_class_labels))
                if kde_class_labels[x] == z
            ]
            for z in include_who
        }

    if rand_sample:
        sample = rand_sample
        sample_range = [0, sample]
        Freq_extract = {
            Chr: {
                bl: Windows[Chr][bl]
                for bl in np.random.choice(
                    list(Windows[Chr].keys()), sample, replace=True)
            }
        }

    # range_sample is only applied when no random subsample was requested above
    elif range_sample:
        sample_range = range_sample
        Freq_extract = {
            Chr: {
                bl: Windows[Chr][bl]
                for bl in list(sorted(Windows[Chr].keys()))
                [sample_range[0]:sample_range[1]]
            }
        }

    Results = {'header': ['Chr', 'window'], 'info': []}

    Frequencies = {'header': ['Chr', 'window', 'cl'], 'coords': [], 'info': []}

    Construct = {'header': ['Chr', 'window', 'cl'], 'coords': [], 'info': []}

    PC_var = {'header': ['Chr', 'window'], 'coords': [], 'info': []}

    pc_density = []
    pc_coords = []

    sim_fst = []

    for c in Freq_extract[Chr].keys():
        Sequences = Windows[Chr][c]

        if Sequences.shape[1] <= 3:
            # too few sites in this window: record a placeholder row and skip it
            Results['info'].append([Chr, c, 0, 0])
            continue

        Sequences = np.nan_to_num(Sequences)

        pca = PCA(n_components=ncomp, whiten=False,
                  svd_solver='randomized').fit(Sequences)
        data = pca.transform(Sequences)

        if include_who:
            data = data[include, :]

        ##### PC density
        PC = 0

        pc_places = data[:, PC]

        X_plot = np.linspace(-8, 8, 100)

        kde = KernelDensity(kernel='gaussian', bandwidth=0.01).fit(
            np.array(pc_places).reshape(-1, 1))

        log_dens = kde.score_samples(X_plot.reshape(-1, 1))

        pc_density.append(np.exp(log_dens))
        pc_coords.append(pc_places)

        PC_var['coords'].append([Chr, c])
        PC_var['info'].append([x for x in pca.explained_variance_])
        ###
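        # Grid-search a KDE bandwidth spanning the range of the projected data
        # (cross-validated likelihood via GridSearchCV).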
        params = {
            'bandwidth': np.linspace(np.min(data), np.max(data),
                                     Bandwidth_split)
        }
        grid = GridSearchCV(KernelDensity(algorithm="ball_tree",
                                          breadth_first=False),
                            params,
                            verbose=0)

        ######################################
        ####### TEST global Likelihood #######
        ######################################
        Focus_labels = list(range(data.shape[0]))

        #### Mean Shift approach
        ## from sklearn.cluster import MeanShift, estimate_bandwidth

        bandwidth = estimate_bandwidth(data,
                                       quantile=0.2,
                                       n_samples=len(Focus_labels))
        if bandwidth <= 1e-3:
            bandwidth = 0.1

        ms = MeanShift(bandwidth=bandwidth,
                       cluster_all=False,
                       min_bin_freq=clsize)
        ms.fit(data[Focus_labels, :])
        labels = ms.labels_
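        # Group samples by mean-shift cluster, dropping noise points (label -1)
        # and clusters smaller than clsize.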

        Tree = {
            x: [Focus_labels[y] for y in range(len(labels)) if labels[y] == x]
            for x in [g for g in list(set(labels)) if g != -1]
        }
        Keep = [x for x in Tree.keys() if len(Tree[x]) > clsize]

        Tree = {x: Tree[x] for x in Keep}
        Ngps = len(Tree)
        SpaceX = {x: data[Tree[x], :] for x in Tree.keys()}

        these_freqs = []
        ### Extract MScluster likelihood by sample

        for hill in SpaceX.keys():

            if len(Tree[hill]) >= cl_freqs:
                if not supervised:
                    cl_seqs = Sequences[Tree[hill], :]

                    freq_vector = [
                        float(x) / (cl_seqs.shape[0] * 2)
                        for x in np.sum(cl_seqs, axis=0)
                    ]

                    Frequencies['coords'].append([Chr, c, hill])
                    Frequencies['info'].append(freq_vector)
                    these_freqs.append(freq_vector)

            grid.fit(data[Tree[hill], :])

            # use the best estimator to compute the kernel density estimate
            kde = grid.best_estimator_

            P_dist = kde.score_samples(data[Tree[hill], :])
            Dist = kde.score_samples(data)
            P_dist = np.nan_to_num(P_dist)
            Dist = np.nan_to_num(Dist)
            if np.std(P_dist) == 0:
                Dist = np.array(
                    [int(Dist[x] in P_dist) for x in range(len(Dist))])
            else:
                Dist = scipy.stats.norm(np.mean(P_dist),
                                        np.std(P_dist)).cdf(Dist)
            Dist = np.nan_to_num(Dist)
            Construct['coords'].append([Chr, c, hill])
            Construct['info'].append(Dist)

            #########################################
        ############# AMOVA ################
        #########################################

        if supervised:
            labels = [x for x in kde_class_labels if x in ref_labels]
            Who = [
                z for z in it.chain(*[kde_label_dict[x] for x in ref_labels])
            ]
            Ngps = len(ref_labels)

            print(ref_labels)
            for hill in ref_labels:

                if len(kde_label_dict[hill]) >= cl_freqs:
                    # restrict to the requested samples; otherwise use all sequences
                    Seq_specific = Sequences[include, :] if include_who else Sequences
                    cl_seqs = Seq_specific[kde_label_dict[hill], :]

                    freq_vector = [
                        float(x) / (cl_seqs.shape[0] * 2)
                        for x in np.sum(cl_seqs, axis=0)
                    ]

                    Frequencies['coords'].append([Chr, c, hill])
                    Frequencies['info'].append(freq_vector)
                    these_freqs.append(freq_vector)

        else:
            Who = [
                x for x in range(len(labels))
                if labels[x] != -1 and labels[x] in Keep
            ]
            labels = [labels[x] for x in Who]
            Who = [Focus_labels[x] for x in Who]

        #
        Pairwise = return_fsts2(np.array(these_freqs))
        sim_fst.extend(Pairwise.fst)

        if len(list(set(labels))) == 1:
            # only one group present: AMOVA is undefined, record a placeholder row
            Results['info'].append([Chr, c, 0, Ngps])
            continue

        if amova:
            clear_output()
            AMOVA, Cig = AMOVA_FM42(data[Who, :],
                                    labels,
                                    n_boot=0,
                                    metric='euclidean')
            print('counting: {}, Ngps: {}'.format(AMOVA, Ngps))
            Results['info'].append([Chr, c, AMOVA, Ngps])

    Results['info'] = pd.DataFrame(
        np.array(Results['info']),
        columns=['chrom', 'window', 'AMOVA', 'Ngps'])

    X_plot = np.linspace(0, .3, 100)

    freq_kde = KernelDensity(kernel='gaussian', bandwidth=0.01).fit(
        np.array(sim_fst).reshape(-1, 1))

    log_dens = freq_kde.score_samples(X_plot.reshape(-1, 1))

    fig_roost_dens = [
        go.Scatter(x=X_plot,
                   y=np.exp(log_dens),
                   mode='lines',
                   fill='tozeroy',
                   name='',
                   line=dict(color='blue', width=2))
    ]
    ##

    layout = go.Layout(title='allele frequency distribution across clusters',
                       yaxis=dict(title='density'),
                       xaxis=dict(title='fst'))

    fig = go.Figure(data=fig_roost_dens, layout=layout)

    return Frequencies, sim_fst, Results, Construct, pc_density, pc_coords, fig
Example #53
0
iris_data = pd.get_dummies(iris_data, columns=['Species'])
print(iris_data.head())

virginica = iris_data.loc[iris_data['Species_I. virginica'] == 1]
versicolor = iris_data.loc[iris_data['Species_I. versicolor'] == 1]
setosa = iris_data.loc[iris_data['Species_I. setosa'] == 1]

plt.scatter(x=virginica['Sepal length'], y=virginica['Sepal width'], color='r')
plt.scatter(x=versicolor['Sepal length'],
            y=versicolor['Sepal width'],
            color='g')
plt.scatter(x=setosa['Sepal length'], y=setosa['Sepal width'], color='b')

#plt.show()

print("Self band: ", estimate_bandwidth(iris_data, quantile=0.2))
analyzer = MeanShift(bandwidth=1)
print("Self MeanShift: ", analyzer.fit(iris_data))
print("Function mean_shift: ", mean_shift(iris_data))

labels, cluster_centers, n_clusters = mean_shift(iris_data)
fig = plt.figure()
ax = fig.add_subplot(111)

colors = cycle('bgrcmy')
for k, col in zip(range(n_clusters), colors):
    my_members = (labels == k)
    cluster_center = cluster_centers[k]
    # plot the points assigned to cluster k; the center coordinates are assumed
    # to follow the column order of iris_data (sepal length, then sepal width)
    ax.scatter(iris_data['Sepal length'][my_members],
               iris_data['Sepal width'][my_members],
               color=col)
    ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
            markeredgecolor='k', markersize=10)
plt.show()
Example #54
0
    'eps': .15,
    'n_neighbors': 2
}), (blobs, {}), (no_structure, {})]

for i_dataset, (dataset, algo_params) in enumerate(datasets):
    # update parameters with dataset-specific values
    params = default_base.copy()
    params.update(algo_params)

    X, y = dataset

    # normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(X)

    # estimate bandwidth for mean shift
    bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile'])

    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(X,
                                    n_neighbors=params['n_neighbors'],
                                    include_self=False)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)

    # ============
    # Create cluster objects
    # ============
    ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
    ward = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'],
                                           linkage='ward',
                                           connectivity=connectivity)
Example #55
0
def various_algorithm_launch(samples, pits, nb_clusters, target):

    np.random.seed(0)

    colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk'])
    colors = np.hstack([colors] * 20)

    clustering_names = [
        'Kmeans','MiniBatchKMeans', 'AffinityPropagation', 'MeanShift',
        'SpectralClustering', 'Ward', 'AgglomerativeClustering',
        'DBSCAN', 'Birch']


    plt.figure(figsize=(len(clustering_names) * 2 + 3, 9.5))
    plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05,
                        hspace=.01)

    plot_num = 1
    full_result=pd.DataFrame()
    datasets = [samples]

    for i_dataset, dataset in enumerate(datasets):
        
        if i_dataset == len(datasets) - 1:
            print(i_dataset)
            X = samples
            y=pits
        else:
            X, y = dataset
        # normalize dataset for easier parameter selection
        X = StandardScaler().fit_transform(X)

        # estimate bandwidth for mean shift
        bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)

        # connectivity matrix for structured Ward
        connectivity = kneighbors_graph(X, n_neighbors=5, include_self=False)
        # make connectivity symmetric
        connectivity = 0.5 * (connectivity + connectivity.T)


        # create clustering estimators
        ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
        two_means = cluster.MiniBatchKMeans(n_clusters=nb_clusters)
        k_means = cluster.KMeans(init='k-means++', n_clusters=nb_clusters, n_init=20, algorithm='full')
        ward = cluster.AgglomerativeClustering(n_clusters=nb_clusters, linkage='ward', connectivity=connectivity)
        spectral = cluster.SpectralClustering(n_clusters=nb_clusters, eigen_solver='arpack', affinity="nearest_neighbors")
        dbscan = cluster.DBSCAN(eps=0.3, min_samples=10)
        affinity_propagation = cluster.AffinityPropagation(damping=0.5, max_iter=200,
                                                           convergence_iter=15, copy=True,
                                                           preference=None, affinity='euclidean',
                                                           verbose=False)
        average_linkage = cluster.AgglomerativeClustering(linkage="average", affinity="cityblock",
                                                          n_clusters=nb_clusters,
                                                          connectivity=connectivity)
        birch = cluster.Birch(n_clusters=nb_clusters)


        clustering_algorithms = [
            k_means,two_means, affinity_propagation, ms, spectral, ward, average_linkage,
            dbscan, birch]

        for name, algorithm in zip(clustering_names, clustering_algorithms):
            # predict cluster memberships
            t0 = time.time()
            algorithm.fit(X)
            t1 = time.time()
            if hasattr(algorithm, 'labels_'):
                y_pred = algorithm.labels_.astype(int)
            else:
                y_pred = algorithm.predict(X)

            classif_df=pd.DataFrame(y_pred,index=np.arange(1,len(y_pred)+1))
            classif_df.columns = [name]
            pit_df=pd.DataFrame(pits,index=np.arange(1,len(pits)+1))
            pit_df.columns = [target]
            result_per_pit = pd.concat([classif_df, pit_df], axis=1,verify_integrity=False)
            full_result = pd.concat([full_result, result_per_pit[name]], axis=1)


            #plot
            plt.subplot(1, len(clustering_algorithms), plot_num)
            if i_dataset == 0:
                plt.title(name, size=18)
            plt.scatter(X[:, 0], X[:, 1], color=colors[y_pred].tolist(), s=10)

            if hasattr(algorithm, 'cluster_centers_'):
                centers = algorithm.cluster_centers_
                center_colors = colors[:len(centers)]
                plt.scatter(centers[:, 0], centers[:, 1], s=100, c=center_colors)
            plt.xlim(-2, 2)
            plt.ylim(-2, 2)
            plt.xticks(())
            plt.yticks(())
            plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
                     transform=plt.gca().transAxes, size=15,
                     horizontalalignment='right')
            plot_num += 1

    # print full_result
    # plt.show()
    return full_result
Example #56
0
import io

import matplotlib.pyplot as plt
import numpy as np
import requests
from PIL import Image
from sklearn.cluster import MeanShift, estimate_bandwidth

# pml is assumed to refer to the pyprobml plotting helpers used by savefig below
import probml_utils as pml

r = requests.get('https://github.com/probml/probml-data/blob/main/data/bread.jpg?raw=true', stream=True)
image = Image.open(io.BytesIO(r.content))

# Image is (687 x 1025, RGB channels)
image = np.array(image)
original_shape = image.shape

# Flatten image.
X = np.reshape(image, [-1, 3])

plt.figure()
plt.imshow(image)
plt.axis('off')
pml.savefig('meanshift_segmentation_input.pdf')

bandwidth = estimate_bandwidth(X, quantile=0.1, n_samples=100)
print("bandwidth {}".format(bandwidth))

ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X)

labels = ms.labels_
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

print("number of estimated clusters : %d" % n_clusters_)

segmented_image = np.reshape(labels, original_shape[:2])  # Just take (height, width), ignore color dim.

plt.figure()
plt.imshow(segmented_image)
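# Optional sketch (not in the original snippet): recolor every pixel with its
# cluster center to obtain a color-quantized view of the segmentation.
quantized = ms.cluster_centers_[labels].reshape(original_shape).astype(np.uint8)
plt.figure()
plt.imshow(quantized)
plt.axis('off')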
import csv

import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth

input_file = 'wholesale.csv'
file_reader = csv.reader(open(input_file, 'rt'), delimiter=',')
X = []
for count, row in enumerate(file_reader):
    if not count:
        names = row[2:]
        continue

    X.append([float(x) for x in row[2:]])

X = np.array(X)

# Estimating the bandwidth
bandwidth = estimate_bandwidth(X, quantile=0.8, n_samples=len(X))

# Compute clustering with MeanShift
meanshift_estimator = MeanShift(bandwidth=bandwidth, bin_seeding=True)
meanshift_estimator.fit(X)
labels = meanshift_estimator.labels_
centroids = meanshift_estimator.cluster_centers_
num_clusters = len(np.unique(labels))

print("Number of clusters in input data =", num_clusters)

print("Centroids of clusters:")
print('\t'.join([name[:3] for name in names]))
for centroid in centroids:
    print('\t'.join([str(int(x)) for x in centroid]))
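# A minimal visualization sketch (not part of the original snippet): scatter two
# of the feature dimensions with the mean-shift centroids overlaid; the column
# indices 0 and 1 are arbitrary choices.
plt.figure()
plt.scatter(X[:, 0], X[:, 1], c=labels, s=20)
plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=120, color='k')
plt.title('Mean shift clusters (%d found)' % num_clusters)
plt.show()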
def tracker(path):
	#initialization for default value
	if path=='0':
		path=0;
	
	cap = cv2.VideoCapture(path)
	ip_method = ip.get_instace(ip.IPMethod.TOMASI);
	
	#FLANN Properties
	MIN_FRAMES_COUNT = 120
	SKIP_FRAMES = 60
	MIN_MERGE_FRAMES = 5;
	FLANN_INDEX_KDTREE = 0
	index_params = dict(algorithm = FLANN_INDEX_KDTREE, trees = 10)
	search_params = dict(checks = 50)
	flann = cv2.FlannBasedMatcher(index_params, search_params)
	DO_RESIZE=False
	new_sz = (180,120)
	#Initialization of inputs
	frames =[];					#Frames
	kp = [];					#Key points
	all_matches = []; 			#All good matches
	match_count = [];			#match_count
	labels = [];
	frame_cnt=0;	
	
	print "Extracting frames...................."
	ret, prev_frame = cap.read()
	kp1,desc1 = ip_method.detectAndCompute(prev_frame);
	num_matches = np.zeros(kp1.__len__())
	
	#storing frames
	frames.append(prev_frame);
	kp.append(kp1)
	match_count.append(num_matches);
	
	
	while(cap.isOpened()):
		SKIP_FRAMES=SKIP_FRAMES-1;
		ret, prev_frame = cap.read()
		if not ret or SKIP_FRAMES<0:
			break;
	
	while(cap.isOpened()):
			
		ret, cur_frame = cap.read()
		if not ret:
			break;
		kp2,desc2 = ip_method.detectAndCompute(cur_frame);
		matches = flann.knnMatch(desc1,desc2,k=	2)		
		# Ratio test as per Lowe's paper 
		good_matches = []; distances = []
		for (m,n) in matches:
			
			if m.distance < 0.7*n.distance and m.distance > 4:
				good_matches.append(m);
				distances.append(m.distance);
				
		# Bashart's Displacement filtering
		mean = np.mean(distances); std = np.std(distances)
		good_matches[:] = [match for match in good_matches if abs(match.distance - mean) <  5 * std]
		kp1 = kp2; desc1 = desc2;
		
		num_matches = np.zeros(kp1.__len__())
		for match in good_matches:
			num_matches[match.trainIdx]=match_count[-1][match.queryIdx]+1
	
		all_matches.append(good_matches);		
		
		#storing frames
		frames.append(cur_frame);
		kp.append(kp1)
		match_count.append(num_matches);
		
		if frame_cnt > MIN_FRAMES_COUNT:
			break;
		frame_cnt = frame_cnt +1;
	cap.release()
	
	
	print "Labeling the keypoints................."
	max_label=0;
	MIN_POINTS_TO_CLUSTER = 20
	MAX_CLUSTERS = 100
	#Forward Labeling Pass
	for rng in xrange(0,MIN_MERGE_FRAMES+1):
		labels.append([-1]*kp[rng].__len__());
	for rng in xrange(MIN_MERGE_FRAMES+1,frame_cnt):
		motion_feats = []; feat_indices = [];
		labels.append([-1]*kp[rng].__len__());
		for match in all_matches[rng-1]:
			if match_count[rng-1][match.queryIdx]>=MIN_MERGE_FRAMES: 
				if labels[rng-1][match.queryIdx]==-1:
					src_pt = np.int32(kp[rng-1][match.queryIdx].pt)
					dst_pt = np.int32(kp[rng][match.trainIdx].pt)
					motion_feats.append(motion.get_features(src_pt,dst_pt));
					feat_indices.append(match.trainIdx)
				else :
					labels[rng][match.trainIdx]=labels[rng-1][match.queryIdx]
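		# Cluster the accumulated motion features with mean shift; each resulting
		# label groups keypoints that move coherently, offset by max_label so that
		# labels stay unique across frames.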
		
		if(motion_feats.__len__()>=MIN_POINTS_TO_CLUSTER):
			#Clustering mean-shift
			motion_feats = np.asarray(motion_feats)
			bandwidth = estimate_bandwidth(motion_feats, quantile=0.1,random_state=200)
			ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
			ms.fit(motion_feats);
			for idx,lbl in zip(feat_indices,ms.labels_):
				labels[rng][idx]=lbl+max_label;
			max_label = max(labels[rng])+1;
	
	
	random_colors = np.random.randint(256, size=(MAX_CLUSTERS, 3))
	print "Writing the video................."
	fourcc = cv2.cv.CV_FOURCC(*'XVID')
	w = prev_frame.shape[0]; h = prev_frame.shape[1]
	if DO_RESIZE:
		vidout = cv2.VideoWriter('out.avi',fourcc,20,new_sz)
	else:
		vidout = cv2.VideoWriter('out.avi',fourcc,20,(h,w))
	for frame_idx in xrange(MIN_MERGE_FRAMES*2,frame_cnt):
		cur_frame = frames[frame_idx];
		for rng in xrange(frame_idx-MIN_MERGE_FRAMES,frame_idx):
			for match in all_matches[rng-1]:
				if match_count[rng-1][match.queryIdx]>=MIN_MERGE_FRAMES \
						and not (labels[rng-1][match.queryIdx]==-1 or labels[rng-1][match.queryIdx]>=MAX_CLUSTERS):
					#print "i m not here"
					src_pt = np.int32(kp[rng-1][match.queryIdx].pt)
					dst_pt = np.int32(kp[rng][match.trainIdx].pt)
					color = tuple(random_colors[labels[rng-1][match.queryIdx]])
					cv2.line(cur_frame,tuple(src_pt),tuple(dst_pt),color,2);	
		if DO_RESIZE:
			cur_frame=cv2.resize(cur_frame,new_sz);
		vidout.write(cur_frame);
	vidout.release()
	cv2.destroyAllWindows()
                    np.mean([Merged_Pvalues[Chr][z][aim][ind] for z in bl])
                    for ind in norm_labels[gp]
                ]

                SummedP = np.mean(SummedP)
                stat.append(SummedP)

            ## average estimates across groups
            stat = np.mean(stat)

            Genes.append(stat)
            Genes_to_state[gen][aim] = stat

        X_plot = np.linspace(0, max(Genes), 200)[:, np.newaxis]
        bandwidth = estimate_bandwidth(np.array(Genes).reshape(-1, 1),
                                       quantile=0.15,
                                       n_samples=len(Genes))

        fig = plt.figure(figsize=figsize)
        ax = fig.add_subplot(111)

        kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(
            np.array(Genes).reshape(-1, 1))
        log_dens = kde.score_samples(X_plot)
        ax.plot(X_plot[:, 0],
                np.exp(log_dens),
                '-',
                label='label= {0}, band= {1}'.format(aim, bandwidth))
        ax.legend(loc='upper right')

        filename = Home + 'GeneValues_labs' + ''.join(
# Fit and predict the data
birch.fit(scaledData)
predictions = birch.predict(scaledData)

# Scatterplot between two features to check the clustering
plt.scatter(scaledData[:, 2], scaledData[:, 6], c=predictions)
plt.xlabel("Height")
plt.ylabel("Shell weight")
plt.title("Clustering using Birch clustering algorithm")
plt.show()

##################################### Mean Shift Clustering #################################

# Determine optimal bandwidth value
bandwidth = estimate_bandwidth(scaledData, quantile=0.2, n_samples=500)

# Instantiate the clustering model
mnShift = MeanShift(bandwidth=bandwidth)

# Fit and predict the data
mnShift.fit(scaledData)
predictions = mnShift.predict(scaledData)

# Scatterplot between two features to check the clustering
plt.scatter(scaledData[:, 2], scaledData[:, 6], c=predictions)
plt.xlabel("Height")
plt.ylabel("Shell weight")
plt.title("Clustering using Mean shift clustering algorithm")
plt.show()
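# Optional check (not in the original snippet; assumes numpy is imported as np):
# report how many clusters mean shift found on the scaled data.
print("Number of mean shift clusters:", len(np.unique(predictions)))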