def meanShift(flat_image):
    # Estimate the kernel bandwidth from the data
    bandwidth = estimate_bandwidth(flat_image, quantile=0.2, n_samples=500)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(flat_image)
    return ms.labels_, ms.cluster_centers_
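A minimal usage sketch for the helper above, assuming a NumPy RGB image; the synthetic image and the reshape step are illustrative additions, not part of the original code.

import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth

image = np.random.randint(0, 256, size=(64, 64, 3)).astype(float)  # hypothetical RGB image
flat_image = image.reshape(-1, 3)                                   # one row per pixel
labels, centers = meanShift(flat_image)
segmented = centers[labels].reshape(image.shape)                    # replace each pixel by its cluster centre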
    def meanshift_for_hough_line(self):
        # init mean shift
        pixels_of_label = {}
        points_of_label = {}
        for hough_line in self.points_of_hough_line:
            pixels = self.pixels_of_hough_line[hough_line]
            pixels = np.array(pixels)
            bandwidth = estimate_bandwidth(pixels, quantile=QUANTILE, n_samples=500)
            if bandwidth == 0:
                bandwidth = 2
            ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
            ms.fit(pixels)
            labels = ms.labels_
            labels_unique = np.unique(labels)
            n_clusters_ = len(labels_unique)
            for k in range(n_clusters_):
                label = list(hough_line)
                label.append(k)
                pixels_of_label[tuple(label)] = map(tuple, pixels[labels == k])
        for label in pixels_of_label:
            pixels = pixels_of_label[label]
            points = map(self.img.get_bgr_value, pixels)
            points_of_label[label] = points
        self.pixels_of_hough_line = pixels_of_label
        self.points_of_hough_line = points_of_label
Example #3
    def _fit_mean_shift(self, x):
        for c in xrange(len(self.crange)):
            quant = 0.015 * (c + 1)
            for r in xrange(self.repeats):
                bandwidth = estimate_bandwidth(
                    x, quantile=quant, random_state=r)
                idx = c * self.repeats + r
                model = MeanShift(
                    bandwidth=bandwidth, bin_seeding=True)
                model.fit(x)
                self._labels[idx] = model.labels_
                self._parameters[idx] = model.cluster_centers_

                # build equivalent gmm
                k = model.cluster_centers_.shape[0]
                model_gmm = GMM(n_components=k, covariance_type=self.cvtype,
                                init_params='c', n_iter=0)
                model_gmm.means_ = model.cluster_centers_
                model_gmm.weights_ = sp.array(
                    [(model.labels_ == i).sum() for i in xrange(k)])
                model_gmm.fit(x)

                # evaluate goodness of fit
                self._ll[idx] = model_gmm.score(x).sum()
                if self.gof_type == 'aic':
                    self._gof[idx] = model_gmm.aic(x)
                if self.gof_type == 'bic':
                    self._gof[idx] = model_gmm.bic(x)

                print quant, k, self._gof[idx]
    def cluster_pixels_ms(self):
        """
        Cluster point descriptors by mean shift.
        :type self: ColorRemover
        """
        # reshape
        fg_pixels = self.img.fg_pixels.keys()
        descriptors = []
        for r, c in fg_pixels:
            descriptors.append(self.descriptor_map[r][c])
        descriptors = np.array(descriptors)
        descriptors = PCA(n_components=int(VECTOR_DIMENSION)/2).fit_transform(descriptors)
        # descriptors = self.descriptor_map.reshape(descriptors_rows, 1, VECTOR_DIMENSION)
        bandwidth = estimate_bandwidth(descriptors, quantile=0.05)
        ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
        ms.fit(descriptors)
        labels = ms.labels_

        for i in range(len(labels)):
            xy = fg_pixels[i]
            label = labels[i]
            self.labels_map.itemset(xy, label)
        # save the indices and BGR values of each cluster as a dictionary with keys of label
        for label in range(K):
            self.pixels_of_hough_line_in_sphere[label] = map(tuple, np.argwhere((self.labels_map == label)))
            self.cluster_bgr[label] = map(tuple, self.img.bgr[self.labels_map == label])
Example #5
    def get_clusters(self, in_file, cc_file, clf_file, arrivals_file, chunk_size=1710671):
        df = pd.read_csv(open(in_file), chunksize=chunk_size)
        dests = []
        part = 1
        lines = 1710671 / chunk_size
        try:
            dest = cPickle.load(open(arrivals_file))
        except IOError:
            for d in df:
                print "%d / %d" % (part, lines)
                part += 1
                for row in d.values:
                    # print eval(row[-1])
                    tmp = eval(row[-1])
                    if len(tmp) > 0:
                        dests.append(tmp[-1])
            dest = np.array(dests)
            cPickle.dump(dest, open(arrivals_file, "w"), protocol=cPickle.HIGHEST_PROTOCOL)
        print "Destination points loaded"

        try:
            ms = cPickle.load(open(clf_file))
        except IOError:
            bw = 0.001
            ms = MeanShift(bandwidth=bw, bin_seeding=True, min_bin_freq=5, n_jobs=-2)
            ms.fit(dest)
            cPickle.dump(ms, open(clf_file, "w"), protocol=cPickle.HIGHEST_PROTOCOL)
        print "Mean shift loaded"
        cluster_centers = ms.cluster_centers_
        cPickle.dump(cluster_centers, open(cc_file, "w"), protocol=cPickle.HIGHEST_PROTOCOL)
        print "Clusters dumped"
Example #6
def applyMeanShift(data,quantileValue=0.2,clusterall=False):
	result=[]
	n_samples=len(data)
	print "Number of points in the dataset: %d" % n_samples
	
	bandwidth = estimate_bandwidth(data, quantile=quantileValue)
	ms = MeanShift(bandwidth=bandwidth,cluster_all=clusterall)
	# Apply mean shift
	clustereddata=ms.fit(data)
	clusteredlabels= clustereddata.labels_
	barycenters=ms.cluster_centers_

	labels_unique = np.unique(clusteredlabels)
	nbOfClusters = len(labels_unique)

	print "number of estimated clusters : %d" % nbOfClusters

	for i in labels_unique:
		print "### Indices of the points in cluster %d : ###" % i
		# print [indice[0] for indice in np.argwhere(clusteredlabels == i)]
		result.append([indice[0] for indice in np.argwhere(clusteredlabels == i)])
	# Add a zero-coordinate vector to take into account that the -1 "cluster" does not have a barycenter
	if -1 in labels_unique:
		barycenters= np.append([[0 for k in range(len(barycenters[0]))]],barycenters,axis=0)

	return [result,barycenters]
Example #7
def meanShift(mtx, **kw):
    """
    meanShift(mtx, **kw) uses scikit-learn's meanshift clustering implementation to
    cluster infoDistance matrices.

    Call with the distance matrix as the first parameter. 
        Available Keyword arguments:
        startingbandwidth:  the lowest bandwidth to begin the estimation with (defaults to 0.1)
        bandwidthincrement: the amount by which to increment bandwidth in between rounds of
                            meanshift (defaults to 0.01)
    """
    H = kw.get('startingbandwidth', 0.1)
    dH = kw.get('bandwidthincrement', 0.01)
    clustercenters = None
    nnonunary = []
    minH = None
    nclusters = np.inf  # updated after each fit; loop runs until a single cluster remains
    while nclusters > 1:
        ms = MeanShift(bandwidth=H)
        ms.fit(mtx)
        centers   = ms.cluster_centers_
        clusters  = ms.labels_
        nclusters = len(np.unique(clusters))
        nonunary  = np.shape(np.where(np.bincount(clusters) > 1))[1]
        if nonunary:
            H = H + dH
def run_mean_shift(df):
    '''
    INPUTS: Pandas Dataframe
    OUTPUTS: Returns a fitted MeanShift object
    '''
    model = MeanShift(min_bin_freq=10, cluster_all=False, n_jobs=-1)
    return model.fit(df)
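A short, hedged usage sketch for run_mean_shift; the random DataFrame is assumed purely for illustration.

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.RandomState(0).rand(300, 2), columns=['x', 'y'])  # hypothetical data
model = run_mean_shift(df)
print(len(model.cluster_centers_), "clusters found")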
Example #9
def hart85_means_shift_cluster(pair_buffer_df, features):

    from sklearn.cluster import MeanShift, estimate_bandwidth

    # Creating feature vector
    cluster_df = pd.DataFrame()
    if 'active' in features:
        cluster_df['active'] = pd.Series(pair_buffer_df.apply(lambda row:
                                                                   ((np.fabs(row['T1 Active']) + np.fabs(row['T2 Active'])) / 2), axis=1), index=pair_buffer_df.index)
    if 'reactive' in features:
        cluster_df['reactive'] = pd.Series(pair_buffer_df.apply(lambda row:
                                                                     ((np.fabs(row['T1 Reactive']) + np.fabs(row['T2 Reactive'])) / 2), axis=1), index=pair_buffer_df.index)
    if 'delta' in features:
        cluster_df['delta'] = pd.Series(pair_buffer_df.apply(lambda row:
                                                                  (row['T2 Time'] - row['T1 Time']), axis=1), index=pair_buffer_df.index)
        cluster_df['delta'] = cluster_df[
            'delta'].apply(lambda x: int(x) / 6e10)

    if 'hour_of_use' in features:
        cluster_df['hour_of_use'] = pd.DatetimeIndex(
            pair_buffer_df['T1 Time']).hour

    if 'sd_event' in features:
        cluster_df['sd_event'] = pd.Series(pair_buffer_df.apply(lambda row:
                                                                     (df.power[row['T1 Time']:row['T2 Time']]).std(), axis=1), index=pair_buffer_df.index)

    X = cluster_df.values.reshape((len(cluster_df.index), len(features)))
    ms = MeanShift(bin_seeding=True)
    ms.fit(X)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)

    return pd.DataFrame(cluster_centers, columns=features)
Example #10
def mean(X, save_fig=False, params_labels=None, prefix='clusters'):
    '''
    Compute clustering with MeanShift
    '''
    logger.debug('Calculating MeanShift clusters using %d parameters'%len(X[0]))
    
    X = np.array( X )
    
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        bandwidth = estimate_bandwidth(X, quantile=0.2)
    
        ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
        ms.fit(X)
        
    labels = ms.labels_
    
    if save_fig:
        plotClusters(X, ms, method='mean', prefix=prefix,
                     params=params_labels)
    
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    
    logger.debug('Found %d clusters with MeanShift algorithm'%n_clusters_)
    
    return labels
def find_clusters(feature, items, bandwidth=None, min_bin_freq=None, cluster_all=True, n_jobs=1):
    """
    Cluster list of items based on feature using meanshift algorithm (Binning).

    :param feature: key used to retrieve item to cluster on
    :param items:
    :param bandwidth:
    :param min_bin_freq:
    :param cluster_all:
    :return:
    """
    x = [item[feature] for item in items]
    X = np.array(list(zip(x, np.zeros(len(x)))), dtype=float)
    ms = MeanShift(bandwidth=bandwidth, min_bin_freq=min_bin_freq, cluster_all=cluster_all, n_jobs=n_jobs)
    ms.fit(X)

    labels = ms.labels_
    labels_unique = np.unique(labels)

    n_clusters_ = len(labels_unique)

    clusters = []

    for k in range(n_clusters_):
        if k != -1:
            my_members = labels == k
            cluster_center = np.median(X[my_members, 0])
            cluster_sd = np.std(X[my_members, 0])
            clusters.append({
                'center': cluster_center,
                'sd': cluster_sd,
                'items': X[my_members, 0]
            })

    return clusters
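A brief usage sketch for find_clusters; the items list, the 'length' key, and the bandwidth are assumptions for illustration.

items = [{'length': v} for v in [1.0, 1.1, 0.9, 5.0, 5.2, 5.1, 5.05]]  # hypothetical records
for c in find_clusters('length', items, bandwidth=0.5, min_bin_freq=1):
    print(c['center'], c['sd'], len(c['items']))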
Example #12
def Mean_Shift(path):
    # import the data
    data = pandas.read_csv(filepath_or_buffer=path, delimiter=',', encoding='utf-8')
    data = data.drop_duplicates()
    print (data)
    # read the data
    values = data[['latitude', 'longitude']].values
    print("printing values")
    print (values)
    # Mean shift
    print ("Clustering data with the Meanshift algorithm")
    bandwidth = estimate_bandwidth(values, quantile=0.003, n_samples=None)
    #ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, min_bin_freq=20, cluster_all=False)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True,min_bin_freq=25,cluster_all=False)
    ms.fit(values)
    data['cluster'] = ms.labels_
    data = data.sort(columns='cluster')
    data = data[(data['cluster'] != -1)]
    print (data['cluster'])
    data['cluster'] = data['cluster'].apply(lambda x:"cluster" +str(x))
    labels_unique = np.unique(ms.labels_).tolist()
    del labels_unique[0]
    # Filtering clusters centers according to data filter
    cluster_centers = DataFrame(ms.cluster_centers_, columns=['latitude', 'longitude'])
    cluster_centers['cluster'] = labels_unique
    print (cluster_centers)
    n_centers_ = len(cluster_centers)
    print("number of clusters is :%d" % n_centers_)
    # print ("Exporting clusters to {}...'.format(clusters_file)")
    data.to_csv(path_or_buf="output/points.csv", cols=['user','latitude','longitude','cluster','picture','datetaken'], encoding='utf-8')
    #print ("Exporting clusters centers to {}...'.format(centers_file)")
    cluster_centers['cluster'] = cluster_centers['cluster'].apply(lambda x:"cluster" +str(x))
    cluster_centers.to_csv(path_or_buf="output/centers.csv", cols=['latitude', 'longitude','cluster'], encoding='utf-8')
    plot_meanshift(data, cluster_centers, n_centers_)
    return 0
    def CombinedMeanShift(self, h, alpha,
                          PrincComp=None,
                          njobs=-2,
                          mbf=1):
        """Performs the scikit-learn Mean Shift clustering.

        Arguments:

        h -- the bandwidth
        alpha -- the weight of the principal components as compared
        to the spatial data.
        PrincComp -- used to pass already-computed principal components
        njobs -- the number of processes to be used (default: n. of CPU - 1)
        mbf -- the minimum number of items in a seed"""

        MS = MeanShift(bin_seeding=True, bandwidth=h, cluster_all=True,
                       min_bin_freq=mbf, n_jobs=njobs)
        if PrincComp is None:
            PrincComp = self.ShapePCA(2)
        print("Starting sklearn Mean Shift... ")
        stdout.flush()
        fourvector = np.vstack((self.__data, alpha * PrincComp))
        MS.fit_predict(fourvector.T)
        self.__ClusterID = MS.labels_
        self.__c = MS.cluster_centers_.T
        print("done.")
        stdout.flush()
def meanShift(points):
  # perform meanshift clustering of data
  meanshift = MeanShift()
  meanshift.fit(points.T)
  labels = meanshift.labels_
  centers = meanshift.cluster_centers_
  return np.array(labels)
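A hedged usage note for the helper above: because it fits on points.T, the input is presumably laid out as (n_features, n_samples); the toy array below is illustrative only.

import numpy as np

pts = np.random.RandomState(0).rand(2, 40)   # 2 feature rows, 40 sample columns
labels = meanShift(pts)                      # internally clusters pts.T, i.e. 40 samples in 2-D
print(np.unique(labels))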
Example #15
def cluster_data(data,clustering_method,num_clusters):
    cluster_centers = labels_unique = labels = extra = None
    if clustering_method == 'KMeans':
        # http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans
        k_means = KMeans(n_clusters=num_clusters,init='k-means++',n_init=10,max_iter=100,tol=0.0001,
                                precompute_distances=True, verbose=0, random_state=None, copy_x=True, n_jobs=1)
        k_means.fit(data)
        labels = k_means.labels_
        cluster_centers = k_means.cluster_centers_
    elif clustering_method == 'MeanShift':
        ms =  MeanShift( bin_seeding=True,cluster_all=False)
        ms.fit(data)
        labels = ms.labels_
        cluster_centers = ms.cluster_centers_
    elif clustering_method == 'AffinityPropagation':
        af = AffinityPropagation().fit(data)
        cluster_centers = [data[i] for i in  af.cluster_centers_indices_]
        labels = af.labels_
    elif clustering_method == "AgglomerativeClustering":
        n_neighbors=min(10,len(data)/2)
        connectivity = kneighbors_graph(data, n_neighbors=n_neighbors)
        ward = AgglomerativeClustering(n_clusters=num_clusters, connectivity=connectivity,
                               linkage='ward').fit(data)
        labels = ward.labels_
    elif clustering_method == "DBSCAN":
        db = DBSCAN().fit(data)
        core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True
        extra = core_samples_mask
        labels = db.labels_

    if labels is not None:
        labels_unique = np.unique(labels)
    return labels,cluster_centers,labels_unique,extra
Example #16
def simplify_data1(x):
	X = np.array(zip(x,np.zeros(len(x))), dtype=np.float)
	bandwidth = estimate_bandwidth(X, quantile=0.2)
	ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
	ms.fit(X)
	labels = ms.labels_
	cluster_centers = ms.cluster_centers_
	labels_unique = np.unique(labels)
	n_clusters_ = len(labels_unique)
	#print n_clusters_
	#exit()
	start=0
	value=0
	print x
	for k in range(n_clusters_):
	    my_members = labels == k
	    print "cluster {0}: {1}".format(k, X[my_members, 0]),np.average(X[my_members, 0])
	    value=np.average(X[my_members, 0])
	    val2=0
	    for i in xrange(start,start+len(X[my_members, 0])):
		val2+=X[i][0]
		print val2,X[i][0],i
		X[i][0]=value
	    print "FINAL",val2/len(X[my_members, 0])
	    start+=len(X[my_members, 0])
	return X[:,0]
def mean_shift_cluster_analysis(x,y,quantile=0.2,n_samples=1000):
    # ADAPTED FROM:
    # http://scikit-learn.org/stable/auto_examples/cluster/plot_mean_shift.html#example-cluster-plot-mean-shift-py
    # The following bandwidth can be automatically detected using estimate_bandwidth
    X = np.hstack((x.reshape((x.shape[0],1)),y.reshape((y.shape[0],1))))
    bandwidth = estimate_bandwidth(X, quantile=quantile, n_samples=n_samples)
    
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    
    #print("number of estimated clusters : %d" % n_clusters_)
    colors = 'bgrcmykbgrcmykbgrcmykbgrcmykbgrcmykbgrcmykbgrcmyk' #cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for i in xrange(len(np.unique(labels))):
        my_members = labels == i
        cluster_center = cluster_centers[i]
        plt.scatter(X[my_members, 0], X[my_members, 1],s=90,c=colors[i],alpha=0.7)
        plt.scatter(cluster_center[0], cluster_center[1],marker='+',s=280,c=colors[i])
    tolx = (X[:,0].max()-X[:,0].min())*0.03
    toly = (X[:,1].max()-X[:,1].min())*0.03
    plt.xlim(X[:,0].min()-tolx,X[:,0].max()+tolx)
    plt.ylim(X[:,1].min()-toly,X[:,1].max()+toly)
    plt.show()
    return labels
Example #18
    def centers_y_clusters(self, graph_db, nodes, consulta, cyprop):
        group = []
        todo = []
        rr = []
        for n in nodes:
            tiene = neo4j.CypherQuery(graph_db, consulta + " where id(n) =" + str(n.id) + " return count(distinct(e))" + cyprop + " as cuenta").execute()
            for r in tiene:
                todo.append([r.cuenta])
                rr.append(r.cuenta)

        ms = MeanShift(bin_seeding=True)
        ms.fit(np.asarray(todo))
        labels = ms.labels_
        cluster_centers = sorted(ms.cluster_centers_, key=lambda x: x[0])
        for idx, cl in enumerate(cluster_centers):
            cluster_centers[idx] = float(cl[0])
        for u in cluster_centers:
            group.append([])
        for n in nodes:
            tiene = neo4j.CypherQuery(graph_db, consulta + " where id(n) =" + str(n.id) + " return count(distinct(e))" + cyprop + " as cuenta").execute()
            for r in tiene:
                valor = r.cuenta
            for idx, v in enumerate(cluster_centers):
                if idx == 0:
                    temp1 = -9999
                else:
                    temp1 = (cluster_centers[idx - 1] + cluster_centers[idx]) / 2
                if idx == len(cluster_centers) - 1:
                    temp2 = 99999
                else:
                    temp2 = (cluster_centers[idx + 1] + cluster_centers[idx]) / 2
                if temp1 <= valor < temp2:
                    group[idx].append(n)
        return cluster_centers, group
Example #19
def BA_meanshift_cluster(mark, chrom):
    '''
    @param:
    @return:
    perform mean shift cluster on 2D data:
        ((chromStart+chromEnd)*0.5, chromEnd-chromStart)
    '''
    path = os.path.join(get_data_dir(), "tmp", mark,"{0}-{1}.csv".format(chrom, mark))
    DF = pd.read_csv(path, sep='\t')
    S_x = 0.5*(DF.loc[:, 'chromEnd'].values+DF.loc[:, 'chromStart'].values)
    S_y = DF.loc[:, 'chromEnd'].values-DF.loc[:, 'chromStart'].values
    X = np.hstack((np.atleast_2d(S_x[7000:8000]).T, np.atleast_2d(S_y[7000:8000]).T))
    print X
    bandwidth = estimate_bandwidth(X, quantile=0.1, n_samples=1000)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)
    labels = ms.labels_
    print list(set(labels))
    import matplotlib.pyplot as plt
    from itertools import cycle
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(len(list(set(labels)))), colors):
        my_members = labels == k
        plt.plot(X[my_members, 0], X[my_members, 1], col + '.')
    plt.title('Estimated number of clusters: %d' % len(list(set(labels))))
    plt.show()
def make(filename, precision):
    with open('test.geojson') as f:
        data = json.load(f)

    features = data['features']
    points = [
        geo['geometry']["coordinates"]
        for geo in features if pred(geo)
    ]
    print points
    ar_points = array(points).reshape(len(points) * 2, 2)
    print ar_points
    bandwidth = estimate_bandwidth(ar_points) / precision
    cluster = MeanShift(bandwidth=bandwidth)
    cluster.fit(ar_points)
    labels = cluster.labels_
    cluster_centers = cluster.cluster_centers_
    print 'clusters:', len(unique(labels))

    for i, geo in enumerate(filter(pred, features)):
        geo['geometry']["coordinates"] = [
            list(cluster_centers[labels[i*2 + j]])
            for j in range(2)
        ]

    with open(filename, 'w') as f:
        json.dump(data, f)
def do_meanshift(s_path, band1, band2, band3, band4, colour1, colour2,
                 make_plot):
    '''Meanshift clustering to determine the number of clusters in the
        data, which is passed to KMEANS function'''
    # Truncate data
    X = np.vstack([colour1, colour2]).T
    '''Compute clustering with MeanShift'''
    # Scale data because meanshift generates circular clusters
    X_scaled = preprocessing.scale(X)
    # The following bandwidth can be automatically detected using
    # the routine estimate_bandwidth(X). Bandwidth can also be set manually.
    bandwidth = estimate_bandwidth(X)
    #bandwidth = 0.65
    # Meanshift clustering
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, cluster_all=False)
    ms.fit(X_scaled)
    labels_unique = np.unique(ms.labels_)

    objects = ms.labels_[ms.labels_ >= 0]
    n_clusters = len(labels_unique[labels_unique >= 0])
    # Make plot
    if "meanshift" in make_plot:
        make_ms_plots(s_path, colour1, colour2, n_clusters, X, ms,
                      band1, band2, band3, band4, objects)
    return(n_clusters, bandwidth)
def ms_algo(X, bandwidth=None):
    if bandwidth is None:
        n_samples = X.shape[0]
        bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=n_samples)

    # Apply the mean shift algorithm from the sklearn library
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)

    # collect from the meanshift algorithm the labels and the centers of the clusters
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_


    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique) #Number of clusters

    # Print section
    print("The number of clusters is: %d" % n_clusters_)

    print("The centers are:")
    for i in range(n_clusters_):
        print i,
        print cluster_centers[i]

    return cluster_centers    
Example #23
def meanshift(raw_data, t):
    # Compute clustering with MeanShift
    # The following bandwidth can be automatically detected using estimate_bandwidth
    #data = [ [(raw_data[i, 1]+raw_data[i, 5]), (raw_data[i, 2]+raw_data[i,6])] for i in range(raw_data.shape[0]) ]
    data = np.zeros((raw_data.shape[0],2))
    X = raw_data[:,1] + raw_data[:,5]
    Y = raw_data[:,2] + raw_data[:,6]
    #X = raw_data[:,1] ; Y = raw_data[:,2];
    data = np.transpose(np.concatenate((np.mat(X),np.mat(Y)), axis=0))
    bandwidth = estimate_bandwidth(data, quantile=0.2, n_samples=500)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(data)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    print("number of estimated clusters : %d" % n_clusters_) 
    # Plot result
    plt.figure(t)
    plt.clf()
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(n_clusters_), colors):
        my_members = labels == k
        cluster_center = cluster_centers[k]
        plt.plot(data[my_members, 0], data[my_members, 1], col + '.')
        plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=14)
    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.axis('equal')
    plt.show()    
Example #24
def mean_shift(X):
    bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=1000)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, cluster_all=False)
    ms.fit(X)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    return labels, cluster_centers
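A short, hedged usage sketch for mean_shift; the two synthetic blobs are assumed for illustration.

import numpy as np

rng = np.random.RandomState(42)
X_demo = np.vstack([rng.normal(0.0, 0.3, (150, 2)), rng.normal(4.0, 0.3, (150, 2))])  # hypothetical blobs
labels, centers = mean_shift(X_demo)
print(len(centers), "cluster centres;", (labels == -1).sum(), "orphan points")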
Example #25
def train(trainingData, pklFile, clusteringAll, numberOfClusters=None):
	# ========================================================================= #
	# =============== STEP 1. DEFINE OUTPUT LEARNT MODEL FILE ================= #
	# ========================================================================= #
	if (pklFile == ''):
		os.system('rm -rf learntModel & mkdir learntModel')
		pklFile = 'learntModel/learntModel.pkl'
	
	# ========================================================================= #
	# =============== STEP 2. PERFORM CLUSTERING TO THE DATA ================== #
	# ========================================================================= #
	if (numberOfClusters == None):
		print "Running MeanShift Model..."
		bandwidth = estimate_bandwidth(trainingData)
		ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=clusteringAll)
		ms.fit(trainingData)
		joblib.dump(ms, pklFile)
		return {"numberOfClusters":len(ms.cluster_centers_), "labels": ms.labels_, "clusterCenters":ms.cluster_centers_}
	
	elif (numberOfClusters != None):
		print "Running K-Means Model..."
		kMeans = KMeans(init='k-means++', n_clusters=numberOfClusters)
		kMeans.fit(trainingData)
		joblib.dump(kMeans, pklFile)
		return {"numberOfClusters":len(kMeans.cluster_centers_), "labels": kMeans.labels_, "clusterCenters":kMeans.cluster_centers_}
def weekhour(lst,day,hour,num):

    l = [ ]
    for dicts in lst:
        latlong = dicts["latlong"]
        l.append(latlong)
    l = np.array(l)
    l = np.array([x for x in l if x[0] < 40])
    l = np.array([x for x in l if x[1] < -102.0])
    l = np.array([x for x in l if x[0] > 39])
    l = np.array([x for x in l if x[1] > -105.5])

    bandwidth = .001
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(l)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)


    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(n_clusters_), colors):
        my_members = labels == k
        cluster_center = cluster_centers[k]
        plt.plot(l[my_members,1], l[my_members,0], col + '.')
        plt.plot(cluster_center[1], cluster_center[0], 'x', markerfacecolor=col,\
    markeredgecolor='k', markersize=14)

    num_samples = len(labels)
    list_clust_cents = cluster_centers.tolist()
    num_labels = Counter(labels).most_common()
    top = tuple(num_labels)

    if num > n_clusters_:
        num = n_clusters_

    for i in range(num):
        densest = top[i][1]
        percent = round((float(densest)/float(num_samples))*100,3)
        if densest >= 60:
            import geocoder
            g = geocoder.google(list_clust_cents[i], method='reverse')
            address = g.address
        else:
            address = 0

        with open('weekdayclusterstest.csv', 'a') as csvfile:
            fieldnames = ['day', 'hour', 'densest cluster', 'address', 'percent', 
                          'number of samples', 'number of estimated clusters']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerow({'densest cluster': densest, \
                             'day': day, \
                             'hour': hour, \
                             'address': address, \
                             'percent': percent, \
                             'number of samples': num_samples, \
                             'number of estimated clusters': n_clusters_})
Example #27
def meanshiftUsingPCA(path):
	# Load original image given the image path
	im = cv.LoadImageM(path)
	#convert image to YUV color space
	cv.CvtColor(im,im,cv.CV_BGR2YCrCb)
	# Load bank of filters
	filterBank = lmfilters.loadLMFilters()
	# Resize image to decrease dimensions during clustering
	resize_factor = 1
	thumbnail = cv.CreateMat(im.height / resize_factor, im.width / resize_factor, cv.CV_8UC3)
	cv.Resize(im, thumbnail)
	# now work with resized thumbnail image
	response = np.zeros(shape=((thumbnail.height)*(thumbnail.width),51), dtype=float)
	for f in xrange(0,48):
		filter = filterBank[f]
		# Resize the filter with the same factor for the resized image
		dst = cv.CreateImage(cv.GetSize(thumbnail), cv.IPL_DEPTH_32F, 3)
		resizedFilter = cv.CreateMat(filter.height / resize_factor, filter.width / resize_factor, filter.type)
		cv.Resize(filter, resizedFilter)
		# Apply the current filter
		cv.Filter2D(thumbnail,dst,resizedFilter)
		for j in xrange(0,thumbnail.height):
			for i in xrange(0,thumbnail.width):
				# Select the max. along the three channels
				maxRes = max(dst[j,i])
				if math.isnan(maxRes):
					maxRes = 0.0
				if maxRes > response[thumbnail.width*j+i,f]:
					# Store the max. response for the given feature index
					response[thumbnail.width*j+i,f] = maxRes

	#YUV features
	count = 0
	for j in xrange(0,thumbnail.height):
		for i in xrange(0,thumbnail.width):
			response[count,48] = thumbnail[j,i][0]
			response[count,49] = thumbnail[j,i][1]
			response[count,50] = thumbnail[j,i][2]
			count+=1

	#get the first 4 primary components using pca
	pca = PCA(response)
	pcaResponse = zeros([thumbnail.height*thumbnail.width,4])

	for i in xrange(0,thumbnail.height*thumbnail.width):
		pcaResponse[i] = pca.getPCA(response[i],4)

	# Create new mean shift instance
	ms = MeanShift(bandwidth=10,bin_seeding=True)
	# Apply the mean shift clustering algorithm
	ms.fit(pcaResponse)
	labels = ms.labels_
	n_clusters_ = np.unique(labels)
	print "Number of clusters: ", len(n_clusters_)
	repaintImage(thumbnail,labels)
	cv.Resize(thumbnail, im)
	return im
Example #28
def do_meanshift (band1, band2, band3, band4, colour1, colour2, make_plots):
    '''Does meanshift clustering to determine a number of clusters in the 
        data, which is passed to KMEANS function'''

    data = np.loadtxt(inputdata)

    #Input Checking
    #if band1 == band2 or band3 == band4: 
        #print "Not a good idea to use the same band in one colour, try again"
        #return
    #for band in [band1, band2, band3, band4]:
        #if band not in band_names.keys():
            #print "Can't find %s in band_name list" %band
            #return
        
    #Import 4 different wavelengths
    #Colour 1: 05_mag
    wave1 = data[:, band_names[band1]]
    wave2 = data[:, band_names[band2]]
    
    #Colour 2: 05_mag
    wave3 = data[:, band_names[band3]]
    wave4 = data[:, band_names[band4]]
    
    gooddata1 = np.logical_and(np.logical_and(wave1!=badval, wave2!=badval), np.logical_and(wave3!=badval, wave4!=badval)) # Remove data pieces with no value 
    gooddata2 = np.logical_and(np.logical_and(wave1<maglim, wave2<maglim), np.logical_and(wave3<maglim, wave4<maglim))
    greatdata = np.logical_and(gooddata1, gooddata2)
    
    colour1 = wave1[greatdata] - wave2[greatdata]
    colour2 = wave3[greatdata] - wave4[greatdata]
    
      
    #Truncate data
    X = np.vstack([colour1, colour2]).T

    #Scale data because meanshift generates circular clusters 
    X_scaled = preprocessing.scale(X)

    # The following bandwidth can be automatically detected using
    # the routine estimate_bandwidth(). Bandwidth can also be set
    # as a value.

    bandwidth = estimate_bandwidth(X)

    # Meanshift clustering 
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, cluster_all=False)
    ms.fit(X_scaled)

    labels_unique = np.unique(ms.labels_)
    n_clusters = len(labels_unique[labels_unique >= 0])
    
    #Make plot of clusters if needed
    
    if "MSplot" in make_plots:
        make_ms_plots(colour1, colour2, n_clusters, X, ms, band1, band2, band3, band4)
    
    return(n_clusters)
def meanShiftClustering(centers_df,subject):
    #estimate the bandwidth to use with the mean shift algorithm. The quantile represents the
    #distance used between the box centers to define a cluster: a smaller quantile means a smaller
    #distance between points that end up in the same cluster
    centers_df=centers_df.reset_index()
    bandwidth=estimate_bandwidth(centers_df[['center_x','center_y']].as_matrix(), quantile=0.0055)
    #instantiate the mean shift algorithm
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    #fit the algorithm on the box center coordinates
    ms.fit(centers_df[['center_x','center_y']])
    #get the resulting clustesr labels
    labels = ms.labels_
    #get the resulting centers of each *cluster*
    cluster_centers = ms.cluster_centers_

    labels_unique = np.unique(labels)
    #calculate the number of clusters by using the length of the list that contains all the unique labels
    n_clusters_ = len(labels_unique)

    #concatenate the centers data frame (which contains all the box coordinates, their dimensions, and their centers) with the clustering labels generated by the clustering
    boxes_df = pd.concat([centers_df,pd.DataFrame(labels,columns=['cluster_label'])],axis=1)

    #the aggregate function in the groupby, includes two functions: count and median
    f = {'Number of boxes in a cluster': ['count'],'Median': ['median']}
    #group by the label of each cluster and aggregate the boxes' top left coordinates and dimensions by applying the median
    aggregated_df = boxes_df.groupby('cluster_label')['cluster_label','tl_x','tl_y','width','height'].agg(f).reset_index()
    #change column names for a more descriptive name
    aggregated_df.columns = ['cluster_label','median_cluster_label','agg_tl_x','agg_tl_y','agg_width','agg_height','boxes_in_cluster','count_tl_x','count_tl_y','count_width','count_height']
    #leave out the unnecessary columns
    aggregated_df = aggregated_df[['cluster_label','agg_tl_x','agg_tl_y','agg_width','agg_height','boxes_in_cluster']]
    
    #Look at the output of the plotBoxes function (svg file) and determine at which THRESHOLD value there is a desired number of clusters (appears at the top of the plot) and that it visually matches the actual grid
    THRESHOLD = 5

    #filter out all the clusters that have less than a certain number of boxes in each cluster
    #use the old-weather-aggregator-with-plot.py script to check what the best threshold is
    aggregated_df = aggregated_df.loc[aggregated_df.boxes_in_cluster>THRESHOLD,:]
    good_clusters = np.unique(aggregated_df.cluster_label.values)

    print "for subject_id:"+str(subject)

    print "number of estimated clusters overall: %d" % n_clusters_

    print "number of estimated clusters, after small clusters were filtered out: %d" % len(good_clusters)

    print "clusters with more than %d boxes per cluster:" % THRESHOLD
    print aggregated_df.columns
    print aggregated_df.head()

    #save the aggregated boxes and their clusters into a csv file, separate file for each subject
    print "Saving the output/aggregated_df_%s.csv file..." % str(subject)
    aggregated_df.to_csv("output/aggregated_df_"+str(subject)+".csv",index=False)

    #make sure that only the boxes that belong to the good_clusters (have more boxes than the threshhold) remain in the boxes_df dataframe and then save the dataframe
    boxes_df = boxes_df.loc[boxes_df['cluster_label'].isin(good_clusters),:]
    print "Saving the output/clustered_df_%s.csv file..." % str(subject)
    boxes_df.to_csv("output/clustered_df_"+str(subject)+".csv",index=False)

    plotBoxes(aggregated_df,boxes_df,cluster_centers)
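As a side note on the quantile mentioned in the bandwidth comment above: a smaller quantile typically gives a smaller estimated bandwidth and hence more, tighter clusters. A small hedged sketch with random points (values purely illustrative):

import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth

pts = np.random.RandomState(1).rand(400, 2)          # hypothetical box-centre coordinates
for q in (0.01, 0.1, 0.3):
    bw = estimate_bandwidth(pts, quantile=q)
    n = len(np.unique(MeanShift(bandwidth=bw, bin_seeding=True).fit(pts).labels_))
    print("quantile=%.2f  bandwidth=%.3f  clusters=%d" % (q, bw, n))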
Example #30
def checkForClustering(catalog):
    debug("Checking for data clustering")
    Xfull = catalog.view(np.float64).reshape(catalog.shape + (-1,))[:,1:]
    X = Xfull[:,2:]
    
    
    debug("Using DBSCAN")
    db = DBSCAN(eps=0.3, min_samples=10).fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    n_clusters_DBSCAN = len(set(labels)) - (1 if -1 in labels else 0)
    debug('Estimated number of clusters with DBSCAN: %d' % n_clusters_DBSCAN)
        
    unique_labelsDBSCAN = set(labels)
    colorsDBSCAN = plt.cm.rainbow(np.linspace(0, 1, len(unique_labelsDBSCAN)))
    
    debug("Estimating clusters using MeanShift")
    bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=500)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)
    labelsMS = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_uniqueMS = np.unique(labelsMS)
    n_clusters_MS = len(labels_uniqueMS)
    debug("Estimated number of clusters with MeanShift: %d" % n_clusters_MS)
    
    # Plot result
    fig = plt.figure(figsize=(12,12))
    ax0 = fig.add_subplot(2,2,1)
    ax1 = fig.add_subplot(2,2,2)
    ax2 = fig.add_subplot(2,2,3)
    ax3 = fig.add_subplot(2,2,4)
    for k, col in zip(unique_labelsDBSCAN, colorsDBSCAN):
        if k == -1:
            col = 'k'
        class_member_mask = (labels == k)
        mask = class_member_mask & core_samples_mask
        xy = Xfull[mask]
        ax0.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=5)
        ax2.plot(catalog['MAG_APER(1)'][mask], catalog['CLASS_STAR'][mask], 'o', markerfacecolor=col, markeredgecolor='k', markersize=5)
        xy = Xfull[class_member_mask & ~core_samples_mask]
        ax0.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=5)
        ax2.plot(catalog['MAG_APER(1)'][class_member_mask & ~core_samples_mask], catalog['CLASS_STAR'][class_member_mask & ~core_samples_mask], 'o', markerfacecolor=col, markeredgecolor='k', markersize=5)

        ax0.set_title('DBSCAN: # clusters: %d' % n_clusters_DBSCAN)
        
        
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(n_clusters_MS), colors):
        my_members = labelsMS == k
        cluster_center = cluster_centers[k]
        ax1.plot(Xfull[my_members, 0], Xfull[my_members, 1], col + '.')
        ax3.plot(catalog['MAG_APER(1)'][my_members], catalog['CLASS_STAR'][my_members], col + '.')
        #ax1.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=14)
    ax1.set_title('MeanShift: # clusters: %d' % n_clusters_MS)
    plt.show()
Example #31
pred = {'semantics': [], 'instances': []}
with torch.no_grad():
    for i, batch in enumerate(tqdm(loader, ascii=True)):
        points = batch['points'].to(device)
        labels = batch['labels']
        size = batch['size']

        logits, embedded = model(points)
        logits = logits.cpu().numpy()
        semantics = np.argmax(logits, axis=-1)

        instances = []
        embedded = embedded.cpu().numpy()
        batch_size = embedded.shape[0]
        for b in range(batch_size):
            k = size[b].item()
            y = MeanShift(args['bandwidth'], n_jobs=8).fit_predict(embedded[b])
            instances.append(y)
        instances = np.stack(instances)

        pred['semantics'].append(semantics)
        pred['instances'].append(instances)

pred['semantics'] = np.concatenate(pred['semantics'], axis=0)
pred['instances'] = np.concatenate(pred['instances'], axis=0)

fname = os.path.join(logdir, 'pred.npz')
print('> Saving predictions to {}...'.format(fname))
np.savez(fname, **pred)
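As a small assumed follow-up (not part of the original script), the saved archive can be read back with np.load, reusing fname and np from above:

loaded = np.load(fname)
semantics, instances = loaded['semantics'], loaded['instances']
print(semantics.shape, instances.shape)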
Example #32
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import MeanShift
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import Birch
from sklearn.mixture import GaussianMixture

names = ["K-Means", "Affinity Propagation", "Spectral Clustering","Mean Shift","Agglomerative Clustering","DBSCAN","Birch"]

clusters = [
    KMeans(n_clusters=7, random_state=1),
    AffinityPropagation(damping=0.5, max_iter=200, convergence_iter=15, copy=True, preference=None, affinity='euclidean', verbose=False),
    SpectralClustering(n_clusters=7,
             assign_labels="discretize",
             random_state=1),
    MeanShift(bandwidth=2),
    AgglomerativeClustering(n_clusters=7, affinity='euclidean', memory=None, connectivity=None, compute_full_tree='auto', linkage='ward', pooling_func='deprecated'),
    DBSCAN(eps=0.5, min_samples=5, metric='euclidean', metric_params=None, algorithm='auto', leaf_size=30, p=None, n_jobs=None),
    Birch(threshold=0.5, branching_factor=50, n_clusters=7, compute_labels=True, copy=True)
    ]

#read and create features & labels variables
data = pd.read_csv('glass_data_labeled.csv')
X = data[['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe']]
y = data['Type']
# print(X)
# print(y)


for name, cl in zip(names, clusters):
    labels = cl.fit(X).labels_
Example #33
def ClusterScikit(model_file, train_file, valid_file, ftype, nsamples,
                  vectorizer, reducer, param):

    train_programs, train_features, train_classes = read_traces(
        train_file, nsamples)
    train_size = len(train_programs)
    print("using", train_size, "examples to train.")

    if vectorizer == "bow":

        train_dict = dict()
        train_dict[ftype] = train_features
        #batch_size = 16
        #window_size = 20

        print("Transforming data and fitting model..")
        model = make_cluster_pipeline_bow(ftype, reducer)
        X_red = model.fit_transform(train_dict)

    elif vectorizer == "doc2vec":

        from gensim.models.doc2vec import TaggedDocument
        from gensim.models import Doc2Vec

        print("Vectorizing traces..")
        sentences = []

        for (prog, trace) in zip(train_programs, train_features):
            sentences.append(TaggedDocument(trace.split(" "), [prog]))

        model = Doc2Vec(dm=2,
                        min_count=1,
                        window=5,
                        size=100,
                        sample=1e-4,
                        negative=5,
                        workers=8,
                        iter=1)
        model.build_vocab(sentences)

        for epoch in range(20):
            model.train(sentences)
            shuffle(sentences)

        train_dict = dict()

        vec_train_features = []
        for prog in train_programs:
            # print(prog, model.docvecs[prog])
            vec_train_features.append(model.docvecs[prog])

        train_dict[ftype] = vec_train_features

        print("Transforming data and fitting model..")
        model = make_cluster_pipeline_doc2vec(ftype, reducer)
        X_red = model.fit_transform(train_dict)

    #pl.rcParams.update({'font.size': 10})
    if isinstance(X_red, list):
        X_red = np.vstack(X_red)
        print(X_red.shape)

    if X_red.shape[1] == 2:

        plt.figure()
        colors = 'brgcmykbgrcmykbgrcmykbgrcmyk'
        ncolors = len(colors)

        for prog, [x, y], cl in zip(train_programs, X_red, train_classes):
            x = gauss(0, 0.1) + x
            y = gauss(0, 0.1) + y
            try:
                plt.scatter(x, y, c=colors[int(cl)])
                plt.text(x, y + 0.02, prog.split("/")[-1])
            except ValueError:
                plt.text(x, y + 0.02, cl)

        if valid_file is not None:
            valid_programs, valid_features, valid_classes = read_traces(
                valid_file, None)
            valid_dict = dict()
            valid_dict[ftype] = valid_features

            X_red = model.transform(valid_dict)
            for prog, [x, y], cl in zip(valid_programs, X_red, valid_classes):
                x = gauss(0, 0.1) + x
                y = gauss(0, 0.1) + y
                plt.scatter(x, y, c=colors[cl + 1])
                plt.text(x, y + 0.02, prog.split("/")[-1])

        # plt.show()
        plt.savefig(train_file.replace(".gz", "") + ".png")

    from sklearn.cluster import MeanShift, estimate_bandwidth

    bandwidth = estimate_bandwidth(X_red, quantile=0.2)
    print("Clustering with bandwidth:", bandwidth)

    af = MeanShift(bandwidth=bandwidth * param).fit(X_red)

    cluster_centers = af.cluster_centers_
    labels = af.labels_
    n_clusters_ = len(cluster_centers)

    if X_red.shape[1] == 2:

        plt.close('all')
        plt.figure(1)
        plt.clf()

        for ([x, y], label, cluster_label) in zip(X_red, train_programs,
                                                  labels):
            x = gauss(0, 0.1) + x
            y = gauss(0, 0.1) + y
            plt.scatter(x, y, c=colors[cluster_label % ncolors])

        for i, [x, y] in enumerate(cluster_centers):
            plt.plot(x,
                     y,
                     'o',
                     markerfacecolor=colors[i % ncolors],
                     markeredgecolor='k',
                     markersize=7)

        plt.title('Estimated number of clusters: %d' % n_clusters_)
        plt.savefig(train_file.replace(".gz", "") + ".clusters.png")

    # plt.show()

    clustered_traces = zip(train_programs, labels)
    writer = write_csv(train_file.replace(".gz", "") + ".clusters")
    for label, cluster in clustered_traces:
        writer.writerow([label.split("/")[-1], cluster])
Example #34
#Estimate bandwidth
#bandwidth increases only slightly when the number of samples is increased, so there is
#not much visible difference in the result
bandwidth1 = estimate_bandwidth(flat_image, quantile=.1, n_samples=500)
bandwidth2 = estimate_bandwidth(flat_image, quantile=.2, n_samples=500)
#bandwidth3 = estimate_bandwidth(flat_image, quantile=.1, n_samples=1000)
#bandwidth4 = estimate_bandwidth(flat_image, quantile=.3, n_samples=1000)
bandwidth3 = estimate_bandwidth(flat_image, quantile=.3, n_samples=500)
bandwidth4 = estimate_bandwidth(flat_image, quantile=.4, n_samples=500)

print(bandwidth1)
print(bandwidth2)
print(bandwidth3)
print(bandwidth4)

ms1 = MeanShift(bandwidth1, bin_seeding=True)
ms2 = MeanShift(bandwidth2, bin_seeding=True)
ms3 = MeanShift(bandwidth3, bin_seeding=True)
ms4 = MeanShift(bandwidth4, bin_seeding=True)
#print(ms1)

#Performing meanshift on flatImg
ms1.fit(flat_image)
ms2.fit(flat_image)
ms3.fit(flat_image)
ms4.fit(flat_image)

#(r,g,b) vectors corresponding to the different clusters after meanshift
labels1 = ms1.labels_
labels2 = ms2.labels_
labels3 = ms3.labels_
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth
from scipy.fftpack import dct
import matplotlib.pyplot as plt
from PIL import Image

image = Image.open(r'C:\Users\Cris\Desktop\mountain_color.jpg')
image = np.array(image)

#Need to convert image into feature array based
#on rgb intensities
flat_image=np.reshape(image, [-1, 3])
 
#Estimate bandwidth
bandwidth2 = estimate_bandwidth(flat_image,
                                quantile=.2, n_samples=5000)
ms = MeanShift(bandwidth2, bin_seeding=True)
ms.fit(flat_image)
labels=ms.labels_

# Example of how to use discrete cosine transform. 
# We will apply it to luminance, rather than labels.
discrete_cosine_transform = dct(np.array(labels, dtype = 'float'))

np.savetxt(r'C:\Users\Cris\Desktop\labels.csv', labels, delimiter=',')
 
# Plot image vs segmented image

plt.figure(2)
plt.subplot(2, 1, 1)
plt.imshow(image)
plt.axis('off')
Example #36
#     "DBScan": DBSCAN(),
#     "OPTICS": OPTICS()
# }

# # fit_predict method for each algorithm - because DBScan and OPTICS doesn't have predict() method
# fit_predict = {
#     "k-Means": lambda clr, X_train, X_test: clr.fit(X_train).predict(X_test),
#     "MeanShift": lambda clr, X_train, X_test: clr.fit(X_train).predict(X_test),
#     "DBScan": lambda clr, _, X_test: clr.fit_predict(X_test),
#     "OPTICS": lambda clr, _, X_test: clr.fit_predict(X_test)
# }

# Algorithms
clrs = {
    "MyMeanShift": MyMeanShift(),
    "MeanShift": MeanShift(
    ),  # if not given, the bandwidth is estimated using sklearn.cluster.estimate_bandwidth
}

# fit_predict method for each algorithm - because DBScan and OPTICS doesn't have predict() method
fit_predict = {
    "MyMeanShift":
    lambda clr, X_train, X_test: clr.fit(X_train).predict(X_test),
    "MeanShift": lambda clr, X_train, X_test: clr.fit(X_train).predict(X_test),
}

# Measures
measures = {
    "adjusted_rand_score": adjusted_rand_score,
    "completeness_score": completeness_score,
    "homogeneity_score": homogeneity_score,
    "v_measure_score": v_measure_score,
Example #37
def test_meanshift_predict():
    # Test MeanShift.predict
    ms = MeanShift(bandwidth=1.2)
    labels = ms.fit_predict(X)
    labels2 = ms.predict(X)
    assert_array_equal(labels, labels2)
Example #38
def test_unfitted():
    # Non-regression: before fit, there should be not fitted attributes.
    ms = MeanShift()
    assert not hasattr(ms, "cluster_centers_")
    assert not hasattr(ms, "labels_")
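By contrast, a minimal hedged sketch (toy data assumed) showing that these attributes exist once fit has been called:

import numpy as np
from sklearn.cluster import MeanShift

X_demo = np.array([[1.0, 1.0], [1.1, 0.9], [5.0, 5.0], [5.1, 4.9]])  # hypothetical points
ms = MeanShift(bandwidth=1.2).fit(X_demo)
assert hasattr(ms, "cluster_centers_") and hasattr(ms, "labels_")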
def main():
    # generate theme
    sg.theme('DarkAmber')
    # All the stuff inside your window.
    layout = [[
        sg.Image(filename='', key='-frame-'),
        sg.Image(filename='', key='-model-')
    ], [sg.Button('Learn Model'), sg.Button('Close')]]
    # Create the Window
    window = sg.Window('SIFT model Learning GUI',
                       layout,
                       location=(800, 400),
                       finalize=True)
    # init frame acquisition
    cam = cv2.VideoCapture(0)
    print("Camera init -> DONE")
    # init feature detector SIFT
    SIFT = cv2.SIFT_create()
    # init feature matcher KNN
    matcher = cv2.DescriptorMatcher_create("BruteForce")
    ratio_thresh = 0.80
    # init state frame and model
    ret, scene_img_RGB = cam.read()
    model_img = np.zeros((480, 640, 3))
    # init model keypoint and descriptor
    kp_obj = None
    des_obj = None
    # Event Loop to process "events" and get the "values" of the inputs
    while True:
        # read state windows
        event, values = window.read(timeout=0, timeout_key='timeout')
        # get state
        ret, frame = cam.read()
        scene_img_RGB = frame
        # start / stop the application
        if event == 'Close' or event is None:
            # kill thread and close window
            cam.release()
            cv2.destroyAllWindows()
            # stop program
            break
        # Learn model
        if event == 'Learn Model' or event is None:
            # get object ROI
            roi = cv2.selectROI(scene_img_RGB)
            model_img = scene_img_RGB[int(roi[1]):int(roi[1] + roi[3]),
                                      int(roi[0]):int(roi[0] + roi[2])]
            cv2.destroyAllWindows()
            # find feature in the object ROI
            kp_obj, des_obj = SIFT.detectAndCompute(
                cv2.cvtColor(model_img, cv2.COLOR_BGR2GRAY), None)
            des_obj /= (des_obj.sum(axis=1, keepdims=True) + 1e-7)
            des_obj = np.sqrt(des_obj)
            # draw detected feature in the ROI
            model_img = cv2.drawKeypoints(
                model_img,
                kp_obj,
                None,
                flags=cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS)
        # perform object detection in the current state
        if model_img is not None and kp_obj is not None and des_obj is not None:
            # convert frame in gray
            scene_img = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            # get feature in the scene
            kp_scene, des_scene = SIFT.detectAndCompute(scene_img, None)
            des_scene /= (des_scene.sum(axis=1, keepdims=True) + 1e-7)
            des_scene = np.sqrt(des_scene)
            # match feature with the template using KNN matching (norm L2)
            knn_matches = matcher.knnMatch(des_obj, des_scene, 2)
            # filter matches (lowe ratio)
            good_matches = []
            for m, n in knn_matches:
                if m.distance < ratio_thresh * n.distance:
                    good_matches.append(m)
            # create empty keypoint position vector for all good matches
            obj = np.empty((len(good_matches), 2), dtype=np.float32)
            scene = np.empty((len(good_matches), 2), dtype=np.float32)
            # update keypoints position
            for i in range(len(good_matches)):
                obj[i, 0] = kp_obj[good_matches[i].queryIdx].pt[0]
                obj[i, 1] = kp_obj[good_matches[i].queryIdx].pt[1]
                scene[i, 0] = kp_scene[good_matches[i].trainIdx].pt[0]
                scene[i, 1] = kp_scene[good_matches[i].trainIdx].pt[1]
            if scene.shape[0] > 10:
                # compute bandwith for the clustering
                bandwidth = estimate_bandwidth(scene, quantile=0.2)
                # compute clusters for keypoint
                meanShift = MeanShift(bandwidth=bandwidth, bin_seeding=True)
                meanShift.fit(scene)
                labels = meanShift.labels_
                clusterCenters = meanShift.cluster_centers_
                # compute pose using cluster label
                for c in range(len(clusterCenters)):
                    currentCluster = labels == c
                    objPoint = obj[currentCluster, :]
                    scenePoint = scene[currentCluster, :]
                    # if cluster point number superior to a threshold e=10
                    if scenePoint.shape[0] > 10:
                        # estimate homographical transformation
                        TF, mask = cv2.findHomography(objPoint, scenePoint,
                                                      cv2.RANSAC, 0.99)
                        if TF is not None and mask[mask == 1].size > 15:
                            # transform obj corners according the homographical transformation in the scene
                            h, w, c = model_img.shape
                            pts = np.float32([[0, 0], [0,
                                                       h - 1], [w - 1, h - 1],
                                              [w - 1, 0]]).reshape(-1, 1, 2)
                            dst = cv2.perspectiveTransform(pts, TF)
                            for i in range(scenePoint.shape[0]):
                                scene_img_RGB = cv2.circle(
                                    scene_img_RGB,
                                    (scenePoint[i, 0], scenePoint[i, 1]), 5,
                                    [0, 255, 255], -1)
                                scene_img_RGB = cv2.polylines(
                                    scene_img_RGB, [np.int32(dst)], True,
                                    (0, 255, 0), 20, cv2.LINE_AA)
        # update image on the GUI
        imgbytes_frame = cv2.imencode(
            '.png', cv2.resize(scene_img_RGB, (640, 480),
                               cv2.INTER_LINEAR))[1].tobytes()
        imgbytes_model = cv2.imencode('.png', model_img)[1].tobytes()
        window['-frame-'].update(data=imgbytes_frame)
        window['-model-'].update(data=imgbytes_model)

    window.close()
Example #40
import csv
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import MeanShift, estimate_bandwidth

input_file = 'sales.csv'
file_reader = csv.reader(open(input_file, 'r'), delimiter=',')
X = []
for count, row in enumerate(file_reader):
    if not count:
        names = row[1:]
        continue

    X.append([float(x) for x in row[1:]])

X = np.array(X)

bandwidth = estimate_bandwidth(X, quantile=0.8, n_samples=len(X))

meanshift_model = MeanShift(bandwidth=bandwidth, bin_seeding=True)
meanshift_model.fit(X)
labels = meanshift_model.labels_
cluster_centers = meanshift_model.cluster_centers_
num_clusters = len(np.unique(labels))

print("\nNumber of clusters in input data =", num_clusters)

print("\nCenters of clusters:")
print('\t'.join([name[:3] for name in names]))
for cluster_center in cluster_centers:
    print('\t'.join([str(int(x)) for x in cluster_center]))

cluster_centers_2d = cluster_centers[:, 1:3]

plt.figure()
Example #41
def get_cluster(bandwidth, X):
    """
    https://scikit-learn.org/stable/modules/generated/sklearn.cluster.MeanShift.html

    MeanShift documentation:

    Mean shift clustering using a flat kernel. Mean shift clustering aims to
    discover “blobs” in a smooth density of samples. It is a centroid-based
    algorithm, which works by updating candidates for centroids to be the mean
    of the points within a given region. These candidates are then filtered in
    a post-processing stage to eliminate near-duplicates to form the
    final set of centroids.

    Seeding is performed using a binning technique for scalability.

    Parameters

        bandwidth float, default=None

            Bandwidth used in the RBF kernel.

            If not given, the bandwidth is estimated using
            sklearn.cluster.estimate_bandwidth;
            see the documentation for that function for hints on scalability
            (see also the Notes, below).

        seeds array-like of shape (n_samples, n_features), default=None

            Seeds used to initialize kernels. If not set, the seeds are
            calculated by clustering.get_bin_seeds with bandwidth as the grid
            size and default values for other parameters.

        bin_seeding bool, default=False

            If true, initial kernel locations are not locations of all points,
            but rather the location of the discretized version of points, where
            points are binned onto a grid whose coarseness corresponds to the
            bandwidth. Setting this option to True will speed up the algorithm
            because fewer seeds will be initialized. The default value is False.
            Ignored if seeds argument is not None.

        min_bin_freq int, default=1

            To speed up the algorithm, accept only those bins with at least
            min_bin_freq points as seeds.

        cluster_all bool, default=True

            If true, then all points are clustered, even those orphans that are
            not within any kernel. Orphans are assigned to the nearest kernel.
            If false, then orphans are given cluster label -1.

        n_jobs int, default=None

            The number of jobs to use for the computation. This works by
            computing each of the n_init runs in parallel.

            None means 1 unless in a joblib.parallel_backend context. -1 means
            using all processors. See Glossary for more details.

        max_iter int, default=300

            Maximum number of iterations, per seed point before the clustering
            operation terminates (for that seed point), if has not converged yet.
    """

    ms = MeanShift( bandwidth=bandwidth,
                    bin_seeding=True,
                    n_jobs=-1,
                    max_iter=500)
    ms.fit(X)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_

    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    print("number of estimated clusters : %d" % n_clusters_)
    return labels, cluster_centers, labels_unique, n_clusters_
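To make the parameter notes above concrete, here is a hedged usage sketch (the blob data and quantile value are illustrative, and get_cluster is assumed to be importable from this module):

import numpy as np
from sklearn.cluster import estimate_bandwidth
from sklearn.datasets import make_blobs

# three well-separated synthetic blobs
X_demo, _ = make_blobs(n_samples=500, centers=3, cluster_std=0.6, random_state=0)
bw = estimate_bandwidth(X_demo, quantile=0.2, n_samples=500)
labels, cluster_centers, labels_unique, n_clusters_ = get_cluster(bw, X_demo)
print(cluster_centers)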
Example #42
0
print(label_list)
print(word_list)
# K-means algorithm
km = KMeans(n_clusters=89, max_iter=300, n_init=40, init='k-means++', n_jobs=1)
result_kmeans = km.fit_predict(tfidf.toarray())
# AffinityPropagation algorithm
ap = AffinityPropagation(damping=0.55,
                         max_iter=575,
                         convergence_iter=575,
                         copy=True,
                         preference=None,
                         affinity='euclidean',
                         verbose=False)
result_ap = ap.fit_predict(tfidf.toarray())
# MeanShift algorithm
ms = MeanShift(bandwidth=0.65, bin_seeding=True)
result_ms = ms.fit_predict(tfidf.toarray())
# SpectralClustering algorithm
sc = SpectralClustering(n_clusters=89,
                        affinity='nearest_neighbors',
                        n_neighbors=4,
                        eigen_solver='arpack',
                        n_jobs=1)
result_sc = sc.fit_predict(tfidf.toarray())
# DBSCAN algorithm
db = DBSCAN(eps=0.7, min_samples=1)
result_db = db.fit_predict(tfidf.toarray())
# AgglomerativeClustering algorithm
ac = AgglomerativeClustering(n_clusters=89,
                             affinity='euclidean',
                             linkage='ward')
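# Hedged follow-up (an assumption, not in the original script): if label_list holds
# ground-truth labels aligned with the rows of tfidf, the partitions produced above
# can be compared with the adjusted Rand index.
from sklearn.metrics import adjusted_rand_score

result_ac = ac.fit_predict(tfidf.toarray())
results = {'kmeans': result_kmeans, 'affinity': result_ap, 'meanshift': result_ms,
           'spectral': result_sc, 'dbscan': result_db, 'agglomerative': result_ac}
for name, pred in results.items():
    print(name, adjusted_rand_score(label_list, pred))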
def Windows_KDE_amova(SequenceStore, admx_lib, refs_lib):

    Geneo = admx_lib

    Geneo_order = list(Geneo.keys())
    ref_order = list(refs_lib.keys())

    Whose = [z for z in it.chain(*[Geneo[x] for x in Geneo_order])]
    Sup_labels = list(
        np.repeat(Geneo_order, [len(Geneo[x]) for x in Geneo_order]))

    ### Define parameters and libraries of analyses.

    Results = {x: recursively_default_dict() for x in SequenceStore.keys()}

    Construct = recursively_default_dict()
    PC_var = recursively_default_dict()

    for CHR in SequenceStore.keys():
        print('going on CHR: ' + str(CHR))
        for c in SequenceStore[CHR].keys():

            ### PCA and MeanShift of information from each window copied from *FM36_Galaxy.py.
            Sequences = [SequenceStore[CHR][c][x] for x in Whose]
            Sequences = np.array(Sequences)

            Sequences = np.nan_to_num(Sequences)

            pca = PCA(n_components=KDE_comps,
                      whiten=False,
                      svd_solver='randomized').fit(Sequences)
            data = pca.transform(Sequences)
            PC_var[CHR][c] = [x for x in pca.explained_variance_]

            params = {
                'bandwidth':
                np.linspace(np.min(data), np.max(data), Bandwidth_split)
            }
            grid = GridSearchCV(KernelDensity(algorithm="ball_tree",
                                              breadth_first=False),
                                params,
                                verbose=0)

            ######################################
            ####### TEST global Likelihood #######
            ######################################
            Focus_labels = [z for z in it.chain(*refs_lib.values())]

            #### Mean Shift approach
            ## from sklearn.cluster import MeanShift, estimate_bandwidth

            bandwidth = estimate_bandwidth(data,
                                           quantile=0.2,
                                           n_samples=len(Focus_labels))
            if bandwidth <= 1e-3:
                bandwidth = 0.1

            ms = MeanShift(bandwidth=bandwidth,
                           cluster_all=False,
                           min_bin_freq=clsize)
            ms.fit(data[Focus_labels, :])
            labels = ms.labels_

            Tree = {
                x: [
                    Focus_labels[y] for y in range(len(labels))
                    if labels[y] == x
                ]
                for x in [g for g in list(set(labels)) if g != -1]
            }
            Keep = [x for x in Tree.keys() if len(Tree[x]) > clsize]

            Tree = {x: Tree[x] for x in Keep}
            Ngps = len(Tree)
            SpaceX = {x: data[Tree[x], :] for x in Tree.keys()}

            ### Extract MScluster likelihood by sample

            for hill in SpaceX.keys():

                grid.fit(data[Tree[hill], :])

                # use the best estimator to compute the kernel density estimate
                kde = grid.best_estimator_

                # normalize kde derived log-likelihoods, derive sample p-values
                P_dist = kde.score_samples(data[Tree[hill], :])
                Dist = kde.score_samples(data)
                P_dist = np.nan_to_num(P_dist)
                Dist = np.nan_to_num(Dist)
                if np.std(P_dist) == 0:
                    Dist = np.array(
                        [int(Dist[x] in P_dist) for x in range(len(Dist))])
                else:
                    Dist = scipy.stats.norm(np.mean(P_dist),
                                            np.std(P_dist)).cdf(Dist)
                Dist = np.nan_to_num(Dist)
                Construct[CHR][c][hill] = Dist

            #########################################
            ################# AMOVA #################
            #########################################
            if supervised:
                labels = Sup_labels
                Who = list(range(Sequences.shape[0]))
                Ngps = len(refs_lib)

            else:
                Who = [
                    x for x in range(len(labels))
                    if labels[x] != -1 and labels[x] in Keep
                ]
                labels = [labels[x] for x in Who]
                Who = [Focus_labels[x] for x in Who]

            if amova:
                clear_output()
                Bool_set = Sequences[Who, :].astype(bool)
                print(
                    'chr {}, where: {}, supervised: {}, n clusters: {}'.format(
                        CHR, c, str(supervised), Ngps))
                Amova1, Ciggy = AMOVA_FM42(Bool_set,
                                           labels,
                                           n_boot=0,
                                           metric='jaccard')
                Amova2, Ciggy = AMOVA_FM42(data[Who, :],
                                           labels,
                                           n_boot=0,
                                           metric='euclidean')
                Amova3, Ciggy = AMOVA_FM42(Bool_set,
                                           labels,
                                           n_boot=0,
                                           metric='hamming')
                print('old: ; jaccard: {}; PCA euc: {}; nHam: {}'.format(
                    Amova1, Amova2, Amova3))
                Results[CHR][c] = [Ngps, Amova1, Amova2, Amova3]

    return Results, Construct, PC_var
Example #44
0
def break_down_spec(actual_tracks,
                    N_neigh=0,
                    ms_layer2=True,
                    scale_spec=False,
                    qtl_I=0.05,
                    qtl_II=0.1,
                    clst_all_I=True,
                    clst_all_II=True):
    ###

    if scale_spec:
        samps_tracks = scale(actual_tracks, axis=0)
    else:
        samps_tracks = actual_tracks

    # #############################################################################

    bandwidth = estimate_bandwidth(samps_tracks,
                                   quantile=qtl_I,
                                   n_samples=samps_tracks.shape[0])
    ms = MeanShift(bandwidth=bandwidth,
                   bin_seeding=True,
                   cluster_all=clst_all_I).fit(samps_tracks)

    labels = ms.labels_
    coords = {
        z: [x for x in range(len(labels)) if labels[x] == z]
        for z in list(set(labels)) if z != -1
    }

    names_plots = ['MS1']

    ####

    fig = [
        go.Scatter(x=[actual_tracks[x, 0] for x in coords[i]],
                   y=[actual_tracks[x, 1] for x in coords[i]],
                   mode='markers',
                   name=str(i),
                   marker=dict(color=i)) for i in coords.keys()
    ]

    layout = go.Layout(title='MS clust. I',
                       xaxis=dict(title='time (s)'),
                       yaxis=dict(title='frequency'))

    figures = [go.Figure(data=fig, layout=layout)]
    ####

    ## an extra step to clean this up.
    if ms_layer2:
        extra_cls = {}

        for clust in coords.keys():
            subset = samps_tracks[coords[clust], :]
            subset = scale(subset, axis=0)

            if subset.shape[0] > 10:

                bandwidth = estimate_bandwidth(subset,
                                               quantile=qtl_II,
                                               n_samples=subset.shape[0])
                if bandwidth > 0:
                    ms = MeanShift(bin_seeding=True,
                                   cluster_all=clst_all_II,
                                   bandwidth=bandwidth).fit(subset)

                    labels_local = ms.labels_
                    coords_local = {
                        z: [
                            coords[clust][x] for x in range(len(labels_local))
                            if labels_local[x] == z
                        ]
                        for z in list(set(labels_local)) if z != -1
                    }
                    coords_local = {
                        z: coords_local[z]
                        for z in coords_local.keys()
                        if len(coords_local[z]) > 3
                    }
                    coords_keys = sorted(coords_local.keys())

                    if len(coords_keys) > 1:
                        coords[clust] = coords_local[coords_keys[0]]
                        for cl in coords_keys[1:]:
                            extra_cls[len(extra_cls) +
                                      len(coords)] = coords_local[cl]

        coords.update(extra_cls)
        names_plots.append('MSII')

    ########################
    ## get just neighbours
    ##
    if N_neigh:
        extra_cls = {}

        for clust in coords.keys():
            subset = samps_tracks[coords[clust], :]

            if subset.shape[0] >= 2 * N_neigh:

                t = list(np.arange(0, len(coords[clust]), N_neigh))
                coords_local = list(sorted(coords[clust]))
                if len(t) > 1:
                    if len(coords_local) - t[-1] < N_neigh:
                        t[-1] = len(coords_local)
                    else:
                        t.append(len(coords_local))
                    coords[clust] = coords_local[t[0]:t[1]]
                    for cl in range(1, len(t) - 1):
                        extra_cls[len(extra_cls) +
                                  len(coords)] = coords_local[t[cl]:t[cl + 1]]

        coords.update(extra_cls)
        names_plots[-1] = names_plots[-1] + '_neighs{}'.format(N_neigh)

    fig = [
        go.Scatter(x=[actual_tracks[x, 0] for x in coords[i]],
                   y=[actual_tracks[x, 1] for x in coords[i]],
                   mode='markers',
                   name=str(i),
                   marker=dict(color=i)) for i in coords.keys()
    ]

    layout = go.Layout(title='MS clust. II',
                       xaxis=dict(title='time (s)'),
                       yaxis=dict(title='frequency'))

    figures.append(go.Figure(data=fig, layout=layout))

    return coords, figures, names_plots
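A hedged usage sketch follows; the (time, frequency) track array is synthetic and the quantile value is illustrative, the point being only to show the expected (n_points, 2) input shape implied by the axis titles above.

import numpy as np

rng = np.random.default_rng(0)
demo_tracks = np.column_stack([
    np.concatenate([np.linspace(0.0, 1.0, 150), np.linspace(2.0, 3.0, 150)]),
    np.concatenate([rng.normal(1000, 40, 150), rng.normal(2500, 40, 150)]),
])
coords, figures, names_plots = break_down_spec(demo_tracks, qtl_I=0.1)
print(names_plots, {k: len(v) for k, v in coords.items()})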
Example #45
0
# Z = linkage(face_encodings, 'ward')
# fig = plt.figure(figsize=(25, 10))
# dn = dendrogram(Z)

#mean-shift
if True:
    nuke_people()
    faces = list(Face.objects.all())
    face_encodings = np.array(
        [np.frombuffer(bytes.fromhex(f.encoding)) for f in faces])
    X = StandardScaler().fit_transform(face_encodings)

    bandwidth = estimate_bandwidth(X, quantile=0.1, n_samples=500)

    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)

#DBSCAN
if False:
    nuke_people()
    faces = list(Face.objects.all())
    face_encodings = np.array(
        [np.frombuffer(bytes.fromhex(f.encoding)) for f in faces])
    X = StandardScaler().fit_transform(face_encodings)

    # #############################################################################
    # Compute DBSCAN
    db = DBSCAN(eps=5, min_samples=2).fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
Example #46
0
def frame_peaks(array_spec,
                spec_fs,
                spec_ts,
                frame=0,
                Sample_N=500,
                p_threshold=0.0004,
                amp_cap=4,
                peak_cap=.7,
                peak_iso=200,
                band_qtl=0.02,
                frame_plot=False):

    ## get amplitude probabilities from this frame of the spectrogram
    probs = list(array_spec[:, frame])
    probs = np.array(probs)
    probs[probs > amp_cap] = amp_cap

    prob_sum = np.sum(probs)
    probs = probs / prob_sum

    # #############################################################################
    # Compute clustering with MeanShift

    # The following bandwidth can be automatically detected using estimate_bandwidth
    new_freqs = np.random.choice(list(spec_fs),
                                 Sample_N,
                                 replace=True,
                                 p=probs)

    new_freqs = new_freqs.reshape(-1, 1)

    bandwidth = estimate_bandwidth(new_freqs,
                                   quantile=band_qtl,
                                   n_samples=Sample_N)

    if bandwidth == 0:
        bandwidth = peak_iso

    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True,
                   cluster_all=False).fit(new_freqs)

    labels = ms.labels_
    cluster_centers = ms.cluster_centers_

    cluster_centers = list(it.chain(*cluster_centers))

    ## trim_clusters:
    ### interpolation makes it easier to choose between neighbouring centroids
    ### that are unlikely to exist as observed frequency values.
    from scipy.interpolate import interp1d
    f2 = interp1d(spec_fs, array_spec[:, frame], kind='cubic')

    cluster_centers = cluster_threshold(cluster_centers, peak_iso, f2)
    cluster_centers = cluster_threshold(cluster_centers, peak_iso, f2)

    ####
    #### get amplitudes of peaks and store them

    peak_cent = []
    amps_centres = []

    shapes = []
    for cent in cluster_centers:

        closest = abs(spec_fs - cent)

        closest_idx = np.argmin(closest)

        amp_sel = array_spec[closest_idx, frame]

        if amp_sel >= peak_cap:
            peak_cent.append(cent)
            amps_centres.append(amp_sel)

    ## get time stamps for each of the peaks.
    time_spec = [spec_ts[frame]] * len(amps_centres)

    if frame_plot:

        kde = KernelDensity(kernel='gaussian',
                            bandwidth=bandwidth).fit(new_freqs)
        X_plot = np.linspace(0, max(spec_fs) + 100, 1000)[:, np.newaxis]
        log_dens = kde.score_samples(X_plot)

        fig = [go.Scatter(x=spec_fs, y=array_spec[:, frame], mode='lines')]

        #fig= [go.Scatter(x=X_plot[:, 0], y=np.exp(log_dens), mode='lines', fill='tozeroy', line=dict(color='#AAAAFF', width=2))]

        shapes = []

        for center in peak_cent:

            shapes.append({
                'type': 'line',
                'x0': center,
                'y0': 0,
                'x1': center,
                'y1': max(array_spec[:, frame]),
                'line': {
                    'color': 'red',
                    'width': 4,
                    'dash': 'solid'
                },
            })

        layout = go.Layout(title='frame inx: {}'.format(frame),
                           shapes=shapes,
                           xaxis=dict(title='frequency'),
                           yaxis=dict(title='amplitude'))

        figure_frame = go.Figure(data=fig, layout=layout)

        return peak_cent, time_spec, amps_centres, figure_frame

    else:
        return peak_cent, time_spec, amps_centres
Example #47
0
## This is getting real data

# Data needs to be in the format of a column name in the first row and numeric data in the remaining rows.
# Pulls in all the data in the file; col_int (defined above) then selects which columns to actually look at.
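# A hypothetical layout (not the original data file), just to illustrate the shape:
#
#     col_a,col_b
#     1.02,3.50
#     0.98,3.41
#     5.10,7.25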
X = pd.read_csv(myPath + Data_in,
                header=0,
                dtype={
                    0: np.float64,
                    1: np.float64
                })

print(X.head())

## This is the bit where it fits the data

ms = MeanShift(cluster_all=False)

# Convert the columns of interest to a NumPy array
# Multi-dimensional so could be anything really
msX = np.array(X.iloc[:, col_int])

# print (msX)

ms.fit(msX)
labels = ms.labels_
cluster_centers = ms.cluster_centers_

n_clusters_ = len(np.unique(labels))

# print("Number of estimated clusters:", n_clusters_)
# print(labels)
Example #48
0
from sklearn.datasets import make_blobs  # the samples_generator module was removed in newer scikit-learn

mmmm = 0

for filename in glob.glob('Q2-images/*'):  #assuming gif
    X1 = []

    I = cv2.imread(filename)
    I = cv2.resize(I, (0, 0), fx=0.5, fy=0.5)

    for i in range(I.shape[0]):
        for j in range(I.shape[1]):
            X1.append(I[i][j][:].tolist())

    bandwidth = estimate_bandwidth(X1, quantile=0.25, n_samples=10)
    # fall back to a fixed bandwidth if the estimate degenerates, then actually use it
    if bandwidth <= 0:
        bandwidth = 2
    clustering = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit(X1)

    labels = clustering.labels_
    C = clustering.cluster_centers_
    X1 = np.array(X1)

    labels = np.array(labels)

    count = 0
    I_out = np.zeros(I.shape)
    for i in range(I.shape[0]):
        for j in range(I.shape[1]):

            I_out[i][j][:] = C[labels[count]]
            count = count + 1
Example #49
0
    def do_work(self, train, uid, url):
        self.cap = cv2.VideoCapture(url)
        print(uid)

        self.kernel = np.ones((3, 3), np.uint8)

        self.frameWidth = int(self.cap.get(3))
        self.frameHeight = int(self.cap.get(4))

        self.outOriginal = cv2.VideoWriter(
            'cache/original.avi', cv2.VideoWriter_fourcc('X', 'V', 'I', 'D'),
            24, (self.frameWidth, self.frameHeight))
        self.outDetect = cv2.VideoWriter(
            'cache/detect.avi', cv2.VideoWriter_fourcc('X', 'V', 'I', 'D'), 24,
            (self.frameWidth, self.frameHeight))
        self.outSkel = cv2.VideoWriter(
            'cache/skel.avi', cv2.VideoWriter_fourcc('X', 'V', 'I', 'D'), 24,
            (self.frameWidth, self.frameHeight))

        self.fgbg = cv2.bgsegm.createBackgroundSubtractorMOG()

        self.frameCount = 0

        cacheDir = os.path.join(os.getcwd(), 'cache')
        sourceDir = os.path.join(os.getcwd(), 'sources')
        try:
            os.remove(os.path.abspath(os.path.join(cacheDir, 'test.csv')))
        except OSError as e:
            pass
        try:
            if train:
                os.remove(
                    os.path.abspath(os.path.join(sourceDir,
                                                 str(uid) + '.csv')))
        except OSError as e:
            pass

        while self.frameCount < 240:

            status, frame = self.cap.read()

            if not status:
                break

            blur = cv2.GaussianBlur(frame, (9, 9), 0)
            fgmask = self.fgbg.apply(blur)

            img = cv2.dilate(fgmask, self.kernel, iterations=1)

            x, y, height, length = self.contourDetect(img)
            boxImg = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
            boxImg = cv2.rectangle(boxImg, (x, y), (x + length, y + height),
                                   (0, 0, 255), 2)
            cv2.line(boxImg, (0, int(y + 0.75 * height)),
                     (640, int(y + 0.75 * height)), (0, 255, 0), 2)
            cv2.line(boxImg, (0, int(y + 0.15 * height)),
                     (640, int(y + 0.15 * height)), (255, 0, 0), 2)

            skel, hip, shoulder = self.skelRegion(img, x, y, height, length)

            if self.frameCount > 50 and self.frameCount < 151:
                if train:
                    with open('sources/' + str(uid) + '.csv', 'a',
                              newline='') as csvfile:
                        with open('cache/target.csv', 'a',
                                  newline='') as targetfile:
                            fieldnames = [
                                'height', 'stride', 'lowerbody', 'upperbody',
                                'hipangle', 'shoulderx', 'shouldery'
                            ]
                            writer = csv.DictWriter(csvfile,
                                                    fieldnames=fieldnames)

                            targetnames = ['class']
                            targetWriter = csv.DictWriter(
                                targetfile, fieldnames=targetnames)

                            writer.writerow({
                                'height':
                                height,
                                'stride':
                                length,
                                'lowerbody':
                                round(0.53 * height, 2),
                                'upperbody':
                                round(0.4 * height, 2),
                                'hipangle':
                                round(hip, 2),
                                'shoulderx':
                                shoulder[0],
                                'shouldery':
                                shoulder[1]
                            })

                            targetWriter.writerow({'class': uid})
                            targetWriter.writerow({'class': 0})
                else:
                    with open('cache/test.csv', 'a', newline='') as csvfile:
                        fieldnames = [
                            'height', 'stride', 'lowerbody', 'upperbody',
                            'hipangle', 'shoulderx', 'shouldery'
                        ]
                        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

                        writer.writerow({
                            'height': height,
                            'stride': length,
                            'lowerbody': round(0.53 * height, 2),
                            'upperbody': round(0.4 * height, 2),
                            'hipangle': round(hip, 2),
                            'shoulderx': shoulder[0],
                            'shouldery': shoulder[1]
                        })

            self.outOriginal.write(frame)

            self.outDetect.write(boxImg)
            self.outSkel.write(skel)

            self.frameCount += 1

            if self.frameCount % 10 == 0:
                if train:
                    self.trackProgress(self.frameCount / 240, True)
                else:
                    self.trackProgress(self.frameCount / 240, False)

        print("processing done!")
        self.cap.release()
        self.outDetect.release()
        self.outOriginal.release()
        self.outSkel.release()

        verify = False

        if train:
            pass
        else:
            csv_files = glob.glob('sources/*.csv')
            for cfile in csv_files:
                cf = pd.read_csv(cfile)
                master_array = cf.to_numpy()  # as_matrix() was removed from pandas

                df = pd.read_csv('cache/test.csv')
                numpy_array = df.to_numpy()
                print(numpy_array)

                bandwidth = estimate_bandwidth(master_array, quantile=0.1)
                ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
                ms.fit(master_array)
                master_labels = ms.labels_
                master_centers = ms.cluster_centers_
                print("Master centroids:\n", master_centers)
                print("Number of Master clusters: ",
                      len(np.unique(master_labels)))

                bandwidth = estimate_bandwidth(numpy_array, quantile=0.1)
                ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
                ms.fit(numpy_array)
                labels = ms.labels_
                cluster_centers = ms.cluster_centers_
                print("Test centroids:\n", cluster_centers)
                print("Number of Test clusters: ", len(np.unique(labels)))

                bandwidth = estimate_bandwidth(master_centers, quantile=0.9)
                ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
                ms.fit(master_centers)
                master_centers = ms.cluster_centers_

                bandwidth = estimate_bandwidth(cluster_centers, quantile=0.9)
                ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
                ms.fit(cluster_centers)
                cluster_centers = ms.cluster_centers_

                # new_centers = np.concatenate((master_centers, cluster_centers))
                # element-wise tolerance check between master and test centroids
                LIMIT = np.matrix([[5, 5, 5, 5, 5, 5, 5]])
                if (abs(master_centers - cluster_centers) < LIMIT).all():
                    verify = True
                    uid = cfile.split('.')[0].split('/')[1]
                    data = self.fetchDatabase(uid)
                    img = open('cache/image.png', 'wb')
                    img.write(data[4])
                    img.close()
                    self.verifyDone.emit(str(data[0]), data[1], data[2],
                                         str(data[3]))
                    print(master_centers)
                    print(cluster_centers)

                    break

                print(master_centers)
                print(cluster_centers)

        if not verify and not train:
            self.unauthVerify.emit()
        if train:
            self.threadCompleted.emit(True)
        else:
            self.threadCompleted.emit(False)
setosa = iris_data.loc[iris_data['Species_I. setosa'] == 1]

# virginica.plot.scatter(x=0, y=1, c='r')
# versicolor.plot.scatter(x=0, y=1, c='b')
# setosa.plot.scatter(x=0, y=1, c='g')

plt.scatter(x=virginica['Sepal length'], y=virginica['Sepal width'], color='r')
plt.scatter(x=versicolor['Sepal length'],
            y=versicolor['Sepal width'],
            color='g')
plt.scatter(x=setosa['Sepal length'], y=setosa['Sepal width'], color='b')
# plt.show()

# 4.
print(estimate_bandwidth(iris_data, quantile=0.2))
analyzer = MeanShift(bandwidth=1)
print('Self MeanShift: ', analyzer.fit(iris_data))
print('Function mean_shift: ', mean_shift(iris_data))

# print(estimate_bandwidth(virginica, quantile=0.2))
# print(estimate_bandwidth(versicolor, quantile=0.2))
# print(estimate_bandwidth(setosa, quantile=0.2))

# 5.

# labels, cluster_centers, n_clusters = mean_shift(data_2d)

# colors = cycle('bgrcmy')
# for k, col in zip(range(n_clusters), colors):
#     my_members = (labels == k)
#     cluster_center = cluster_centers[k]
import numpy as np
from sklearn.cluster import MeanShift
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
Axes3D = Axes3D
from matplotlib import style
style.use("ggplot")

centers = [[1, 1, 1], [5, 5, 5], [3, 10, 10]]

X, _ = make_blobs(n_samples=100, centers=centers, cluster_std=1.5)

ms = MeanShift()
ms.fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_

print(cluster_centers)
n_clusters_ = len(np.unique(labels))
print("Number of estimated clusters:", n_clusters_)

colors = 10 * ['r', 'g', 'b', 'c', 'k', 'y', 'm']
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')

for i in range(len(X)):
    ax.scatter(X[i][0], X[i][1], X[i][2], c=colors[labels[i]], marker='o')

ax.scatter(cluster_centers[:, 0],
           cluster_centers[:, 1],
           cluster_centers[:, 2],
           marker='x', color='k', s=150, linewidths=5, zorder=10)

plt.show()
Example #52
0
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

onehotencoder = OneHotEncoder(categorical_features = [9])
x = onehotencoder.fit_transform(x).toarray()

onehotencoder = OneHotEncoder(categorical_features = [17])
x = onehotencoder.fit_transform(x).toarray()
"""
############################################ClusterModeling###########################################################

# MeanShift determines the number of clusters itself, so no elbow analysis is needed
from sklearn.cluster import MeanShift

ms = MeanShift(bandwidth=2,
               bin_seeding=False,
               cluster_all=True,
               min_bin_freq=2,
               seeds=None)
ms_predict = ms.fit_predict(x)
print(ms.labels_)
from sklearn import metrics

print(ms_predict)
from sklearn.metrics import pairwise_distances
#print(metrics.silhouette_score(x, brc_predict, metric='euclidean'))
print("Silhouette Score: %0.3f" %
      metrics.silhouette_score(x, ms_predict, metric='euclidean'))
print("Calinski-Harabaz Index: %0.3f" %
      metrics.calinski_harabaz_score(x, ms_predict))

############################################SavingInXlsx###########################################################
Example #53
0
def ClusterCnn(model_file, train_file, valid_file, ftype, nsamples, outdir):

    f = open(model_file + ".pre", "rb")  # pickle files must be opened in binary mode
    preprocessor = pickle.load(f)

    import h5py
    f = h5py.File(model_file + ".wei")

    layers = []
    for k in range(f.attrs['nb_layers']):
        g = f['layer_{}'.format(k)]
        layers.append(
            [g['param_{}'.format(p)] for p in range(g.attrs['nb_params'])])

    max_features = len(preprocessor.tokenizer.word_counts)

    print("Reading and sampling data to train..")
    train_programs, train_features, train_classes = read_traces(train_file,
                                                                nsamples,
                                                                cut=None)
    train_size = len(train_features)

    #y = train_programs
    X_train, y_train, labels = preprocessor.preprocess_traces(
        train_features, y_data=train_classes, labels=train_programs)
    new_model = make_cluster_cnn("test",
                                 max_features,
                                 maxlen,
                                 embedding_dims,
                                 nb_filters,
                                 filter_length,
                                 hidden_dims,
                                 None,
                                 weights=layers)

    train_dict = dict()
    train_dict[ftype] = new_model.predict(X_train)

    model = make_cluster_pipeline_subtraces(ftype)
    X_red_comp = model.fit_transform(train_dict)
    explained_var = np.var(X_red_comp, axis=0)
    print(explained_var)

    X_red = X_red_comp[:, 0:2]
    X_red_next = X_red_comp[:, 2:4]

    colors = list(mpl.colors.cnames.keys())  # make the keys indexable
    progs = list(set(labels))
    ncolors = len(colors)
    size = len(labels)
    print("Plotting..")

    for prog, [x, y] in zip(labels, X_red):
        # for prog,[x,y] in sample(zip(labels, X_red), min(size, 1000)):
        x = gauss(0, 0.05) + x
        y = gauss(0, 0.05) + y
        color = 'r'
        plt.scatter(x, y, c=color)
    """
  if valid_file is not None:
    valid_programs, valid_features, valid_classes = read_traces(valid_file, None, cut=None, maxsize=window_size) #None)
    valid_dict = dict()

    X_valid, _, valid_labels = preprocessor.preprocess_traces(valid_features, y_data=None, labels=valid_programs)
    valid_dict[ftype] = new_model.predict(X_valid)
    X_red_valid_comp = model.transform(valid_dict)

    X_red_valid = X_red_valid_comp[:,0:2]
    X_red_valid_next = X_red_valid_comp[:,2:4]

    for prog,[x,y] in zip(valid_labels, X_red_valid):
      x = gauss(0,0.05) + x
      y = gauss(0,0.05) + y
      plt.scatter(x, y, c='b')
      plt.text(x, y+0.02, prog.split("/")[-1])

  plt.show()
  """
    plt.savefig(train_file.replace(".gz", "") + ".png")
    print("Bandwidth estimation..")
    from sklearn.cluster import MeanShift, estimate_bandwidth

    X_red_sample = X_red[:min(size, 1000)]
    bandwidth = estimate_bandwidth(X_red_sample, quantile=0.2)
    print("Clustering with bandwidth:", bandwidth)

    #X_red = np.vstack((X_red,X_red_valid))
    #X_red_next = np.vstack((X_red_next,X_red_valid_next))
    #labels = labels + valid_labels

    print(X_red.shape, len(X_red), len(labels))
    # print(valid_labels)

    af = MeanShift(bandwidth=bandwidth / 1).fit(X_red)

    cluster_centers = af.cluster_centers_
    cluster_labels = af.labels_
    n_clusters = len(cluster_centers)

    plt.figure()
    for ([x, y], label, cluster_label) in zip(X_red, labels, cluster_labels):
        # for ([x,y],label, cluster_label) in sample(zip(X_red,labels,
        # cluster_labels), min(size, 1000)):
        x = gauss(0, 0.1) + x
        y = gauss(0, 0.1) + y
        plt.scatter(x, y, c=colors[cluster_label % ncolors])
        # if label in valid_labels:
        #  plt.text(x-0.05, y+0.01, label.split("/")[-1])

    for i, [x, y] in enumerate(cluster_centers):
        plt.plot(x,
                 y,
                 'o',
                 markerfacecolor=colors[i % ncolors],
                 markeredgecolor='k',
                 markersize=7)
    """
  #for prog,[x,y] in zip(valid_labels, X_red_valid):
    #x = gauss(0,0.1) + x
    #y = gauss(0,0.1) + y
    #plt.scatter(x, y, c='black')
    #plt.text(x, y+0.02, prog.split("/")[-1])


  plt.title('Estimated number of clusters: %d' % n_clusters)

  #plt.savefig("clusters.png")
  plt.show()
  """
    plt.savefig(train_file.replace(".gz", "") + ".clusters.png")

    clustered_traces = zip(labels, cluster_labels)
    writer = open_csv(train_file.replace(".gz", "") + ".clusters")
    for label, cluster in clustered_traces:
        writer.writerow([label, cluster])
    """
Example #54
0
 def generate(self, themes=None):
     self._pack()
     if themes:
         return KMeans(n_clusters=themes).fit(
             self._histograms[0]).cluster_centers_
     return MeanShift().fit(self._histograms[0]).cluster_centers_
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn.cluster import MeanShift

iris_data = pd.read_excel('iris_data.xlsx')
print(iris_data.head())

iris_data = pd.get_dummies(iris_data, columns=['Species'])
print(iris_data.head())

virginica = iris_data.loc[iris_data['Species_I. virginica'] == 1]
versicolor = iris_data.loc[iris_data['Species_I. versicolor'] == 1]
setosa = iris_data.loc[iris_data['Species_I. setosa'] == 1]

plt.scatter(x=virginica['Sepal length'], y=virginica['Sepal width'], color='r')
plt.scatter(x=versicolor['Sepal length'],
            y=versicolor['Sepal width'],
            color='g')
plt.scatter(x=setosa['Sepal length'], y=setosa['Sepal width'], color='b')
#plt.show()

from sklearn.cluster import estimate_bandwidth
print(estimate_bandwidth(virginica, quantile=0.2))
print(estimate_bandwidth(versicolor, quantile=0.2))
print(estimate_bandwidth(setosa, quantile=0.2))
print(estimate_bandwidth(iris_data, quantile=1))
analyzer = MeanShift(bandwidth=1)
print(analyzer.fit(iris_data))
vectorizer = TfidfVectorizer(max_df=0.8,
                             max_features=200000,
                             min_df=0.2,
                             stop_words=stopwords,
                             use_idf=True,
                             tokenizer=tokenize_and_stem,
                             ngram_range=(1, 3))
tfidf_matrix = vectorizer.fit_transform(clean_data)
print(tfidf_matrix.shape)
dense_text = tfidf_matrix.todense()

# The following bandwidth can be automatically detected using estimate_bandwidth
bandwidth = estimate_bandwidth(dense_text, quantile=0.2, n_samples=1000)

ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(dense_text)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

print("number of estimated clusters : %d" % n_clusters_)

plt.figure(1)
plt.clf()

colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    my_members = labels == k
    cluster_center = cluster_centers[k]
Example #57
0
X = []
for count, row in enumerate(file_reader):
    if not count:
        names = row[2:]
        continue

    X.append([float(x) for x in row[2:]])

# Input data as numpy array
X = np.array(X)

# Estimating the bandwidth 
bandwidth = estimate_bandwidth(X, quantile=0.8, n_samples=len(X))

# Compute clustering with MeanShift
meanshift_estimator = MeanShift(bandwidth=bandwidth, bin_seeding=True)
meanshift_estimator.fit(X)
labels = meanshift_estimator.labels_
centroids = meanshift_estimator.cluster_centers_
num_clusters = len(np.unique(labels))

print "\nNumber of clusters in input data =", num_clusters

print "\nCentroids of clusters:"
print '\t'.join([name[:3] for name in names])
for centroid in centroids:
    print '\t'.join([str(int(x)) for x in centroid])

################
# Visualizing data
Example #58
0
def find_clusters(tracks):
    """Find clusters in tracked points."""
    tracks = list(map(lambda x: [x[-1][0], x[-1][1]], tracks))
    ms = MeanShift(bandwidth=30, bin_seeding=True)
    ms.fit(tracks)
    return ms.cluster_centers_
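A hedged usage sketch (the coordinates are made up): each track is a sequence of (x, y) points, and only the last point of each track is clustered.

tracks_demo = [
    [(10, 12), (11, 13), (12, 14)],
    [(200, 205), (201, 207)],
    [(13, 15), (14, 15)],
]
print(find_clusters(tracks_demo))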
Example #59
0
def compute_clustering(x_data: pd.Series, y_data: pd.Series, method: str,
                       nb_clusters: int) -> np.array:
    """
    Compute clustering using Scikit-learn:
    https://scikit-learn.org/stable/modules/clustering.html
    Several algorithms can be chosen:
        - K-means: https://scikit-learn.org/stable/modules/clustering.html#k-means
        - Affinity propagation: https://scikit-learn.org/stable/modules/generated/sklearn\
        .cluster.AffinityPropagation.html#sklearn.cluster.AffinityPropagation
        - Mean shift: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.\
        MeanShift.html#sklearn.cluster.MeanShift
        - Spectral clustering: https://scikit-learn.org/stable/modules/generated/sklearn.\
        cluster.SpectralClustering.html#sklearn.cluster.SpectralClustering
        - Hierarchical/Agglomerative clustering: https://scikit-learn.org/stable/modules\
        /generated/sklearn.cluster.AgglomerativeClustering.html#sklearn.cluster.Agglomera\
        tiveClustering
        - DBSCAN: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.\
        DBSCAN.html#sklearn.cluster.DBSCAN
        - OPTICS: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.\
        OPTICS.html#sklearn.cluster.OPTICS
        - Bayesian gaussian mixtures: https://scikit-learn.org/stable/modules/generated/\
        sklearn.mixture.BayesianGaussianMixture.html
        - Birch: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.Birch.\
        html#sklearn.cluster.Birch

    :param x_data: the x data set
    :param y_data: the y data set
    :param method: name of the algorithm used to perform clustering
    :param nb_clusters: number of clusters used to split data
    :return: the data set labels used to color scatter points
    """
    mapped_data = [row for row in zip(x_data, y_data)]
    if method == "K-means":
        kmeans = KMeans(n_clusters=nb_clusters,
                        random_state=0).fit(mapped_data)
        return kmeans.labels_
    if method == "Affinity propagation":
        clustering = AffinityPropagation().fit(mapped_data)
        return clustering.labels_
    if method == "Mean shift":
        clustering = MeanShift(bandwidth=2).fit(mapped_data)
        return clustering.labels_
    if method == "Spectral clustering":
        clustering = SpectralClustering(n_clusters=nb_clusters,
                                        assign_labels="discretize",
                                        random_state=0).fit(mapped_data)
        return clustering.labels_
    if method == "Ward hierarchical clustering":
        clustering = AgglomerativeClustering(
            n_clusters=nb_clusters).fit(mapped_data)
        return clustering.labels_
    if method == "DBSCAN":
        clustering = DBSCAN(eps=3, min_samples=2).fit(mapped_data)
        return clustering.labels_
    if method == "OPTICS":
        clustering = OPTICS(min_samples=2).fit(mapped_data)
        return clustering.labels_
    if method == "Bayesian gaussian mixtures":
        bgm = BayesianGaussianMixture(n_components=nb_clusters,
                                      max_iter=100,
                                      tol=1e-3,
                                      reg_covar=0)
        bgm.fit(mapped_data)
        return bgm.predict(mapped_data)
    if method == "Birch":
        brc = Birch(n_clusters=nb_clusters)
        brc.fit(mapped_data)
        return brc.predict(mapped_data)
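A short usage sketch of compute_clustering, assuming it is importable; the coordinates are synthetic and the method strings follow the docstring above.

import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
x_demo = pd.Series(np.concatenate([rng.normal(0, 0.3, 50), rng.normal(5, 0.3, 50)]))
y_demo = pd.Series(np.concatenate([rng.normal(0, 0.3, 50), rng.normal(5, 0.3, 50)]))

for name in ("K-means", "Mean shift", "DBSCAN"):
    demo_labels = compute_clustering(x_demo, y_demo, method=name, nb_clusters=2)
    print(name, len(np.unique(demo_labels)), "clusters")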
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x += 1
            df[column] = list(map(convert_to_int, df[column]))

    return df


df = handle_non_numerical_data(df)
df.drop(['ticket', 'home.dest'], axis=1, inplace=True)

X = np.array(df.drop(['survived'], axis=1).astype(float))
X = preprocessing.scale(X)
y = np.array(df['survived'])

clf = MeanShift()
clf.fit(X)

labels = clf.labels_
cluster_centers = clf.cluster_centers_

original_df['cluster_group'] = np.nan
for i in range(len(X)):
    original_df['cluster_group'].iloc[i] = labels[i]

n_clusters_ = len(np.unique(labels))
survival_rates = {}
for i in range(n_clusters_):
    temp_df = original_df[(original_df['cluster_group'] == float(i))]
    survival_cluster = temp_df[(temp_df['survived'] == 1)]
    survival_rate = len(survival_cluster) / len(temp_df)