Example #1
def dbscan(fig):
    global X_iris, geo
    ax = fig.add_subplot(geo + 5, projection='3d', title='dbscan')
    dbscan = cluster.DBSCAN()
    dbscan.fit(X_iris)
    res = dbscan.labels_
    core = dbscan.core_sample_indices_
    print(repr(core))
    size = [5 if i not in core else 40 for i in range(len(X_iris))]
    print(repr(size))
    for n, i in enumerate(X_iris):
        ax.scatter(*i[: 3], s=size[n], c='bgrcmyk'[res[n] % 7],
                   alpha=0.8, marker='o')

    ax.set_xlabel('X Label')
    ax.set_ylabel('Y Label')
    ax.set_zlabel('Z Label')
    return res
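
A minimal driver for the helper above (not part of the original example); it assumes the module-level X_iris and geo globals, here filled with the iris data and a 2x3 subplot geometry:

# Hypothetical usage sketch; X_iris, geo and the figure layout are assumptions.
import matplotlib.pyplot as plt
from sklearn import cluster, datasets

X_iris = datasets.load_iris().data   # four features, the first three get plotted
geo = 230                            # 2x3 grid, so geo + 5 addresses panel 5

fig = plt.figure(figsize=(8, 6))
labels = dbscan(fig)                 # fits DBSCAN and draws the 3D scatter
print(set(labels))                   # cluster ids, -1 marks noise
plt.show()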
Example #2
    def findClusters(self, dfFrame):
        aVX, aVY = np.asarray(dfFrame["SpeedX"]) * 50, np.asarray(
            dfFrame["SpeedY"]) * 50
        aX, aY = np.asarray(dfFrame["AvgPosX"]), np.asarray(dfFrame["AvgPosY"])
        aId = np.asarray(dfFrame["TrackId"])
        nFrame = dfFrame["Frame"].iloc[0]  # @UnusedVariable
        # Feature matrix: one row per tracked actor (position and scaled speed).
        aData = np.asarray([aX, aY, aVX, aVY]).transpose()
        # Alternatives tried here: StandardScaler preprocessing,
        # cluster.SpectralClustering, cluster.MeanShift, cluster.MiniBatchKMeans,
        # mixture.GMM / mixture.DPGMM and hcluster.fclusterdata.
        algorithm = cluster.DBSCAN(eps=200, min_samples=2)
        algorithm.fit(aData)
        if hasattr(algorithm, 'labels_'):
            y_pred = algorithm.labels_.astype(int)
        else:
            y_pred = algorithm.predict(aData)
        # Re-map the raw cluster labels using the track ids (fixGrpsId).
        y_pred = self.grpActorsMngr.fixGrpsId(y_pred, aId)
        # self.plotClusters(algorithm, aData, y_pred, aId, aX, aY, nFrame)
        return y_pred
Example #3
def _call_kmapper(data, col_names, interval, overlap, clustering_alg, clustering_alg_params, filter_function, filter_parameters=None):
    print(filter_parameters)
    mapper = KeplerMapper()
    if len(col_names) == 1:
        data_new = np.array(data[col_names[0]]).reshape(-1,1)
    else:
        data_new = np.array(data[col_names])

    lens_dict = {}
    if len(filter_function) == 1:
        f = filter_function[0]
        if f in data.columns:
            lens = data[f]
        else:
            lens = compute_lens(f, data_new, mapper, filter_parameters)
        lens_dict[f] = lens
        
    elif len(filter_function) == 2:
        lens = []
        for f in filter_function:
            if f in data.columns:
                lens_f = np.array(data[f]).reshape(-1,1)
            else:
                lens_f = compute_lens(f, data_new, mapper, filter_parameters)
            lens.append(lens_f)
            lens_dict[f] = lens_f
        lens = np.concatenate((lens[0], lens[1]), axis=1)
    # clusterer = sklearn.cluster.DBSCAN(eps=eps, min_samples=min_samples, metric='euclidean', n_jobs=8)
    print(data_new.shape)
    print(np.max(np.max(data_new)))
    print(np.mean(np.mean(data_new)))
    if clustering_alg == "DBSCAN":
        graph = mapper.map_parallel(lens, data_new, clusterer=cluster.DBSCAN(eps=float(clustering_alg_params["eps"]), min_samples=int(clustering_alg_params["min_samples"])), cover=Cover(n_cubes=interval, perc_overlap=overlap))
    elif clustering_alg == "Agglomerative Clustering":
        graph = mapper.map_parallel(lens, data_new, clusterer=cluster.AgglomerativeClustering(n_clusters=None, linkage=clustering_alg_params["linkage"], distance_threshold=float(clustering_alg_params["dist"])), cover=Cover(n_cubes=interval, perc_overlap=overlap))
        # graph = mapper.map_parallel(lens, data_new, clusterer=cluster.AgglomerativeClustering( linkage=clustering_alg_params["linkage"]), cover=Cover(n_cubes=interval, perc_overlap=overlap))
    elif clustering_alg == "Mean Shift":
        graph = mapper.map_parallel(lens, data_new, clusterer=cluster.MeanShift(bandwidth=float(clustering_alg_params["bandwidth"])), cover=Cover(n_cubes=interval, perc_overlap=overlap))
        # graph = mapper.map_parallel(lens, data_new, clusterer=cluster.MeanShift(bandwidth=1), cover=Cover(n_cubes=interval, perc_overlap=overlap))
        
    print(len(graph['nodes'].keys()))
    # graph = mapper.map(lens, data_new, clusterer=cluster.DBSCAN(eps=eps, min_samples=min_samples), cover=Cover(n_cubes=interval, perc_overlap=overlap))

    return graph, lens_dict
Example #4
    def get_algorithm(self):

        if(self.algorithmName == "kmeans"):

            cluster_alg = cluster.MiniBatchKMeans(n_clusters=int(self.parms['k']))

        elif(self.algorithmName == "mean_shift"):

            bandwidth = cluster.estimate_bandwidth(self.X, quantile=float(self.parms['quantile']))
            cluster_alg = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)

        elif(self.algorithmName == "affinity_propagation"):

            cluster_alg = cluster.AffinityPropagation(damping=float(self.parms['damping']))

        elif(self.algorithmName == "birch"):

            cluster_alg = cluster.Birch(n_clusters=int(self.parms['k']))

        elif(self.algorithmName == "ward"):

            connectivity = kneighbors_graph(self.X, n_neighbors=int(self.parms['n_neighbors']), include_self=False)
            connectivity = 0.5 * (connectivity + connectivity.T)
            cluster_alg = cluster.AgglomerativeClustering(n_clusters=int(self.parms['k']), linkage='ward', connectivity=connectivity)

        elif(self.algorithmName == "spectral"):

            cluster_alg = cluster.SpectralClustering(n_clusters=int(self.parms['k']), eigen_solver='arpack', affinity="nearest_neighbors")

        elif(self.algorithmName == "dbscan"):

            cluster_alg = cluster.DBSCAN(eps=float(self.parms['eps']))

        elif(self.algorithmName == "agglomerative"):

            connectivity = kneighbors_graph(self.X, n_neighbors=int(self.parms['n_neighbors']), include_self=False)
            connectivity = 0.5 * (connectivity + connectivity.T)

            cluster_alg = cluster.AgglomerativeClustering(linkage="average", affinity="cityblock", n_clusters=int(self.parms['k']), connectivity=connectivity)

        else:
            return None

        return cluster_alg
Example #5
	def DBSCAN(self):
		"""
		Uses `sklearn's DBSCAN <http://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html>`_

		**Defaults and var_params:** sklearn.cluster.DBSCAN(eps=0.5, min_samples=5, metric='euclidean', algorithm='auto', leaf_size=30, p=None, n_jobs=1)
		
		Other Parameters
		----------------
		var_params: dict
			Pass variable params through constructor as dictionary pairs. Current default parameters are listed above

		Returns
		-------
		labels: list of ints
			Solution of clustering labels for each object (updated in object.out)
		
		"""
		params = {}
		params['distance'] = 'euclidean'
		params['eps']=0.5
		params['min_samples']=5
		params['metric']='precomputed'
		params['algorithm']='auto'
		params['leaf_size']=30
		params['p'] = None
		params['n_jobs'] = 1

		params = returnParams(self.var_params, params, 'DBSCAN')

		if 'distance' in self.var_params:
			if self.var_params['distance'] == 'precomputed':
				d = self.var_params['M']
			else:
				d = returnDistanceMatrix(self.data, params['distance'])        
		else:
			d = returnDistanceMatrix(self.data, params['distance'])        

		solution = skc.DBSCAN(eps=params['eps'], min_samples=params['min_samples'], metric=params['metric'], 
			algorithm=params['algorithm'], leaf_size=params['leaf_size'], 
			p=params['p'], n_jobs=params['n_jobs']) 
		solution.fit(d)
		self.out = solution.labels_
		self.var_params = params #update dictionary of parameters to match that used.
Example #6
 def find_objpcd_list_by_pos(self,
                             pcd,
                             x_range=(200, 800),
                             y_range=(0, 600),
                             z_range=(790, 1000),
                             eps=5,
                             toggledebug=False,
                             scan_num=1):
     real_pcd = pcdu.trans_pcd(pcd, self.amat)
     # pcdu.show_pcd([p for p in real_pcd if p[2] < 900], rgba=(.5, .5, .5, .1))
     # base.run()
     pcd_result = []
     for p in real_pcd:
         if x_range[0] < p[0] < x_range[1] and y_range[0] < p[1] < y_range[
                 1] and z_range[0] < p[2] < z_range[1]:
             pcd_result.append(p)
     pcd_result = np.array(pcd_result)
     db = skc.DBSCAN(eps=eps, min_samples=50 * scan_num).fit(pcd_result)
     core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
     core_samples_mask[db.core_sample_indices_] = True
     labels = db.labels_
     n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
     print("n_clusters:", n_clusters)
     unique_labels = set(labels)
     objpcd_list = []
     for k in unique_labels:
         if k == -1:
             continue
         else:
             class_member_mask = (labels == k)
             temppartialpcd = pcd_result[class_member_mask
                                         & core_samples_mask]
             if len(temppartialpcd) > 500:
                 objpcd_list.append(temppartialpcd)
     if toggledebug:
         # pcdu.show_pcd(real_pcd, rgba=(1, 1, 1, .1))
         pcdu.show_pcd(pcd_result, rgba=(1, 1, 0, 1))
         for objpcd in objpcd_list:
             pcdu.show_pcd_withrbt(objpcd,
                                   rgba=(choice([0, 1]), choice([0,
                                                                 1]), 1, 1))
         base.run()
     return objpcd_list
Example #7
    def _set_parameters(self, **kwargs):
        '''
        Sets parameters used in fitting tracks::

            vd: drift velocity [mm/us]
            clock_period: clock period for timestamp [us]
            dbscan_eps: epsilon used for clustering [mm]
            dbscan_min_samples: min samples used for clustering

        '''
        self._vd = kwargs.get('vd', self._vd)
        self._clock_period = kwargs.get('clock_period', self._clock_period)
        self._z_scale = self._vd * self._clock_period

        self._dbscan_eps = kwargs.get('dbscan_eps', self._dbscan_eps)
        self._dbscan_min_samples = kwargs.get('dbscan_min_samples',
                                              self._dbscan_min_samples)
        self.dbscan = cluster.DBSCAN(eps=self._dbscan_eps,
                                     min_samples=self._dbscan_min_samples)
Example #8
def bdscan(multishapes):
    db = cluster.DBSCAN(eps=0.3, min_samples=60)
    db.fit(multishapes)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True

    unique_labels = set(db.labels_)
    print(unique_labels)

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(unique_labels) - (1 if -1 in db.labels_ else 0)

    fig = plt.figure(figsize=(8, 6))
    colors = [
        '#ff0000', '#00ff00', '#0000ff', '#ff00ff', '#00ffff', '#ffff00',
        '#f6ff00', '#2f800f', '#a221b5', '#21b5ac', '#b1216c'
    ]

    for k, col in zip(unique_labels, colors):
        if k == -1:
            # Black used for noise.
            col = 'k'

        my_members = db.labels_ == k

        xy = multishapes[my_members & core_samples_mask]
        plt.plot(xy[:, 0],
                 xy[:, 1],
                 'o',
                 markerfacecolor=col,
                 markeredgecolor='k',
                 markersize=11)

        xy = multishapes[my_members & ~core_samples_mask]
        plt.plot(xy[:, 0],
                 xy[:, 1],
                 'o',
                 markerfacecolor=col,
                 markeredgecolor='k',
                 markersize=6)

    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.show()
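
The snippet expects a plain (n, 2) point array named multishapes; the original data set is not shown, so a hedged stand-in built from two noisy moons is used here:

# Hypothetical input; the real "multishapes" data is not part of this snippet.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import cluster, datasets

multishapes, _ = datasets.make_moons(n_samples=1500, noise=0.05)
bdscan(multishapes)                  # fits DBSCAN (eps=0.3, min_samples=60) and plots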
Example #9
def update_data(attrname, old, new):

    # Get the drop down values
    algorithm = dropdown.value
    global X

    # Generate the new colors:
    if algorithm == 'MiniBatchKMeans':
        model = cluster.MiniBatchKMeans(n_clusters=2)
    elif algorithm == 'AffinityPropagation':
        model = cluster.AffinityPropagation(damping=.9, preference=-200)
    elif algorithm == 'MeanShift':
        model = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    elif algorithm == 'SpectralClustering':
        model = cluster.SpectralClustering(n_clusters=2,
                                           eigen_solver='arpack',
                                           affinity="nearest_neighbors")
    elif algorithm == 'Ward':
        model = cluster.AgglomerativeClustering(n_clusters=2,
                                                linkage='ward',
                                                connectivity=connectivity)
    elif algorithm == 'AgglomerativeClustering':
        model = cluster.AgglomerativeClustering(linkage="average",
                                                affinity="cityblock",
                                                n_clusters=2,
                                                connectivity=connectivity)
    elif algorithm == 'Birch':
        model = cluster.Birch(n_clusters=2)
    elif algorithm == 'DBSCAN':
        model = cluster.DBSCAN(eps=.2)
    else:
        print('No Algorithm selected')
    model.fit(X)

    if hasattr(model, 'labels_'):
        y_pred = model.labels_.astype(int)
    else:
        y_pred = model.predict(X)

    colors = [Spectral6[i] for i in y_pred]

    source.data['colors'] = colors
    plot.title = algorithm
Example #10
def train_model(x):
    epsilons = np.linspace(0.3, 1.2, 10)
    scores = []
    models = []
    for epsilon in epsilons:
        model = sc.DBSCAN(eps=epsilon, min_samples=5).fit(x)
        scores.append(
            ms.silhouette_score(x,
                                model.labels_,
                                sample_size=len(x),
                                metric='euclidean'))
        models.append(model)
    scores = np.array(scores)
    best_index = scores.argmax()
    best_epsilon = epsilons[best_index]
    best_score = scores[best_index]
    best_model = models[best_index]
    print(best_epsilon, best_score)
    return best_model
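
A usage sketch for train_model (not in the original); the sc and ms aliases for sklearn.cluster and sklearn.metrics are implied by the calls inside the function, while the blob data is an assumption:

# Hypothetical data and imports; only the alias names are implied by the function body.
import numpy as np
import sklearn.cluster as sc
import sklearn.metrics as ms
from sklearn.datasets import make_blobs

x, _ = make_blobs(n_samples=300, centers=[[0, 0], [5, 5], [0, 5], [5, 0]],
                  cluster_std=0.5, random_state=0)
best = train_model(x)                # prints the chosen epsilon and silhouette score
print(np.unique(best.labels_))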
Example #11
def cluster_pipelines2(clustercount):
    return {            
        'Ward': Pipeline([                                
            ('sca', preprocessing.MaxAbsScaler()),
            ('clu', cluster.AgglomerativeClustering(n_clusters=clustercount, linkage='ward')),
        ]),    
        'K-Means': Pipeline([                
            ('sca', preprocessing.MaxAbsScaler()),
            ('clu', cluster.KMeans(n_clusters=clustercount, init='k-means++', max_iter=100, n_init=1)),
        ]),        
        'GMM': Pipeline([                        
            ('sca', preprocessing.MaxAbsScaler()),
            ('clu', mixture.GaussianMixture(n_components=clustercount)),
        ]),
        'DBScan': Pipeline([          
            ('sca', preprocessing.MaxAbsScaler()),
            ('clu', cluster.DBSCAN(eps=0.1, min_samples=20)),
        ]),         
    }
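
A sketch of how these pipelines might be fitted; the toy blobs and the imports are assumptions, while the estimators come from cluster_pipelines2() above:

# Hypothetical usage; relies only on the estimators named in cluster_pipelines2().
from sklearn import cluster, datasets, mixture, preprocessing
from sklearn.pipeline import Pipeline

X, _ = datasets.make_blobs(n_samples=500, centers=3, random_state=0)
for name, pipe in cluster_pipelines2(clustercount=3).items():
    labels = pipe.fit_predict(X)     # scale with MaxAbsScaler, then cluster
    print(name, len(set(labels)))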
Example #12
def cluster_face_features(feature_list,
                          method=None,
                          precomputed=True,
                          eps=0.5):
    if feature_list is not None:
        face_feature_list = feature_list

    if face_feature_list is None:
        return None

    if precomputed:
        metric_type = 'precomputed'
        dist_matrix = __compute_pairwise_distance(face_feature_list)
    else:
        metric_type = 'euclidean'
        dist_matrix = np.vstack(face_feature_list)

    if method == 'AP':
        cluster_estimator = cluster.AffinityPropagation(affinity=metric_type,
                                                        damping=.55,
                                                        preference=-1)
        if precomputed:
            dist_matrix = -dist_matrix
    elif method == 'DBSCAN':
        cluster_estimator = cluster.DBSCAN(metric=metric_type,
                                           eps=eps,
                                           min_samples=1)

    t0 = time.time()
    cluster_estimator.fit(dist_matrix)
    t1 = time.time()

    t = t1 - t0
    print('Clustering takes: %f seconds' % t)

    if hasattr(cluster_estimator, 'labels_'):
        y_pred = cluster_estimator.labels_.astype(int)
    else:
        y_pred = cluster_estimator.predict(dist_matrix)

    return y_pred
Example #13
def clustering(X, algorithm, n_clusters):
    # normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(X)
    # estimate bandwidth for mean shift
    bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)
    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)
    # Generate the new colors:
    if algorithm=='MiniBatchKMeans':
        model = cluster.MiniBatchKMeans(n_clusters=n_clusters)
    elif algorithm=='AffinityPropagation':
        model = cluster.AffinityPropagation(damping=.9, preference=-200)
    elif algorithm=='MeanShift':
        model = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    elif algorithm=='SpectralClustering':
        model = cluster.SpectralClustering(n_clusters=n_clusters,
                                          eigen_solver='arpack',
                                          affinity="nearest_neighbors")
    elif algorithm=='Ward':
        model = cluster.AgglomerativeClustering(n_clusters=n_clusters, linkage='ward',
                                           connectivity=connectivity)
    elif algorithm=='AgglomerativeClustering':
        model = cluster.AgglomerativeClustering(
            linkage="average", affinity="cityblock", n_clusters=n_clusters,
            connectivity=connectivity)
    elif algorithm=='Birch':
        model = cluster.Birch(n_clusters=n_clusters)
    elif algorithm=='DBSCAN':
        model = cluster.DBSCAN(eps=.2)
    else:
        print('No Algorithm selected. Default is MiniBatchKMeans')
        model = cluster.MiniBatchKMeans(n_clusters=n_clusters)
    model.fit(X)

    if hasattr(model, 'labels_'):
        y_pred = model.labels_.astype(int)
    else:
        y_pred = model.predict(X)

    return X, y_pred
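
A possible call to the clustering() helper above on two noisy moons; the data and the module-level imports are assumptions:

# Hypothetical usage; the helper itself standardises X and picks the model.
import numpy as np
from sklearn import cluster, datasets
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler

X, _ = datasets.make_moons(n_samples=500, noise=0.05)
X_scaled, y_pred = clustering(X, 'DBSCAN', n_clusters=2)
print(np.unique(y_pred))             # cluster ids, -1 would mark noise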
Example #14
def main():
    data_origin = read_data('balance-scale.data')
    data_converted = convert_data(data_origin, 0)
    true_labels = data_converted.iloc[:, 0]
    data_clean = clean_data(data_converted.iloc[:, 1:])
    plot_distribution(data_clean)

    dimension = 2
    data_pca = pca(dimension, data_clean)
    plot_distribution(data_pca)

    n_clusters = 3
    dimension_show = [1, 2]

    kmeans = cluster.KMeans(n_clusters=n_clusters).fit(data_clean)
    show_result(data_clean, data_pca, true_labels, kmeans, n_clusters, dimension_show)

    dbscan = cluster.DBSCAN(eps=0.38, min_samples=10).fit(data_clean)
    show_result(data_clean, data_pca, true_labels, dbscan, n_clusters, dimension_show)
    return 0
Example #15
def delete_redudants(predictions, embed_model):
    X = []
    logging.info('\n# Redundancy reduction ...\n')
    for i, row in predictions.iterrows():
        X.append(get_sentence_vector(row['processed_text'], embed_model))
    dbscan = cluster.DBSCAN(eps=0.09, metric='cosine', min_samples=2).fit(X)
    labels = dbscan.labels_
    logging.info('\n# Labels\n')
    print(labels)
    predictions['label'] = labels
    isolated_tweets = predictions[predictions.label == -1]
    predictions = predictions[predictions.label != -1].drop_duplicates(
        'label', keep='first')
    predictions = pd.concat([predictions, isolated_tweets], sort=True)
    predictions = predictions.sort_values('score', ascending=False)
    predictions.reset_index(inplace=True)
    predictions = predictions.drop(columns='processed_text')
    logging.info(
        f'\n# Redundancy reduction [OK]\n >> Length= {len(predictions)}\n')
    return predictions
Example #16
def dbscan_seeds(goods, bads):
    """Find regions with concentration of good points."""
    from scipy.spatial import ConvexHull
    import sklearn.cluster as cl
    good_ids, good_loc = goods
    bad_ids, bad_loc = bads
    labels = cl.DBSCAN(eps=150, min_samples=8).fit_predict(good_loc)
    gcluster = []
    bcluster = []
    hulls = []
    for cluster in range(len(np.unique(labels)) - 1):
        points = good_loc[labels == cluster, :]
        hull = sgeo.Polygon(points[ConvexHull(points).vertices])
        gcluster.append(list(i.compress(good_ids, labels == cluster)))
        bcluster.append([
            id_ for id_, loc in zip(bad_ids, bad_loc)
            if hull.contains(sgeo.Point(loc))
        ])
        hulls.append(hull)
    return hulls, gcluster, bcluster
Example #17
def do_dbscan(data):

    print(" Do dbscan...")

    # Retrieve the parameters

    eps = args.associer_param('eps', 1)
    min_pts = args.associer_param('min_pts', 1)

    model = cluster.DBSCAN(eps=eps,
                           min_samples=min_pts,
                           metric=args.args.distance)

    labels = model.fit_predict(data)

    data['cluster'] = labels

    print("   ok !")

    return data
Example #18
 def clustering(self, image_urls, min_samples=2, eps=0.4, pick_up_num=3):
   train = self.get_train(image_urls)
   print(train)
   if len(train) < min_samples:
     return None
   distances = self.calculate_distance(train)
   if distances is None:
     return None
   cls = cluster.DBSCAN(metric='precomputed', min_samples=min_samples, eps=eps)
   y = cls.fit_predict(distances)
   val = pd.Series(y).value_counts()
   target_clusters_index = [x for x in list(val.index) if x != -1][:pick_up_num]
   order = {key: i for i, key in enumerate(target_clusters_index)}
   picked_up = dict([(index, val) for (index, val) in enumerate(y.tolist()) if val in target_clusters_index])
   picked_up_ = [(order[x2], image_urls[x1]) for (x1, x2) in sorted(picked_up.items(), key=lambda x: order[x[1]])]
   ret = []
   for key, subiter in itertools.groupby(picked_up_, operator.itemgetter(0)):
     vals = [item[1] for item in subiter]
     ret.append({"row_id": int(key), "sumples_num": len(vals), "vals": vals})
   return ret
Example #19
 def cluster(self, params={"alg": "KMeans", "num": 10}):
     start = time.time()
     encodedLog = self.encodedLog.values.tolist()
     if params["alg"].lower() == "kmeans":
         if not "runs" in params:
             params["runs"] = 0
         cluster = TTestKMeans2(params["num"], encodedLog)
         print("SSE : ", cluster.inertia_)
         print("Clustering Time:", time.time() - start)
         return cluster.predict(encodedLog), cluster.cluster_centers_
     elif params["alg"].lower() == "dbscan":
         cluster = skcl.DBSCAN(min_samples=params["minsamples"],
                               eps=params["eps"]).fit(encodedLog)
         y_pred = cluster.labels_
         centers = calcCenters(y_pred, encodedLog)
         print("Clustering Time:", time.time() - start)
         if "assignNoisy" in params and params["assignNoisy"] == True:
             y_pred, centers = assignNoisyPoints(y_pred, encodedLog,
                                                 centers)
         return y_pred, centers
Example #20
def cluster_pipelines(clustercount, featuredim, decompstr):
    decomp = clustervis_pipelines(featuredim)[decompstr]    
    return {            
        'Ward': Pipeline([                                
            ('decomp', decomp),
            ('clu', cluster.AgglomerativeClustering(n_clusters=clustercount, linkage='ward')),
        ]),    
        'K-Means': Pipeline([                
            ('decomp', decomp),
            ('clu', cluster.KMeans(n_clusters=clustercount, init='k-means++', max_iter=100, n_init=1)),
        ]),        
        'GMM': Pipeline([                        
            ('decomp', decomp),
            ('clu', mixture.GaussianMixture(n_components=clustercount)),
        ]),
        'DBScan': Pipeline([          
            ('decomp', decomp),
            ('clu', cluster.DBSCAN(eps=0.1, min_samples=20)),
        ]),         
    }
Example #21
def _optimize_eps(X, eps, Param, verbose=3):
    if verbose >= 3: print('[clusteval] >Evaluate using silhouette..')

    # Setup resolution
    eps = np.arange(0.1, 5, 1 / Param['epsres'])
    silscores = np.zeros(len(eps)) * np.nan
    sillclust = np.zeros(len(eps)) * np.nan
    silllabx = []

    # Run over all Epsilons
    for i in tqdm(range(len(eps))):
        # DBSCAN
        db = cluster.DBSCAN(eps=eps[i],
                            metric=Param['metric'],
                            min_samples=Param['min_samples'],
                            n_jobs=Param['n_jobs']).fit(X)
        # Get labx
        labx = db.labels_

        # Fill array
        sillclust[i] = len(np.unique(labx))
        # Store all labx
        silllabx.append(labx)
        # Compute the silhouette score only if there is more than 1 cluster
        if sillclust[i] > 1:
            silscores[i] = silhouette_score(X, db.labels_)

    # Convert to array
    silllabx = np.array(silllabx)
    # Keep only non-NaN scores within the requested range of cluster counts
    I1 = ~np.isnan(silscores)
    I2 = sillclust >= Param['min_clust']
    I3 = sillclust <= Param['max_clust']
    Iloc = I1 & I2 & I3
    # Get only those of interest
    silscores = silscores[Iloc]
    sillclust = sillclust[Iloc]
    eps = eps[Iloc]
    silllabx = silllabx[Iloc, :]
    # Return
    return (eps, sillclust, silscores, silllabx)
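
A hedged sketch of the inputs _optimize_eps() expects; only the Param keys are implied by the function body above, the data and values are guesses:

# Hypothetical inputs; only the Param keys are implied by the function above.
import numpy as np
from sklearn import cluster
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score
from tqdm import tqdm

X, _ = make_blobs(n_samples=300, centers=3, random_state=0)
Param = {'epsres': 10, 'metric': 'euclidean', 'min_samples': 5,
         'n_jobs': 1, 'min_clust': 2, 'max_clust': 10}
eps_grid, n_clust, sil, labx = _optimize_eps(X, eps=None, Param=Param)
print(eps_grid[np.argmax(sil)])      # epsilon with the best silhouette score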
Example #22
def main():
    # Create random data.
    n = 1500  # number of samples per data set.
    for i, x_y in enumerate([
            datasets.make_circles(n, factor=.5, noise=.05),
            datasets.make_moons(n_samples=n, noise=.05)
    ]):
        x, y = x_y

        # Scale data to reduce weights.
        # https://openclassrooms.com/fr/courses/4444646-entrainez-un-modele-predictif-lineaire/4507801-reduisez-l-amplitude-des-poids-affectes-a-vos-variables
        std_scale = preprocessing.StandardScaler().fit(x)
        x_scaled = std_scale.transform(x)

        # Perform DBSCAN on scaled data.
        range_eps = [0.05, 0.1, 0.2, 0.3]
        range_n_min = [5, 10, 20, 30]
        nb_plots = len(range_eps) + 1  # +1: add true clusters.
        for j, eps_n_min in enumerate(zip(range_eps, range_n_min)):
            # Perform DBSCAN on scaled data.
            e, n_min = eps_n_min
            cls = cluster.DBSCAN(eps=e, min_samples=n_min)
            cls.fit(x_scaled)

            # Plot DBSCAN.
            axis = plt.subplot(2, nb_plots, 1 + j + nb_plots * i)
            axis.scatter(x_scaled[:, 0], x_scaled[:, 1], c=cls.labels_, s=50)
            axis.set_title('eps %04.2f, n_min %02d' % (e, n_min))

        # Plot true clusters.
        axis = plt.subplot(2, nb_plots, nb_plots + nb_plots * i)
        axis.scatter(x_scaled[:, 0], x_scaled[:, 1], c=y, s=50)
        axis.set_title('true clusters')
    plt.subplots_adjust(left=0.1,
                        bottom=0.1,
                        right=0.9,
                        top=0.9,
                        wspace=0.3,
                        hspace=0.3)
    plt.suptitle('DBSCAN')
    plt.show()
Example #23
def cluster_topics():
    # Alternatives tried: cluster.Birch (threshold: lower = more clusters,
    # higher = fewer), cluster.KMeans and cluster.AffinityPropagation.
    model = cluster.DBSCAN(min_samples=2, eps=0.2)

    vectorizer = text.HashingVectorizer(
        analyzer='char_wb',  # Character n-grams, restricted to word boundaries
        norm='l2',  # Normalize the words
        lowercase=True,  # Converts everything to lowercase
        stop_words=stopwords)

    num_samples = 10000
    offset = 0

    while True:
        log.debug(u"Loading topics...")
        topic_rows = db.session.query(
            models.TopicModel.id,
            models.TopicModel.topic).filter_by(clustered=False).order_by(
                models.TopicModel.id.asc()).limit(num_samples).offset(
                    offset).all()

        if not topic_rows:
            break

        log.debug(u"Loaded {} topics".format(len(topic_rows)))

        offset += len(topic_rows)

        go_cluster(vectorizer, model, topic_rows)
Example #24
def _get_cluster_dict(peak_array, eps=30, min_samples=2):
    """Sort peaks into cluster using sklearn's DBSCAN.

    Each cluster is given its own label, with the unclustered
    having the label -1.

    Parameters
    ----------
    peak_array : 2D numpy array
        In the form [[x0, y0], [x1, y1], ...], i.e. shape (n_peaks, 2)
    eps : scalar
        For the DBSCAN clustering algorithm
    min_samples : int
        Minimum number of peaks in each cluster

    Returns
    -------
    cluster_dict : dict
        The peaks are sorted into a dict with the cluster label as the key.

    Example
    -------
    >>> import numpy as np
    >>> peak_array = np.random.randint(1000, size=(100, 2))
    >>> import pyxem.utils.cluster_tools as ct
    >>> cluster_dict = ct._get_cluster_dict(peak_array)
    >>> cluster0 = cluster_dict[0]

    """
    dbscan = cluster.DBSCAN(eps=eps, min_samples=min_samples)
    dbscan.fit(peak_array)
    label_list = dbscan.labels_

    label_unique_list = sorted(list(set(label_list)))
    cluster_dict = {}
    for label_unique in label_unique_list:
        cluster_dict[label_unique] = []

    for peak, label in zip(peak_array, label_list):
        cluster_dict[label].append(peak.tolist())
    return cluster_dict
Example #25
def dbscan(data, eps=0.3, min_samples=10):
    """DBScan clustering

    Parameters
    ----------
    data : float array
        features array

    Returns
    -------
    cl : int array
        cluster indices

    Notes
    -----
    This function requires scikit-learn
    """

    db = skcluster.DBSCAN(eps=eps, min_samples=min_samples).fit(data)
    labels = db.labels_
    return labels
Example #26
def DBSCAN(P, eps=15, minpts=10):
    pointlist = []
    for y in range(P.shape[1]):
        for x in range(P.shape[0]):
            if P[x, y] > 0:
                pointlist.append([x, y])
    pointlist = np.array(pointlist)
    db = skc.DBSCAN(eps=eps, min_samples=minpts).fit(pointlist)
    labels = db.labels_
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    cluster_list = []
    # Find the largest cluster and mark it in the output image
    for i in range(n_clusters_):
        one_cluster = pointlist[labels == i]
        cluster_list.append([len(one_cluster), one_cluster])
    cluster_list.sort(key=lambda x: x[0], reverse=True)
    P = np.zeros((P.shape[0], P.shape[1]))
    for pixel in cluster_list[0][1]:
        P[pixel[0], pixel[1]] = 1

    return P
Example #27
 def __apply_cluster_algorithms__(self, x):
     if self.algorithms == 'k-mean':
         kmeans = cluster.KMeans(n_clusters=3)
         kmeans.fit(x)
         self.labels = kmeans.labels_
         for i, label in enumerate(kmeans.labels_):
             self.clusterid_docids_map[
                 label] = self.clusterid_docids_map.get(label, []) + [i]
     elif self.algorithms == 'dbscan':
         dbscan = cluster.DBSCAN(eps=2, min_samples=3)
         dbscan.fit(x)
         self.labels = dbscan.labels_
         for i, label in enumerate(dbscan.labels_):
             self.clusterid_docids_map[
                 label] = self.clusterid_docids_map.get(label, []) + [i]
     else:
         sm_cluster = SparseMatrixClustering(
             cluster_sim_threshold=0.8, graph_manager=self.graph_manager)
         sm_cluster.fit(x)
         self.score_mat = sm_cluster.score_mat
         self.clusterid_docids_map = sm_cluster.clusterid_docids_mapping
Example #28
def getDataPandas():
    reader = pd.read_table(r'.\dbscanData.txt', header=None,
                           sep=' ')  #iterator=True,chunksize=1000)
    print(reader)

    #hello=reader.iloc[0]
    reader = reader.T  # transpose
    print(reader)
    f3 = lambda x: x / x.sum()  # normalise each original row to sum to 1
    reader = reader.apply(f3)
    reader = reader.T  # transpose back
    print(reader)

    dbscan = cluster.DBSCAN(eps=0.3,
                            min_samples=3,
                            algorithm='brute',
                            metric='euclidean')
    dbscan.fit(reader)
    res = dbscan.labels_
    print(res)
Example #29
def filter_isolated_idxs(idxs=[], maxdist=3.0):
    newidxs = idxs.copy()

    seq = np.where(idxs==True)[0]

    if len(seq):
        X = np.column_stack((seq, np.zeros(len(seq))))
        cfn = cluster.DBSCAN(eps=3, min_samples=1)
        cfn.fit(X)

        clusters = {}
        for v in np.unique(cfn.labels_):
            clusters[v] = len(cfn.labels_[cfn.labels_==v])

        maxv = sorted(clusters, key=clusters.get)[-1]

        for i in range(len(cfn.labels_)):
            if cfn.labels_[i] != maxv:
                newidxs[seq[i]] = False

    return newidxs
Example #30
def process(img: Image):
    img.thumbnail((200, 200))
    data = np.array(img.getdata())
    least_clusters = None
    clusters = None
    for i in np.linspace(5, 7, 5):
        db = cluster.DBSCAN(eps=i, min_samples=10).fit(data)
        ln = len(set(db.labels_))
        if least_clusters is None or least_clusters > ln:
            least_clusters = ln
            clusters = db.labels_
            if least_clusters <= 7:
                break

    result = []
    for i in set(clusters):
        result.append(
            list(
                map(int, list(np.round(np.average(data[clusters == i],
                                                  axis=0))))))  # quick-and-dirty
    return result
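
A minimal way to drive process() above; the file name is a placeholder, not part of the original:

# Hypothetical usage; 'photo.jpg' stands in for any RGB image on disk.
import numpy as np
from PIL import Image
from sklearn import cluster

img = Image.open('photo.jpg').convert('RGB')
palette = process(img)               # dominant colours grouped by DBSCAN
print(palette)                       # list of [r, g, b] averages per cluster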