def clusterMalwareNames(malwareNames):
    # strictly lexical clustering over malware-names
    wordCount = {}

    # create a symmetric distance matrix
    matrix = np.zeros((len(malwareNames), len(malwareNames)))
    for i in range(len(malwareNames)):
        for j in range(len(malwareNames)):
            if matrix[i, j] == 0.0:
                matrix[i, j] = computeSimilarity(malwareNames[i], malwareNames[j])
                matrix[j, i] = matrix[i, j]

    # Scikit-Learn's DBSCAN implementation to cluster the malware-names
    clust = DBSCAN(eps=0.1, min_samples=5, metric="precomputed")
    clust.fit(matrix)

    preds = clust.labels_
    clabels = np.unique(preds)

    # create Word-Count Map, skipping the noise label (-1)
    for i in range(clabels.shape[0]):
        if clabels[i] < 0:
            continue
        cmem_ids = np.where(preds == clabels[i])[0]
        cmembers = []
        for cmem_id in cmem_ids:
            cmembers.append(malwareNames[cmem_id])
        wordCount[", ".join(uniqueList(cmembers))] = len(cmem_ids)
    return wordCount
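# The function above assumes two helpers that are not shown. A minimal sketch,
# assuming computeSimilarity returns a lexical *distance* (DBSCAN with
# metric="precomputed" expects distances, so 0 means identical) and uniqueList
# de-duplicates while preserving order. Both bodies below are hypothetical
# stand-ins, not the original implementations.
def computeSimilarity(a, b, n=3):
    # 1 - Jaccard similarity of character n-grams
    grams_a = {a[i:i + n] for i in range(max(1, len(a) - n + 1))}
    grams_b = {b[i:i + n] for i in range(max(1, len(b) - n + 1))}
    union = grams_a | grams_b
    if not union:
        return 0.0
    return 1.0 - len(grams_a & grams_b) / len(union)

def uniqueList(items):
    # de-duplicate while preserving first-seen order
    seen = set()
    return [x for x in items if not (x in seen or seen.add(x))]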
def cluster():
    eps_set = 0.5 * np.arange(1, 7)
    npt_set = np.arange(1, 6)
    scores = []
    global res
    res = []
    for eps in eps_set:
        for npt in npt_set:
            est = DBSCAN(eps=eps, min_samples=npt)
            est.fit(x)
            ari = metrics.adjusted_rand_score(y, est.labels_)
            scores.append(ari)
            n_noise = len([l for l in est.labels_ if l == -1])
            res.append((ari, np.max(est.labels_) + 1, n_noise))
            print(ari)

    max_score = np.max(scores)
    max_idx = scores.index(max_score)
    # integer division recovers the (eps, min_samples) pair from the flat index
    max_eps = eps_set[max_idx // len(npt_set)]
    max_npt = npt_set[max_idx % len(npt_set)]
    print(max_score, max_eps, max_npt)

    scores = np.array(scores).reshape(len(eps_set), len(npt_set))
    # pl.cm.spectral was removed in matplotlib 2.2; nipy_spectral is its successor
    pl.imshow(scores, interpolation='nearest', cmap=pl.cm.nipy_spectral)
    pl.colorbar()
    pl.xticks(np.arange(len(npt_set)), npt_set)
    pl.yticks(np.arange(len(eps_set)), eps_set)
    pl.ylabel('eps')
    pl.xlabel('min_samples')
    pl.show()
def find_tracks(data, eps=20, min_samples=20):
    """Applies the DBSCAN algorithm from scikit-learn to find tracks in the data.

    Parameters
    ----------
    data : array-like
        An array of (x, y, z, hits) data points
    eps : number, optional
        The maximum distance between adjacent points in a cluster
    min_samples : number, optional
        The min number of points in a cluster

    Returns
    -------
    tracks : list
        A list of tracks. Each track is an ndarray of points.
    """
    xyz = data[:, 0:3]
    dbs = DBSCAN(eps=eps, min_samples=min_samples)
    dbs.fit(xyz)

    tracks = []
    # group point indices by cluster label, skipping the noise label (-1)
    for track in (np.where(dbs.labels_ == n)[0]
                  for n in np.unique(dbs.labels_) if n != -1):
        tracks.append(data[track])

    return tracks
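# A minimal usage sketch for find_tracks, assuming synthetic (x, y, z, hits)
# rows; the data layout and parameter values are illustrative only.
import numpy as np

rng = np.random.default_rng(0)
track_a = np.column_stack([rng.normal(0, 1, (50, 3)), np.ones(50)])
track_b = np.column_stack([rng.normal(100, 1, (50, 3)), np.ones(50)])
data = np.vstack([track_a, track_b])

tracks = find_tracks(data, eps=5, min_samples=10)
print(len(tracks))  # expected: 2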
def test():
    global est
    est = DBSCAN(eps=1, min_samples=1)
    est.fit(x)
    print(est.labels_)
    ari = metrics.adjusted_rand_score(y, est.labels_)
    print(ari)
def train_dbscan():
    print("starting dbscan clustering...")
    model = DBSCAN(eps=dbs_eps, min_samples=dbs_min_samples,
                   metric=dbs_metric, algorithm='auto')
    model.fit(X)

    core_points = model.core_sample_indices_
    if output_core_points:
        print("core points data index")
        print(core_points)
    print("num of core points %d" % (len(core_points)))

    print("all points cluster index")
    cluster_index = model.labels_
    if output_cluster_members:
        # print(cluster_index)
        cluster_members = {}
        for i, c in enumerate(cluster_index):
            index_list = cluster_members.get(c, list())
            index_list.append(i)
            cluster_members[c] = index_list
        for cl, indx_list in cluster_members.items():
            # DBSCAN numbers clusters from 0; only label -1 is noise
            if cl >= 0:
                print("cluster index %d size %d" % (cl, len(indx_list)))
            else:
                print("noise points size %d" % (len(indx_list)))
            print(indx_list)

    print("num of clusters %d" % (cluster_index.max() + 1))
def classify_core(self, N_CLUSTERS, clusterType, data_for_trial_type, begin_time, end_time):
    BEGIN_TIME_FRAME = begin_time * self.griddy.TIME_GRID_SPACING
    END_TIME_FRAME = end_time * self.griddy.TIME_GRID_SPACING

    data = data_for_trial_type[:, BEGIN_TIME_FRAME:END_TIME_FRAME, self.griddy.VEL_X]

    labels = None
    if clusterType == 'kmeans':
        kmeans = KMeans(n_clusters=N_CLUSTERS)
        kmeans.fit(data)
        labels = kmeans.labels_
    elif clusterType == 'affinity_propagation':
        ap = AffinityPropagation(damping=0.75)
        ap.fit(data)
        labels = ap.labels_
        N_CLUSTERS = np.max(labels) + 1
    elif clusterType == 'DBSCAN':
        dbscan = DBSCAN()
        dbscan.fit(data)
        labels = dbscan.labels_
        N_CLUSTERS = np.max(labels) + 1
        print('N_CLUSTERS=' + str(N_CLUSTERS))
    elif clusterType == 'AgglomerativeClustering':
        ac = AgglomerativeClustering(n_clusters=N_CLUSTERS)
        ac.fit(data)
        labels = ac.labels_
    else:
        print('ERROR: clusterType: ' + clusterType + ' is not recognized')

    return (labels, N_CLUSTERS)
def cluster_mappings(vector_inpath, do_pca=False, target_dim=100,
                     indices_inpath=None, epsilon=2.5, min_s=20):
    # TODO: Clustering parameters
    # TODO: Metric - cosine similarity or euclidean distance
    print(alt("Load mappings..."))
    indices, model = load_mappings_from_model(vector_inpath)
    X = numpy.array([model[key] for key in indices])
    # del model
    if do_pca:
        print(alt("Truncate vectors with PCA to %i dimensions..." % (target_dim)))
        pca = PCA(n_components=target_dim)
        pca.fit(X)
        X = pca.transform(X)
    print(alt("Cluster points..."))
    # k = 2 * X[0].shape[0] - 1
    # min_pts = k + 1
    # dbscan = DBSCAN(eps=0.1, min_samples=20, metric='cosine', algorithm='brute')
    dbscan = DBSCAN(eps=epsilon, min_samples=min_s)
    dbscan.fit(X)
    labels = dbscan.labels_
    print(get_cluster_size(labels))
    print(alt("Finished clustering!"))
    sscore = silhouette_score(X, labels)
    print("Silhouette Coefficient: %0.3f" % (sscore))
    if indices_inpath:
        resolve_indices(indices, labels, indices_inpath, model)
def dbscan_algo(self, cluster, X=None):
    if self.dMetric == 'levenstein':
        clust = DBSCAN(eps=self.epsilon, min_samples=1, metric="precomputed")
        clust.fit(X)
    else:
        vectorizer = TfidfVectorizer().fit_transform(cluster)
        dataX = TfidfTransformer(norm='l1', smooth_idf=True, use_idf=True,
                                 sublinear_tf=False).fit_transform(vectorizer)
        clust = DBSCAN(eps=self.epsilon, metric="cosine", min_samples=3,
                       algorithm='brute')
        clust.fit(dataX)

    companyNames = cluster
    preds = clust.labels_
    clabels = np.unique(preds)
    for i in range(clabels.shape[0]):
        if clabels[i] < 0:
            continue
        cmem_ids = np.where(preds == clabels[i])[0]
        cmembers = []
        for cmem_id in cmem_ids:
            cmembers.append(companyNames[cmem_id])
        clusteritems = ",".join(cmembers)
        print(clusteritems)
        if len(cmem_ids) > 1:
            self.result.write("Clustered: %s" % clusteritems)
            self.result.write('\n')
def on_squaremsg_received(self, msg):
    detected_squares = []
    for square_msg in msg.squares:
        detected_squares.append(TrackedSquare.from_msg(square_msg))

    self._prev_squares.append(detected_squares)

    all_squares = list(itertools.chain.from_iterable(self._prev_squares))
    square_centers = [list(s.center) + [s.hue] for s in all_squares]
    data = np.array(square_centers)

    ms = DBSCAN(eps=64, min_samples=3)
    ms.fit(data)
    labels = ms.labels_

    ts_msg = TrackedSquares()
    for i, s in enumerate(all_squares):
        label = int(labels[i])  # np.int0 was removed in NumPy 2.0
        if label < 0:
            continue

        s.tracking_colour = TrackedSquare.TRACKING_COLOURS[label % len(TrackedSquare.TRACKING_COLOURS)]
        s.tracking_detected = True

        ts_msg.squares.append(s.to_msg())

    self._squares_pub.publish(ts_msg)
def cluster_dbscan(self, calpha=False, cluster_diameter=6, cluster_min_size=10):
    '''
    Cluster the residues using the DBSCAN method. The parameters here are
    the neighborhood diameter (eps) and neighborhood connectivity (min_samples).

    Returns a list of cluster labels, in which label ``-1`` means an outlier
    point, which doesn't belong to any cluster.
    '''
    if not self.positive_residues:
        return {}

    if calpha:
        data_atoms = self.positive_residues.select('ca')
    else:
        data_atoms = self.positive_residues.select('sidechain or ca').copy()

    assert (data_atoms.getHierView().numResidues() ==
            self.positive_residues.getHierView().numResidues())

    OUTLIER_LABEL = -1

    db_clust = DBSCAN(eps=cluster_diameter, min_samples=cluster_min_size)
    db_clust.fit(data_atoms.getCoords())

    db_labels = db_clust.labels_.astype(int)
    # print(db_labels, len(db_labels))

    if calpha:
        residue_labels = db_labels
    else:
        residues = list(data_atoms.getHierView().iterResidues())
        residue_labels = np.zeros(len(residues), dtype=int)

        def most_common(lst):
            lst = list(lst)
            return max(set(lst) or [OUTLIER_LABEL], key=lst.count)

        data_atoms.setBetas(db_labels)
        for i, res in enumerate(residues):
            atom_labels = res.getBetas()
            residue_labels[i] = most_common(atom_labels[atom_labels != OUTLIER_LABEL])

    assert len(residue_labels) == self.positive_residues.getHierView().numResidues()

    residue_numbers = self.positive_residues.ca.getResnums()
    clusters = sorted(
        [residue_numbers[residue_labels == i] for i in set(residue_labels) if i != -1],
        key=self.conf_sum,
        reverse=True,
    )
    return dict(enumerate(clusters))
def fit(fvecs, params):
    eps_ = float(params[0])  # eps is a distance, so parse as float, not int
    min_s = int(params[1])
    metric_ = params[2]
    # metric: "euclidean", "l1", "l2", "manhattan", "cosine", or "precomputed"
    model = DBSCAN(eps=eps_, min_samples=min_s, metric=metric_)
    model.fit(fvecs)
    print(len(set(model.labels_)))
    return model.labels_
def dbscan(self, eps=0.75, min_samples=3):
    """
    :param eps: max dist between points in same neighbourhood
    :param min_samples: number of points in a neighbourhood
    :return: Partition of cluster labels
    """
    est = DBSCAN(metric='precomputed', eps=eps, min_samples=min_samples)
    est.fit(self.get_dm(False))
    return Partition(est.labels_)
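# Because the estimator above uses metric='precomputed', get_dm must return a
# square pairwise distance matrix. A standalone sketch of the same call
# pattern, with a hypothetical distance matrix built via scipy:
import numpy as np
from scipy.spatial.distance import pdist, squareform
from sklearn.cluster import DBSCAN

points = np.random.default_rng(1).normal(size=(20, 2))
dm = squareform(pdist(points))  # square, symmetric, zero diagonal

est = DBSCAN(metric='precomputed', eps=0.75, min_samples=3)
est.fit(dm)
print(est.labels_)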
def db(lngs, lats, city, cluster_diameter):
    city_area = city["area"]
    city_lng = city["lng"]
    city_lat = city["lat"]

    # scale longitudes by cos(latitude) so degree distances are roughly
    # isotropic (note: math.cos expects radians, so city_lat should be in
    # radians here)
    lngs = np.array(lngs) * math.cos(city_lat)

    dbscan = DBSCAN(metric='euclidean')
    dbscan.fit(np.array([lngs, lats]).transpose())
    cluster_labels = np.array(dbscan.labels_)
    return labels_to_index(cluster_labels)
def define_clusts(similarity_matrix, threshold=0.05, max_iter=200, method='ap'):
    """Define clusters given the similarity matrix and the threshold."""
    n, labels = connected_components(similarity_matrix, directed=False)
    prev_max_clust = 0
    print("connected components: %d" % n)
    clusters = labels.copy()

    if method == 'dbscan':
        ap = DBSCAN(metric='precomputed', min_samples=1, eps=.2, n_jobs=-1)
    if method == 'ap':
        ap = AffinityPropagation(affinity='precomputed', max_iter=max_iter,
                                 preference='median')

    for i in range(n):
        idxs = np.where(labels == i)[0]
        if idxs.shape[0] > 1:
            sm = similarity_matrix[idxs][:, idxs]
            sm += sm.T + scipy.sparse.eye(sm.shape[0])

            # Hierarchical clustering
            if method == 'hc':
                dists = squareform(1 - sm.toarray())
                links = fastcluster.linkage(dists, method='ward')
                try:
                    clusters_ = fcluster(links, threshold, 'distance')
                except ValueError as err:
                    logging.critical(err)
                    clusters_ = np.zeros(1, dtype=int)
            # DBSCAN
            elif method == 'dbscan':
                db = ap.fit(1. - sm.toarray())
                # Number of clusters in labels, ignoring noise if present.
                clusters_ = db.labels_
                # n_clusters_ = len(set(clusters_)) - int(0 in clusters_)
            # AffinityPropagation
            # ap = AffinityPropagation(affinity='precomputed')
            elif method == 'ap':
                db = ap.fit(sm)
                clusters_ = db.labels_
            else:
                raise ValueError("clustering method %s unknown" % method)

            if np.min(clusters_) == 0:
                clusters_ += 1
            clusters_ += prev_max_clust
            clusters[idxs] = clusters_
            prev_max_clust = max(clusters_)
        else:  # connected component contains just 1 element
            prev_max_clust += 1
            clusters[idxs] = prev_max_clust
    return np.array(extra.flatten(clusters))
def score_sam(min_val, max_val, incr=1):
    sam_range = range(min_val, max_val, incr)
    scores = []
    for k in sam_range:
        db = DBSCAN(eps=2, min_samples=k)
        db.fit(X_scaled)
        if len(set(db.labels_)) > 1:
            scores.append(metrics.silhouette_score(X_scaled, db.labels_))
        else:
            scores.append(0)
    return scores
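# A usage sketch for score_sam, assuming the module-level X_scaled that the
# function reads; the blob data below is an illustrative stand-in.
import numpy as np
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

X, _ = make_blobs(n_samples=300, centers=3, random_state=0)
X_scaled = StandardScaler().fit_transform(X)

scores = score_sam(2, 30, 2)
plt.plot(range(2, 30, 2), scores, marker='o')
plt.xlabel('min_samples')
plt.ylabel('silhouette score')
plt.show()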
def simple_clustering(x):
    print(alt("Current parameters: %s" % (str(x))))
    dbscan = DBSCAN(eps=x[0], min_samples=x[1], p=x[2])
    dbscan.fit(X)
    cluster_sizes = get_cluster_size(dbscan.labels_)
    print(alt("Current cluster sizes: %s" % (cluster_sizes)))
    sscore = silhouette_score(X, dbscan.labels_)
    tscore = (sscore / (len(cluster_sizes.keys()) - 1))
    print(alt("Current value of objective function: %.5f" % (tscore)))
    print("-" * 50)
    return -1.0 * tscore
def dbscan(self):
    """
    Use DBSCAN to perform clustering in this chunk
    """
    # Set up DBSCAN
    db = DBSCAN(eps=self.neighborhood, min_samples=self.min_members)

    # Perform the clustering
    db.fit(self.galaxy_coordinates)

    # Save the labels
    np.savetxt(self.clusterfile, db.labels_, fmt='%d')
def cluster_DB(classif_data, vect_data):
    db = DBSCAN()

    np_arr_train = np.array(vect_data["train_vect"])
    np_arr_label = np.array(classif_data["topics"])
    np_arr_test = np.array(vect_data["test_vect"])

    print("DB")
    db.fit(np_arr_train)

    sil_score = metrics.silhouette_score(np_arr_train, db.labels_, metric='euclidean')
    print(sil_score)

    return db.labels_
class ClusterAnalysis:
    '''
    To get results
    c.image_clusters(areas=True, only_synapses=True)
    '''
    DBSCAN_EPS = 6
    DB_SCAN_MIN_SAMPLES = 4
    MIN_CLUSTER_MEMBERS = 25

    def __init__(self, ves, mem, syn):
        self.ves = to_bool_arr(ves)
        self.mem = to_bool_arr(mem)
        self.syn = to_bool_arr(syn)
        self.psyn = self.syn  # & self.mem
        self.nsyn = ~self.syn & self.mem
        # list(zip(...)) so numpy sees a concrete sequence (Python 3 zip is lazy)
        self.psynpoints = np.asarray(list(zip(*self.psyn.nonzero())))
        self.db = DBSCAN(eps=self.DBSCAN_EPS, min_samples=self.DB_SCAN_MIN_SAMPLES)
        self.db.fit(self.psynpoints)
        self.clusters = self._get_clusters()

    def _get_clusters(self):
        clus = {}
        labels = self.db.labels_
        for label in set(labels):
            if label == -1:
                continue
            members = self.psynpoints[labels == label]
            if len(members) < self.MIN_CLUSTER_MEMBERS:
                continue
            clus[len(clus)] = Cluster(members, len(clus), self)
        return clus

    def image_clusters(self, show_only_one=None, areas=False, background=None,
                       only_synapses=False):
        '''Background has to be a PIL.Image. Returns Image'''
        m = np.zeros_like(self.psyn)
        for i, c in self.clusters.items():
            if show_only_one is not None and i != show_only_one:
                continue
            if only_synapses and not c.synapse:
                continue
            c = c.members if not areas else c.area
            m[c[:, 0], c[:, 1]] = 1
        if not background:
            return Image.fromarray((m * 255).astype(np.uint8))
        else:
            background = background.convert('RGB')
            bg = background.load()
            for p in zip(*m.T.nonzero()):
                bg[p] = (255, 0, 0)
            return background
def main(args):
    linesFile = sys.stdin
    if len(args) > 1:
        linesFile = open(args[1], 'r')

    allFeatures = []
    allFilenames = []
    filename = linesFile.readline()
    while len(filename) > 0:
        dataStr = linesFile.readline()
        features = np.fromstring(dataStr, dtype=int, sep=' ')
        features = features[:len(features) // 2]
        allFeatures.append(features)
        allFilenames.append(filename.strip())
        filename = linesFile.readline()

    print('finished reading all filenames. clustering...')
    # note: modern scikit-learn's DBSCAN takes no random_state parameter
    dbscan = DBSCAN(eps=1100, min_samples=2)
    dbscan.fit(np.atleast_2d(allFeatures))
    print('num clusters', len(set(dbscan.labels_)))

    homeDir = os.path.expanduser('~')
    groupsDir = os.path.join(homeDir, 'groups')
    templatesDir = os.path.join(groupsDir, 'templates')
    shutil.rmtree(groupsDir)
    os.mkdir(groupsDir)
    os.mkdir(templatesDir)
    for label, filename in zip(dbscan.labels_, allFilenames):
        label = str(int(label))
        groupFolder = os.path.join(groupsDir, label)
        isFirstInstance = not os.path.isdir(groupFolder)
        if isFirstInstance:
            os.mkdir(groupFolder)
        originalImage = cv2.imread(os.path.join("/Users/huipeng/EO990RW8/", filename), 0)
        height, width = originalImage.shape
        resizedImage = cv2.resize(originalImage, (width // 4, height // 4))
        newFilename = os.path.join(groupFolder, filename + '.png')
        cv2.imwrite(newFilename, resizedImage)

        if isFirstInstance and label != '-1':
            newFilename = os.path.join(templatesDir, label + '_' + filename + '.png')
            cv2.imwrite(newFilename, resizedImage)

    print('finished')
def _aglom_cluster(self):
    # https://github.com/overlap-ai/words2map/blob/master/words2map.py
    print('_aglom_cluster')
    size = self.opts['size'] * self.opts['size']
    print(size)
    # cluster = AgglomerativeClustering(n_clusters=size)
    # cluster = Birch(n_clusters=size)
    cluster = DBSCAN(eps=0.3, min_samples=10)
    # X = self.X[:10000]
    # cluster.fit(X)
    cluster.fit(self.X)
    return cluster.labels_
class MinHashDBSCAN():
    def __init__(self, eps=0.5, min_samples=5, algorithm='auto',
                 leaf_size=30, p=None, random_state=None, fast=False,
                 n_neighbors=5, radius=1.0, number_of_hash_functions=400,
                 max_bin_size=50, minimal_blocks_in_common=1, shingle_size=4,
                 excess_factor=5, number_of_cores=None, chunk_size=None):
        self.eps = eps
        self.min_samples = min_samples
        # self.metric = metric
        self.algorithm = algorithm
        self.leaf_size = leaf_size
        self.p = p
        self.random_state = random_state
        self.radius = radius
        self.fast = fast
        self.number_of_hash_functions = number_of_hash_functions
        self.max_bin_size = max_bin_size
        self.minimal_blocks_in_common = minimal_blocks_in_common
        self.shingle_size = shingle_size
        self.excess_factor = excess_factor
        self.number_of_cores = number_of_cores
        self.chunk_size = chunk_size
        self.n_neighbors = n_neighbors
        # note: recent scikit-learn versions removed DBSCAN's random_state
        # parameter, so it is no longer passed through here
        self._dbscan = DBSCAN(eps=self.eps, min_samples=min_samples,
                              metric='precomputed', algorithm=self.algorithm,
                              leaf_size=self.leaf_size, p=self.p)
        self.labels_ = None  # only for compatibility

    def fit(self, X, y=None):
        minHashNeighbors = MinHash(
            n_neighbors=self.n_neighbors, radius=self.radius, fast=self.fast,
            number_of_hash_functions=self.number_of_hash_functions,
            max_bin_size=self.max_bin_size,
            minimal_blocks_in_common=self.minimal_blocks_in_common,
            shingle_size=self.shingle_size, excess_factor=self.excess_factor,
            number_of_cores=self.number_of_cores, chunk_size=self.chunk_size,
            similarity=False)
        minHashNeighbors.fit(X, y)
        graph_result = minHashNeighbors.kneighbors_graph(mode='distance')
        self._dbscan.fit(graph_result)
        self.labels_ = self._dbscan.labels_

    def fit_predict(self, X, y=None):
        self.fit(X, y)
        return self.labels_
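# A usage sketch, assuming a sparse binary feature matrix; note that the
# MinHash neighbors class used inside fit() comes from the surrounding
# project and is not shown here, so this only illustrates the call pattern.
import numpy as np
from scipy.sparse import csr_matrix

rng = np.random.default_rng(0)
X = csr_matrix((rng.random((100, 500)) < 0.05).astype(int))

model = MinHashDBSCAN(eps=0.5, min_samples=5)
labels = model.fit_predict(X)
print(np.unique(labels))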
def find_example_points(self):
    """Finds exemplar data points for each cluster"""
    # Train DBSCAN
    dbscan = DBSCAN()
    dbscan.fit(self.IMAGE_STORE)
    labels = dbscan.labels_
    unique, indices = np.unique(labels, return_index=True)

    # Remove the 'noise' example data point
    index = np.where(unique == -1)
    unique = np.delete(unique, index)
    indices = np.delete(indices, index)

    # np.vstack takes a single sequence of arrays, not two positional arguments
    return np.vstack((unique, self.IMAGE_STORE[indices]))
def dbscan_outliers(data, genes, eps, min_samples, max_samples=1, as_json=True):
    db = DBSCAN(eps=eps, min_samples=min_samples)
    # sd_scaler = StandardScaler()
    res = dr.get_dataset_ensembl_info()
    outliers_id = []
    for g in genes:
        # scaled = sd_scaler.fit(data.loc[g, :])
        fit = db.fit(np.reshape(data.loc[g, :], (196, 1)))
        # note: scipy.stats.itemfreq was removed in SciPy 1.3;
        # np.unique(..., return_counts=True) is the modern replacement
        candidates = itemfreq(fit.labels_)

        try:
            class_zero = candidates[0][1]
            class_one = candidates[1][1]

            support = min(class_one, class_zero)

            if min_samples < support <= max_samples:
                info = [gene for gene in res if gene.ensemblgeneid == g][0]
                formatted_info = {"id": g, "name": info.genename,
                                  "type": info.genetype,
                                  "samples": str(support), "distance": "NA"}
                jinfo = json.dumps(formatted_info)
                jinfo += ","
                outliers_id.append(g)
                print("outlier found :" + g)
                if as_json:
                    yield (jinfo)
                else:
                    yield (formatted_info)
        except IndexError:  # fewer than two label classes for this gene
            pass
def dbscan_outliers(df):
    """
    Find outliers (noise points) using DBSCAN.

    Parameters
    ----------
    df: A pandas.DataFrame

    Returns
    -------
    A tuple of (a sklearn.DBSCAN instance, a pandas.DataFrame)
    """
    scaler = StandardScaler()
    scaler.fit(df)
    scaled = scaler.transform(df)

    dbs = DBSCAN()
    db = dbs.fit(scaled)
    outliers = dbs.fit_predict(scaled)

    # DBSCAN marks noise points with label -1; pandas .ix is deprecated, so
    # select the noise rows with .iloc
    df_o = df.iloc[np.where(outliers == -1)]

    return db, df_o
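# A quick usage sketch with synthetic data containing one obvious outlier
# (values are illustrative).
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(0, 0.1, size=(100, 2)), columns=['x', 'y'])
df.loc[100] = [10.0, 10.0]  # a clear outlier

model, outliers = dbscan_outliers(df)
print(outliers)  # should contain only the point at (10, 10)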
def cluster_tweets(tweets):
    # TODO get TFIDF vector
    # do clustering
    ner_tags = [get_ner_tags(tweet).tolist() for tweet in tweets['tweet']]
    vectorizer = TfidfVectorizer(preprocessor=_dummy_preprocess,
                                 tokenizer=lambda x: x, binary=True,
                                 min_df=0, use_idf=True, smooth_idf=True)
    tfidf = vectorizer.fit_transform(ner_tags)
    # ner_tags = [get_ner_tags(tweet) for tweet in tweets['tweet']]

    print("clustering started")
    t0 = time()
    # cluster = AgglomerativeClustering(n_clusters=3, affinity="cosine")
    # cluster = MiniBatchKMeans(n_clusters=10, max_iter=100, batch_size=100)
    # metric=sklearn.metrics.pairwise.cosine_distances
    cluster = DBSCAN(min_samples=2, eps=0.5)
    clustered = cluster.fit(tfidf.todense())
    # clustered = cluster.fit(ner_tags)
    labels = clustered.labels_
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print("clustering finished in %.3f seconds" % (time() - t0))
    print("%d clusters detected" % n_clusters_)

    tweets['cluster'] = labels
    tweets['ner'] = ner_tags
    return tweets
def cluster_points(XY):
    """Find clusters of points in XY and return a list of the indices of the
    points in each cluster.
    """
    # Use a density based sort to separate points into distinct
    # clusters, each one corresponding to a distinct root.
    # There is an edge case here in that roots can become
    # degenerate: as the roots come closer together, they will
    # be treated as a single root cluster at some non-zero
    # separation distance.
    db = DBSCAN(eps=r, min_samples=2)
    db.fit(XY)

    max_label = int(db.labels_.max())
    labels = range(max_label + 1)
    where_label = [np.where(db.labels_ == label) for label in labels]

    return where_label
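# eps=r above refers to a module-level radius defined elsewhere. A minimal
# sketch of calling the function, with an assumed value for r:
import numpy as np

r = 0.5  # assumed module-level neighbourhood radius
rng = np.random.default_rng(2)
XY = np.vstack([rng.normal((0, 0), 0.1, (20, 2)),   # cloud around one root
                rng.normal((5, 5), 0.1, (20, 2))])  # cloud around another

for idx in cluster_points(XY):
    print(XY[idx].mean(axis=0))  # approximate root location per cluster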
def execute(self, data):
    dbsc = DBSCAN(eps=2.828, min_samples=2)
    # print(data)
    dbsc.fit(data)
    clusters = dbsc.labels_
    # print(clusters)
    clustering = get_clust_dict(clusters, data)
    # print(clustering)
    het = mean_inter_het(clustering)
    hom = mean_intra_hom(clustering)
    op = pd.DataFrame({'het': [het], 'hom': [hom]})
    op.to_csv('stat.csv', sep=',', encoding='utf-8')
    return data
def update_location_centroid(point, cluster, max_distance, min_samples):
    """ Updates the centroid of a location cluster with another point

    Args:
        point (:obj:`Point`): Point to add to the cluster
        cluster (:obj:`list` of :obj:`Point`): Location cluster
        max_distance (float): Max neighbour distance
        min_samples (int): Minimum number of samples
    Returns:
        (:obj:`Point`, :obj:`list` of :obj:`Point`): Tuple with the location
            centroid and new point cluster (given cluster + given point)
    """
    cluster.append(point)
    points = [p.gen2arr() for p in cluster]

    # Estimates the epsilon
    eps = estimate_meters_to_deg(max_distance, precision=6)

    p_cluster = DBSCAN(eps=eps, min_samples=min_samples)
    p_cluster.fit(points)

    clusters = {}
    for i, label in enumerate(p_cluster.labels_):
        if label in clusters.keys():
            clusters[label].append(points[i])
        else:
            clusters[label] = [points[i]]

    centroids = []
    biggest_centroid_l = -float("inf")
    biggest_centroid = None

    for label, n_cluster in clusters.items():
        centroid = compute_centroid(n_cluster)
        centroids.append(centroid)

        if label >= 0 and len(n_cluster) >= biggest_centroid_l:
            biggest_centroid_l = len(n_cluster)
            biggest_centroid = centroid

    if biggest_centroid is None:
        biggest_centroid = compute_centroid(points)

    return biggest_centroid, cluster
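# estimate_meters_to_deg and compute_centroid come from the surrounding
# library and are not shown. A rough sketch of what they plausibly do; the
# conversion assumes ~111,320 m per degree of latitude, and both bodies are
# assumptions rather than the library's actual code.
import numpy as np

def estimate_meters_to_deg(meters, precision=6):
    # ~111,320 meters per degree of latitude at the equator
    return round(meters / 111320.0, precision)

def compute_centroid(points):
    # element-wise mean of the point array
    return np.mean(np.array(points), axis=0)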
def FindClusters(cfg, cfg_old, eps=1.5, min_samples=6, periodic=False, box=[]):
    """
    Find density clusters in the configuration `cfg`.
    """
    if not periodic:
        clf = DBSCAN(eps=eps, min_samples=min_samples)
    else:
        myMetric = metric.PeriodicMetric(box)
        algo = 'brute'
        clf = DBSCAN(eps=eps, min_samples=min_samples, algorithm=algo,
                     metric=myMetric.Distance)
    clf.fit(cfg)

    # Plot clusters.
    mySet = set(clf.labels_)
    print(mySet)
    PlotClusters(cfg, cfg_old, clf.labels_, box)
import pandas as pd

# Importing the dataset
from sklearn.datasets import load_iris
iris = load_iris()

from sklearn.cluster import DBSCAN
dbscan = DBSCAN()
print(dbscan)
"""
DBSCAN(eps=0.5, metric='euclidean', min_samples=5)
(output shown by an older scikit-learn version)
"""

dbscan.fit(iris.data)
dbscan.labels_

# Visualising the clusters
# as data is in 3d space, we need to apply PCA for 2d plotting
from sklearn.decomposition import PCA
pca = PCA(n_components=2).fit(iris.data)
pca_2d = pca.transform(iris.data)
explained_variance = pca.explained_variance_ratio_

df_pca = pd.DataFrame(pca.components_, columns=iris.feature_names,
                      index=['PC-1', 'PC-2'])

"""
# alternative way, fit_transform
for cluster in clusters:
    row_ix = where(yhat == cluster)
    pyplot.scatter(X[row_ix, 0], X[row_ix, 1])
pyplot.show()

# k-means clustering
from numpy import unique
from sklearn.cluster import KMeans

X, _ = make_classification(n_samples=1000, n_features=2, n_informative=2,
                           n_redundant=0, n_clusters_per_class=1, random_state=4)
model = KMeans(n_clusters=2)
model.fit(X)
yhat = model.predict(X)
clusters = unique(yhat)
for cluster in clusters:
    row_ix = where(yhat == cluster)
    pyplot.scatter(X[row_ix, 0], X[row_ix, 1])
pyplot.show()

# gaussian mixture clustering
from numpy import unique
from sklearn.mixture import GaussianMixture

X, _ = make_classification(n_samples=1000, n_features=2, n_informative=2,
                           n_redundant=0,
def findLocalExtrema(da, highVal=0, lowVal=1000, eType='Low'):
    """
    Utility function to find local low/high field variable coordinates on a
    contour map. To classify as a local high, the data point must be greater
    than highVal, and to classify as a local low, the data point must be less
    than lowVal.

    Args:
        da: (:class:`xarray.DataArray`):
            Xarray data array containing the lat, lon, and field variable
            (ex. pressure) data values
        highVal (:class:`int`):
            Data value that the local high must be greater than to qualify as
            a "local high" location. Default highVal is 0.
        lowVal (:class:`int`):
            Data value that the local low must be less than to qualify as a
            "local low" location. Default lowVal is 1000.
        eType (:class:`str`):
            'Low' or 'High'
            Determines which extrema are being found - minimum or maximum,
            respectively. Default eType is 'Low'.

    Returns:
        clusterExtremas (:class:`list`):
            List of coordinate tuples in GPS form (lon in degrees, lat in
            degrees) that specify local low/high locations
    """
    # Create a 2D array of coordinates in the same shape as the field variable
    # data so each coordinate is easily mappable to a data value
    # ex:
    # (1, 1), (2, 1), (3, 1)
    # (1, 2)................
    # (1, 3)................
    lons, lats = np.meshgrid(np.array(da.lon), np.array(da.lat))
    coordarr = np.dstack((lons, lats))

    # Find all zeroes that also qualify as low or high values
    extremacoords = []

    if eType == 'Low':
        coordlist = np.argwhere(da.data < lowVal)
        extremacoords = [tuple(coordarr[x[0]][x[1]]) for x in coordlist]
    if eType == 'High':
        coordlist = np.argwhere(da.data > highVal)
        extremacoords = [tuple(coordarr[x[0]][x[1]]) for x in coordlist]

    if extremacoords == []:
        if eType == 'Low':
            warnings.warn('No local extrema with data value less than given lowVal')
            return []
        if eType == 'High':
            warnings.warn('No local extrema with data value greater than given highVal')
            return []

    # Clean up noisy data to find actual extrema
    # Use density-based spatial clustering of applications with noise
    # to cluster and label coordinates
    db = DBSCAN(eps=10, min_samples=1)
    new = db.fit(extremacoords)
    labels = new.labels_

    # Create a dictionary of values with key being coordinate
    # and value being cluster label.
    coordsAndLabels = {label: [] for label in labels}
    for label, coord in zip(labels, extremacoords):
        coordsAndLabels[label].append(coord)

    # Initialize array of coordinates to be returned
    clusterExtremas = []

    # Iterate through the coordinates in each cluster
    for key in coordsAndLabels:
        # Create array to hold all the field variable values for that cluster
        datavals = []
        for coord in coordsAndLabels[key]:
            # Find pressure data at that coordinate
            cond = np.logical_and(coordarr[:, :, 0] == coord[0],
                                  coordarr[:, :, 1] == coord[1])
            x, y = np.where(cond)
            datavals.append(da.data[x[0]][y[0]])

        # Find the index of the smallest/greatest field variable value of each cluster
        if eType == 'Low':
            index = np.argmin(np.array(datavals))
        if eType == 'High':
            index = np.argmax(np.array(datavals))

        # Append the coordinate corresponding to that index to the array to be returned
        clusterExtremas.append((coordsAndLabels[key][index][0],
                                coordsAndLabels[key][index][1]))

    return clusterExtremas
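# A small usage sketch for findLocalExtrema with a synthetic pressure field;
# the two minima below are placed by hand and all values are illustrative.
import numpy as np
import xarray as xr

lat = np.arange(-10, 11, 1.0)
lon = np.arange(-10, 11, 1.0)
field = np.full((len(lat), len(lon)), 1010.0)
field[5, 5] = 990.0    # one local low
field[15, 15] = 985.0  # another local low

da = xr.DataArray(field, coords={'lat': lat, 'lon': lon}, dims=['lat', 'lon'])
print(findLocalExtrema(da, lowVal=1000, eType='Low'))  # two (lon, lat) tuples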
subset = lojas_df[[
    'encrypted_5_zipcode', 'produtos_vendidos', 'transacoes_total',
    'faturamento_total'
]].dropna()

# min-max scaling
subset = (subset - subset.min()) / (subset.max() - subset.min())

# z-score scaling (alternative)
# subset = (subset - subset.mean()) / subset.std()

X = np.column_stack([
    subset.encrypted_5_zipcode, subset.produtos_vendidos,
    subset.transacoes_total, subset.faturamento_total
])

clustering = DBSCAN(metric='euclidean', eps=0.3)
clustering.fit(X)

metrics.silhouette_score(X, clustering.labels_, metric='euclidean')

pca = PCA(n_components=2)
reduced = pca.fit_transform(X)
plt.scatter(reduced[:, 0], reduced[:, 1], c=clustering.labels_)

sazon_set = lojas_df[[
    'encrypted_5_zipcode', 'periodo_0', 'periodo_1', 'periodo_2', 'periodo_3',
    'periodo_4'
]].dropna()

sazon_set = (sazon_set - sazon_set.min()) / (sazon_set.max() - sazon_set.min())

X = np.column_stack([
    sazon_set.encrypted_5_zipcode, sazon_set.periodo_0, sazon_set.periodo_1,
    sazon_set.periodo_2, sazon_set.periodo_3, sazon_set.periodo_4
""" from sklearn.datasets import make_blobs from sklearn.cluster import DBSCAN from matplotlib import pyplot as plt from sklearn.decomposition import PCA import numpy as np X, ytrue = make_blobs(n_samples=1000, n_features=2, cluster_std=0.01, random_state=0) # plt.scatter(X[:, 0], X[:, 1], c = y) miModelo = DBSCAN(eps=0.5, min_samples=5) miModelo.fit(X) clusters = miModelo.labels_ plt.figure() plt.subplot(1, 2, 1) plt.scatter(X[:, 0], X[:, 1], c=ytrue, s=150) plt.title('Color = ytrue') plt.subplot(1, 2, 2) plt.scatter(X[:, 0], X[:, 1], c=clusters, s=150) plt.title('Color = clusters') from sklearn.metrics import silhouette_score sc = silhouette_score(X, clusters)
cv.drawContours(img_color, [cnt], 0, (0, 0, 0), 2)

cv.imshow("result", img_color)
cv.waitKey(0)

x = np.array(contours2)
contours22 = np.vstack(contours2).squeeze()
# print(contours22)

height = img_color.shape[0]
width = img_color.shape[1]
channels = img_color.shape[2]
radius = ((height / 8) + (width / 8)) / 2

df = pd.DataFrame(contours22)
model = DBSCAN(eps=radius, min_samples=3)
# fit_predict fits and labels in one call, so a separate fit() is redundant
y_predict = model.fit_predict(df)
print(y_predict)

df[2] = y_predict
# print(df)
df = df[df[2] != -1]  # drop noise points
del df[2]
print(df)

tuples = [tuple(x) for x in df.values]
# print(tuples)

for cnt in tuples:
    cv.circle(img_color, cnt, 0, (255, 255, 0), 2)

cv.imshow("result2", img_color)
def process_dbscan_jaccard(self):
    dbscan = DBSCAN(algorithm='ball_tree', metric='haversine')
    dbscan.fit(self.sparse_matrix)

    # Get the shape of the sparse matrix
    row, col = self.sparse_matrix.get_shape()

    # Calculate average point position for each cluster
    cluster = {}
    cluster_amount = {}
    for i in range(0, row):
        if dbscan.labels_[i] != -1:
            if dbscan.labels_[i] in cluster_amount:
                cluster_amount[dbscan.labels_[i]] += 1
            else:
                cluster_amount[dbscan.labels_[i]] = 1
            for j in range(0, col):
                if self.sparse_matrix[i, j] == 1:
                    if dbscan.labels_[i] in cluster:
                        cluster[dbscan.labels_[i]][j] += 1
                    else:
                        cluster[dbscan.labels_[i]] = numpy.zeros(col)
                        cluster[dbscan.labels_[i]][j] = 1
    for key in cluster:
        cluster[key] = cluster[key] / cluster_amount[key]

    minimum_distance = {}
    for key in cluster:
        minimum_distance[key] = 999

    # Find the nearest row (by euclidean distance) to the average point
    # position, used as the centroid for each cluster
    array_row = numpy.zeros(col)
    centroid = {}
    for i in range(0, row):
        if dbscan.labels_[i] != -1:
            for j in range(0, col):
                if self.sparse_matrix[i, j] == 1:
                    array_row[j] = 1
            eu_dist = distance.euclidean(array_row, cluster[dbscan.labels_[i]])
            if eu_dist < minimum_distance[dbscan.labels_[i]]:
                minimum_distance[dbscan.labels_[i]] = eu_dist
                centroid[dbscan.labels_[i]] = i
            array_row = numpy.zeros(col)

    centroid_matrix = numpy.zeros((len(centroid), col))
    for i in range(0, len(centroid_matrix)):
        for j in range(0, col):
            centroid_matrix[i][j] = self.sparse_matrix[centroid[i], j]

    # Find overall jaccard score by centroid
    array_row = numpy.zeros(col)
    overall_distance = 0
    for i in range(0, row):
        if dbscan.labels_[i] != -1:
            for j in range(0, col):
                if self.sparse_matrix[i, j] == 1:
                    array_row[j] = 1
            ja_distance = jaccard_score(centroid_matrix[dbscan.labels_[i]], array_row)
            overall_distance = overall_distance + ja_distance
            array_row = numpy.zeros(col)

    self.fout.write("DBSCAN overall distance:")
    self.fout.write("\n")
    self.fout.write(str(overall_distance))
    self.fout.write("\n")
    self.fout.flush()

    return dbscan.labels_
def cluster(num_samples, num_clusters):
    x = 3.3 * np.random.randn(num_samples, 2)
    X = StandardScaler().fit_transform(x)

    flag = False
    if flag:
        t0 = time.time()
        km = MiniBatchKMeans(init='k-means++', n_clusters=num_clusters,
                             batch_size=3 * num_clusters, max_no_improvement=10,
                             verbose=0, max_iter=100, random_state=0)
        km.fit(X)
        t1 = time.time()
        print('kmeans time taken : ', t1 - t0)

    flag = True
    if flag:
        t0 = time.time()
        bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=10000)
        # bandwidth = 0.5
        print(bandwidth)
        print('bandwidth estimation time : ', time.time() - t0)
        ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, min_bin_freq=100,
                       n_jobs=-1)
        ms.fit(X)
        t1 = time.time()
        labels = ms.labels_
        cluster_centers = ms.cluster_centers_
        labels_unique = np.unique(labels)
        n_clusters_ = len(labels_unique)
        print("number of estimated clusters : %d" % n_clusters_)
        print('meanshift time taken : ', t1 - t0)

    flag = False
    if flag:
        x = 3.3 * np.random.randn(num_samples, 2)
        X = StandardScaler().fit_transform(x)
        t0 = time.time()
        db = DBSCAN(eps=0.3, min_samples=100)
        db.fit(X)
        t1 = time.time()
        n_clusters_ = len(set(db.labels_)) - (1 if -1 in db.labels_ else 0)
        print('number of clusters:', n_clusters_)
        print('DBSCAN time taken : ', t1 - t0)

    flag = False
    if flag:
        t0 = time.time()
        bm = Birch(threshold=0.3, n_clusters=None)
        bm.fit(X)
        t1 = time.time()
        print('number of clusters:', np.unique(bm.labels_).size)
        print('Birch time taken : ', t1 - t0)
########################################################################################################################
eps = 0.3
ms = 20
'''
eps: DBSCAN parameter, the distance threshold of the eps-neighbourhood; sample
points farther than eps from a point are not in its eps-neighbourhood.
The default is 0.5. A suitable threshold usually has to be chosen from several
candidate values. If eps is too large, more points fall inside the
eps-neighbourhoods of core objects, the number of clusters may shrink, and
samples that should not belong together get merged into one cluster. If eps is
too small, the number of clusters may grow and samples of the same class get
split apart.

min_samples: DBSCAN parameter, the minimum number of samples required in a
point's eps-neighbourhood for it to become a core object. The default is 5.
It usually has to be chosen from several candidate values and is tuned
together with eps. For a fixed eps, a min_samples that is too large yields too
few core objects, so samples inside a cluster may be marked as noise and the
number of clusters grows. Conversely, a min_samples that is too small produces
a large number of core objects, which may lead to too few clusters.
'''
dbscan = DBSCAN(eps=eps, min_samples=ms)
dbscan.fit(output3_value_embedded)
label_pred = dbscan.labels_

# while label_pred.max() != 4:
#     print(label_pred.max())
#     if label_pred.max() > 4:
#         eps = eps * 1.05
#         ms = ms + 1
#         print(eps, ms)
#         dbscan = DBSCAN(eps=eps, min_samples=ms)
#         dbscan.fit(output3_value_embedded)
#         label_pred = dbscan.labels_
#     if label_pred.max() < 4:
#         eps = eps * 0.95
#         if ms > 2:
#             ms = ms - 1
#         print(eps, ms)
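# The comment above amounts to tuning eps and min_samples jointly. A small
# grid-search sketch that counts clusters and noise points for each pair;
# the blob data stands in for output3_value_embedded and the ranges are
# illustrative.
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs

X_demo, _ = make_blobs(n_samples=500, centers=5, random_state=0)

for eps_i in (0.2, 0.3, 0.5, 0.8):
    for ms_i in (5, 10, 20):
        labels = DBSCAN(eps=eps_i, min_samples=ms_i).fit_predict(X_demo)
        n_clusters = labels.max() + 1
        n_noise = int(np.sum(labels == -1))
        print(f"eps={eps_i:.1f} min_samples={ms_i:2d} -> "
              f"{n_clusters} clusters, {n_noise} noise points")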
from sklearn.preprocessing import scale
from pandas.plotting import scatter_matrix  # pandas.tools.plotting is long deprecated

dim_reduction = PCA()
Xc = dim_reduction.fit_transform(scale(X))
# explained_variance_ratio_ is sorted in descending order, so the first two
# components are at the start of the array
print('variance explained by first 2 components %0.1f' %
      (sum(dim_reduction.explained_variance_ratio_[:2] * 100)))

df = pd.DataFrame(Xc, columns=['comp_' + str(j + 1) for j in range(10)])
first_two = df.plot(kind='scatter', x='comp_1', y='comp_2', c='DarkGray', s=50)
last_two = df.plot(kind='scatter', x='comp_9', y='comp_10', c='DarkGray', s=50)

outlying = (Xc[:, -1] < -0.3) | (Xc[:, -2] < -1.0)
print(df[outlying])  # Print outliers found out by PCA

# **************************Using Cluster Analysis***************************
from sklearn.cluster import DBSCAN
# note: DBSCAN no longer accepts a random_state parameter
DB = DBSCAN(eps=2.5, min_samples=25)
DB.fit(Xc)

from collections import Counter
print(Counter(DB.labels_), '\n')
print(df[DB.labels_ == -1])
# example output: Counter({0: 414, -1: 28})

# *************************Using OneClassSVM******************************
from sklearn import svm
outliers_fraction = 0.01
nu_estimate = 0.95 * outliers_fraction + 0.05
auto_detection = svm.OneClassSVM(kernel="rbf", gamma=0.01, degree=3, nu=nu_estimate)
auto_detection.fit(Xc)
evaluation = auto_detection.predict(Xc)
print(df[evaluation == -1])
class DataCleaner(): """ Class for outlier detection and data cleaning in preprocessing Implements sklearn.cluster.DBSCAN for compositional clustering, and sklearn.ensemble.IsolationForest for greedy outlier flagging. Applies z-score threshold within composition clusters to screen IsolationForest flags. Parameters ---------- data: dataset to process (pandas DataFrame) prop_dim: property dimension to screen for outliers comp_dims: composition dimensions for clustering and IsolationForest add_fit_dims: additional dimensions to use for outlier identification cluster_by: column to group by for clustering. If None, use DBSCAN to identify clusters DB_kw: kwargs to pass to DBSCAN instantiation IF_kw: kwargs to pass to IsolationForest instantiation """ def __init__(self, data, prop_dim, comp_dims=None, add_fit_dims=[], cluster_by=None, DB_kw={}, IF_kw={}): self.data = data self.set_prop_dim(prop_dim) self.set_comp_dims(comp_dims) self.add_fit_dims = add_fit_dims self.cluster_by = cluster_by self.random_state = np.random.RandomState(17) self.db = DBSCAN(**DB_kw) self.clf = IsolationForest(random_state=self.random_state, **IF_kw) def set_prop_dim(self, prop_dim): "set property dimension" self._prop_dim = prop_dim def get_prop_dim(self): "get property dimension" return self._prop_dim prop_dim = property(get_prop_dim, set_prop_dim) def set_comp_dims(self, comp_dims=None): """ set composition dimensions used for clustering. Defaults to all valid elemental symbols in data columns Parameters ---------- comp_dims: list of columns in data to use as composition dimensions """ #if no comp dims specified, use all columns that are valid element symbols if comp_dims == None: comp_dims = [] for col in self.data.columns: try: mg.Element(col) comp_dims.append(col) except ValueError: pass self._comp_dims = comp_dims def get_comp_dims(self): "get composition dimensions" return self._comp_dims comp_dims = property(get_comp_dims, set_comp_dims) @property def comp_data(self): "composition data" return self.data[self.comp_dims] @property def fit_dims(self): "dimensions used for identifying outliers" return self.comp_dims + self.add_fit_dims + [self.prop_dim] def fit_data(self, comp_scale=1, prop_scale=1, add_fit_scale=1, cluster_by=None): "data used for identifying outliers" fit_data = self.data.copy() fit_data[self.comp_dims] = self.scaled_comp_data( scale=comp_scale, cluster_by=cluster_by).values ss = StandardScaler() if cluster_by is None: fit_data[self.prop_dim] = prop_scale * ss.fit_transform( fit_data[self.prop_dim].values[:, None]) if len(self.add_fit_dims) > 0: fit_data[self.add_fit_dims] = add_fit_scale * ss.fit_transform( fit_data[self.add_fit_dims]) else: # scale within each cluster gdf = fit_data.groupby(cluster_by) for cluster, idx in gdf.groups.items(): cdata = fit_data.loc[idx, :] fit_data.loc[idx, self.prop_dim] = prop_scale * ss.fit_transform( cdata[self.prop_dim].values[:, None]) if len(self.add_fit_dims) > 0: fit_data.loc[ idx, self.add_fit_dims] = add_fit_scale * ss.fit_transform( cdata[self.add_fit_dims]) return fit_data[self.fit_dims] def scaled_comp_data(self, scale=1, cluster_by=None): """ scale composition dimensions such that largest-variance dimension has variance max_var """ ss = StandardScaler() if cluster_by is None: #get dimension with largest variance ref_dim = np.var(self.comp_data).idxmax() ss.fit(self.comp_data[ref_dim].values[:, None]) #scale all comp dims with same scaler such that refdim has variance max_var scaled_comp_data = pd.DataFrame(scale * ss.transform(self.comp_data), 
columns=self.comp_dims) else: # scale within each cluster gdf = self.data.groupby(cluster_by) scaled_comp_data = self.comp_data.copy() for cluster, idx in gdf.groups.items(): cdata = self.comp_data.loc[idx, :] #get dimension with largest variance ref_dim = np.var(cdata).idxmax() ss.fit(cdata[ref_dim].values[:, None]) #scale all comp dims with same scaler such that refdim has variance max_var scaled_comp_data.loc[idx, :] = scale * ss.transform(cdata) return scaled_comp_data def fit(self, method, comp_scale=1, prop_scale=1): """ fit DBSCAN and IsolationForest to data Parameters ---------- comp_scale: maximum compositional variance set by scale_composition """ if method == 'DBIFZ': if self.cluster_by is None: # fit DBSCAN to comp data for compositional clustering self.db.fit(self.scaled_comp_data(scale=comp_scale)) # fit IsolationForest to iso data for greedy outlier flagging self.clf.fit(self.fit_data(comp_scale, prop_scale)) elif method == 'DBSCAN': # nothing to do yet pass def predict(self, method, comp_scale=1, prop_scale=1, add_fit_scale=1, z_thresh=2): """ predict outliers in data Parameters ---------- z_thresh: z-score threshold for intra-cluster outlier identification """ self.pred = pd.DataFrame() self.pred[self.prop_dim] = self.data[self.prop_dim] self.z_thresh = z_thresh if self.cluster_by is not None: # use provided column to group into clusters self.pred.loc[:, 'cluster_name'] = self.data[self.cluster_by] cluster_names = self.pred['cluster_name'].unique() clusters = np.arange(len(cluster_names)) cluster_dict = dict(zip(cluster_names, clusters)) self.pred['cluster'] = self.pred['cluster_name'].map( lambda x: cluster_dict[x]) self.cluster_name = dict(zip(clusters, cluster_names)) if method == 'DBIFZ': if self.cluster_by is None: # use DBSCAN to cluster by composition self.pred.loc[:, 'cluster'] = self.db.fit_predict( self.scaled_comp_data( comp_scale)) #db has no pure predict function self.pred['cluster_name'] = self.pred['cluster'] clusters = self.pred['cluster'].unique() self.cluster_name = dict(zip(clusters, clusters)) fit_data = self.fit_data(comp_scale, prop_scale, add_fit_scale) self.pred.loc[:, 'isolation_flag'] = self.clf.predict(fit_data) self.pred.loc[:, 'isolation_score'] = self.clf.decision_function( fit_data) #get z-scores for each cluster and cross-ref with isolation forest for i, cluster in enumerate(self.pred['cluster'].unique()): df = self.pred.loc[self.pred['cluster'] == cluster, :] self.pred.loc[self.pred['cluster'] == cluster, 'cluster_zscore'] = z_score(df[self.prop_dim]) #set final outlier flag - if flagged by isolation forest and cluster z-score is outside z_thresh self.pred.loc[:, 'outlier_flag'] = np.where( (self.pred['isolation_flag'] == -1) & (np.abs(self.pred['cluster_zscore']) > z_thresh), -1, 0) elif method == 'DBSCAN': if self.cluster_by is None: # apply DBSCAN to comp dims and prop dim to cluster and identify outliers fit_data = self.fit_data(comp_scale, prop_scale, add_fit_scale) self.pred.loc[:, 'cluster'] = self.db.fit_predict( fit_data) #db has no pure predict function self.pred['cluster_name'] = self.pred['cluster'] clusters = self.pred['cluster'].unique() self.cluster_name = dict(zip(clusters, clusters)) # cluster -1 is outliers self.pred['outlier_flag'] = self.pred['cluster'].map( lambda x: -1 if x == -1 else 0) else: # apply DBSCAN within each provided cluster to identify outliers fit_data = self.fit_data(comp_scale, prop_scale, add_fit_scale, cluster_by=self.cluster_by) for cluster, idx in self.data.groupby( self.cluster_by).groups.items(): 
cdata = fit_data.loc[idx, :] self.pred.loc[idx, 'DB_cluster'] = self.db.fit_predict(cdata) # cluster -1 is outliers self.pred['outlier_flag'] = self.pred['DB_cluster'].map( lambda x: -1 if x == -1 else 0) #get z-scores for each cluster for i, cluster in enumerate(self.pred['cluster'].unique()): df = self.pred.loc[self.pred['cluster'] == cluster, :] self.pred.loc[self.pred['cluster'] == cluster, 'cluster_zscore'] = z_score(df[self.prop_dim]) # set IF columns for compatibility self.pred.loc[:, 'isolation_flag'] = 1 self.pred.loc[:, 'isolation_score'] = 0 #include scaled fit_data in pred for col in fit_data.columns: self.pred[f'{col}_fit'] = fit_data[col] #return self.pred def fit_predict(self, method, comp_scale=1, prop_scale=1, add_fit_scale=1, z_thresh=2): """combine fit and predict functions""" self.fit(method, comp_scale, prop_scale) self.predict(method, comp_scale, prop_scale, add_fit_scale, z_thresh) #return self.pred def remove_outliers(self): """remove outliers identified by fit_predict""" self.clean_data = self.data[self.pred['outlier_flag'] != -1] #return self.clean_data @property def data_pred(self): "data joined with prediction results" return self.data.join(self.pred.drop(labels=self.prop_dim, axis=1)) @property def outliers(self): "outlier data rows" return self.data_pred[self.data_pred['outlier_flag'] == -1] @property def inliers(self): "inlier data rows" return self.data_pred[self.data_pred['outlier_flag'] != -1] def set_DB_params(self, **params): """set DBSCAN parameters""" self.db.set_params(**params) def set_IF_params(self, **params): """set IsolationForest parameters""" self.clf.set_params(**params) def scatter_slices(self, slice_axis, slice_starts, slice_widths, tern_axes, color_col=None, vmin=None, vmax=None, cmap=plt.cm.viridis, data_filter=None, **scatter_kwargs): if color_col is None: color_col = self.prop_dim if data_filter is not None: data = data_filter(self.data_pred) else: data = self.data_pred #get vmin and vmax if vmin is None: vmin = data[color_col].min() if vmax is None: vmax = data[color_col].max() #plot all axes = scatter_slices(data, color_col, slice_axis, slice_starts, slice_widths, tern_axes, cmap=cmap, vmin=vmin, vmax=vmax, **scatter_kwargs) def scatter_slice_highlight(self, slice_axis, slice_starts, slice_widths, tern_axes, color_col=None, vmin=None, vmax=None, cmap=plt.cm.viridis, data_filter=None, **scatter_kwargs): """ plot all data points with outliers highlighted in red. color determined by value of prop_dim Parameters ---------- slice_axis: composition dimension on which to slice slice_starts: values of slice_axis at which to start slices slice_widths: widths of slices in slice_axis dimension. 
Single value or list tern_axes: composition dimensions for ternary plot axes (order: right, top, left) cmap: colormap for prop_dim values scatter_kwargs: kwargs to pass to helpers.plotting.scatter_slices """ if color_col is None: color_col = self.prop_dim if data_filter is not None: data = data_filter(self.data_pred) else: data = self.data_pred #get vmin and vmax if vmin is None: vmin = data[color_col].min() if vmax is None: vmax = data[color_col].max() inliers = data[data['outlier_flag'] == 0] outliers = data[data['outlier_flag'] == -1] #plot inliers axes = scatter_slices(inliers, color_col, slice_axis, slice_starts, slice_widths, tern_axes, cmap=cmap, vmin=vmin, vmax=vmax, **scatter_kwargs) #plot outliers scatter_slices(outliers, color_col, slice_axis, slice_starts, slice_widths, tern_axes, cmap=cmap, axes=axes, vmin=vmin, vmax=vmax, colorbar=False, s=20, marker='d', edgecolors='r', linewidths=0.8, **scatter_kwargs) def scatter_slice_clusters(self, slice_axis, slice_starts, slice_widths, tern_axes, cmap=plt.cm.plasma, **scatter_kwargs): """ plot all data points with cluster shown by color Parameters ---------- slice_axis: composition dimension on which to slice slice_starts: values of slice_axis at which to start slices slice_widths: widths of slices in slice_axis dimension. Single value or list tern_axes: composition dimensions for ternary plot axes (order: right, top, left) cmap: colormap for cluster values scatter_kwargs: kwargs to pass to helpers.plotting.scatter_slices """ #make norm for discrete colormap clusters = list(self.cluster_name.keys()) cluster_names = list(self.cluster_name.values()) n_clusters = len(self.pred['cluster'].unique()) bounds = np.arange(min(clusters) - 0.5, max(clusters) + 0.51) norm = mpl.colors.BoundaryNorm(bounds, cmap.N) scatter_slices(self.data_pred, 'cluster', slice_axis, slice_starts, slice_widths, tern_axes, cmap=cmap, norm=norm, cb_kwargs={ 'norm': norm, 'ticks': clusters, 'tickformat': '%.0f', 'ticklabels': cluster_names }, **scatter_kwargs) def scatter_slice_outliers(self, slice_axis, slice_starts, slice_widths, tern_axes, cmap=plt.cm.viridis, **scatter_kwargs): """ plot outliers only Parameters ---------- slice_axis: composition dimension on which to slice slice_starts: values of slice_axis at which to start slices slice_widths: widths of slices in slice_axis dimension. Single value or list tern_axes: composition dimensions for ternary plot axes (order: right, top, left) cmap: colormap for prop_dim values scatter_kwargs: kwargs to pass to helpers.plotting.scatter_slices """ axes = scatter_slices(self.outliers, self.prop_dim, slice_axis, slice_starts, slice_widths, tern_axes, cmap=cmap, **scatter_kwargs) return axes def scatter_slice_inliers(self, slice_axis, slice_starts, slice_widths, tern_axes, cmap=plt.cm.viridis, **scatter_kwargs): """ plot inliers only Parameters ---------- slice_axis: composition dimension on which to slice slice_starts: values of slice_axis at which to start slices slice_widths: widths of slices in slice_axis dimension. 
Single value or list tern_axes: composition dimensions for ternary plot axes (order: right, top, left) cmap: colormap for prop_dim values scatter_kwargs: kwargs to pass to helpers.plotting.scatter_slices """ axes = scatter_slices(self.inliers, self.prop_dim, slice_axis, slice_starts, slice_widths, tern_axes, cmap=cmap, **scatter_kwargs) return axes def cluster_hist(self, ncols=2, cluster_by=None): if cluster_by is None: clusters = list(self.cluster_name.keys()) else: gdf = self.data_pred.groupby(cluster_by) clusters = [k for k in gdf.groups.keys()] #print(clusters) nrows = int(np.ceil(len(clusters) / ncols)) #print(nrows) fig, axes = plt.subplots(nrows, ncols, figsize=(ncols * 4, nrows * 3)) for (i, cluster), ax in zip(enumerate(clusters), axes.ravel()): if cluster_by is None: df = self.data_pred.loc[self.data_pred['cluster'] == cluster, :] else: idx = gdf.groups[cluster] df = self.data_pred.loc[idx, :] num_outliers = len(df[df['isolation_flag'] == -1]) # try: #2d axes # ax = axes[int(i/ncols), i%ncols] # except IndexError: #1d axes # ax = axes[i] dfo = df[df['isolation_flag'] == -1] dfi = df[df['isolation_flag'] == 1] hist, bins = np.histogram(df['cluster_zscore']) if len(dfo) > 0: # if isolation forest outliers exist (method=DBIFZ) ax.hist([dfo['cluster_zscore'], dfi['cluster_zscore']], alpha=0.8, bins=bins, histtype='barstacked', label=[ 'IsolationForest outliers', 'IsolationForest inliers' ], color=['#ff7f0e', '#1f77b4']) ax.legend() else: # if no isolation forest outliers (method=DBSCAN) dfo = df[df['outlier_flag'] == -1] dfi = df[df['outlier_flag'] == 0] ax.hist([dfo['cluster_zscore'], dfi['cluster_zscore']], alpha=0.8, bins=bins, histtype='barstacked', label=['Outliers', 'Inliers'], color=['#ff7f0e', '#1f77b4']) ax.legend() if cluster_by is None: ax.set_title('Cluster {}'.format(self.cluster_name[cluster])) else: ax.set_title('Cluster {}'.format(cluster)) ax.set_xlabel('Cluster Z-score') ax.set_ylabel('Frequency') #plot z-score threshold ax.axvline(-self.z_thresh, ls='--', c='r') ax.axvline(self.z_thresh, ls='--', c='r') # add second axis to show prop_dim values ax2 = ax.twiny() ax2.set_xlim(ax.get_xlim()) ax2.set_xticks(ax.get_xticks()) tick_vals = df[self.prop_dim].mean( ) + df[self.prop_dim].std() * ax.get_xticks() ax2.set_xticklabels(np.round(tick_vals, 1)) ax2.set_xlabel(self.prop_dim) fig.tight_layout() def cluster_scatter(self, x_col, y_col, plot_combined=False, cluster_by=None, flag_outliers=False, ncols=2, s=8, data_filter=None, sharex=False, sharey=False, basefontsize=11, **scatter_kw): """ Scatter plot for each cluster. Args: x_col: x column y_col: y column plot_combined: if True, create an additional plot with all samples overlaid cluster_by: column to use for grouping. If None, use clusters assigned by fit_predict flag_outliers: if True, plot outliers in orange ncols: number of columns for subplot grid s: point size data_filter: function to filter data. Should apply to DataFrame and return filtered DataFrame. 
Ex: data_filter = lambda df: df[df['property']==value] sharex, sharey: kwargs for plt.subplots() scatter_kw: kw for plt.scatter() """ if data_filter is None: data = self.data_pred else: data = data_filter(self.data_pred) if cluster_by is None: clusters = list(self.cluster_name.keys()) else: gdf = data.groupby(cluster_by) clusters = [k for k in gdf.groups.keys()] if plot_combined: num_plots = len(clusters) + 1 else: num_plots = len(clusters) nrows = int(np.ceil(num_plots / ncols)) fig, axes = plt.subplots(nrows, ncols, figsize=(ncols * 4, nrows * 3), sharex=sharex, sharey=sharey) for (i, cluster), ax in zip(enumerate(clusters), axes.ravel()): if cluster_by is None: df = data.loc[self.data_pred['cluster'] == cluster, :] else: idx = gdf.groups[cluster] df = data.loc[idx, :] if flag_outliers is False: ax.scatter(df[x_col], df[y_col], s=s, **scatter_kw) else: dfi = df[df['outlier_flag'] == 0] dfo = df[df['outlier_flag'] == -1] ax.scatter(dfi[x_col], dfi[y_col], label='Inliers', s=s, **scatter_kw) ax.scatter(dfo[x_col], dfo[y_col], label='Outliers', s=s, **scatter_kw) ax.legend(fontsize=basefontsize) if cluster_by is None: ax.set_title('Cluster {}'.format(self.cluster_name[cluster]), fontsize=basefontsize + 1) else: ax.set_title('Cluster {}'.format(cluster), fontsize=basefontsize + 1) ax.set_xlabel(x_col, fontsize=basefontsize) ax.set_ylabel(y_col, fontsize=basefontsize) ax.tick_params(axis='both', which='major', labelsize=basefontsize - 1) if plot_combined: # plot all clusters on same axes if flag_outliers is False: axes.ravel()[-1].scatter(data[x_col], data[y_col], s=s, **scatter_kw) else: dfi = data[data['outlier_flag'] == 0] dfo = data[data['outlier_flag'] == -1] axes.ravel()[-1].scatter(dfi[x_col], dfi[y_col], label='Inliers', s=s, **scatter_kw) axes.ravel()[-1].scatter(dfo[x_col], dfo[y_col], label='Outliers', s=s, **scatter_kw) axes.ravel()[-1].legend(fontsize=basefontsize) axes.ravel()[-1].set_xlabel(x_col, fontsize=basefontsize) axes.ravel()[-1].set_ylabel(y_col, fontsize=basefontsize) axes.ravel()[-1].set_title('All Clusters', fontsize=basefontsize + 1) axes.ravel()[-1].tick_params(axis='both', which='major', labelsize=basefontsize - 1) for ax in axes.ravel()[num_plots:]: # turn off unused axes ax.axis('off') if sharex: for ax in axes[:-1, :]: ax.set_xlabel('') if sharey: for ax in axes[:, 1:].ravel(): ax.set_ylabel('') fig.tight_layout() def quat_plot(self, ax=None, figsize=(8, 6), quat_axes=['Co', 'Fe', 'Zr', 'Y'], label_kw={}, gridlines=True, color_col=None, colorbar=True, cb_kw={}, s=3, data_filter=None, **scatter_kw): qax = QuaternaryAxes(ax=ax, figsize=figsize) qax.draw_axes() # default corner label kwargs label_kwargs = dict(offset=0.11, size=14) # update with user kwargs label_kwargs.update(label_kw) qax.label_corners(quat_axes, **label_kwargs) if color_col is None: color_col = self.prop_dim # Default colorbar kwargs cb_kwargs = { 'label': color_col, 'cbrect': [0.8, 0.1, 0.02, 0.65], 'labelkwargs': { 'size': 14 }, 'tickparams': { 'labelsize': 13 } } # update with any user-specified kwargs cb_kwargs.update(cb_kw) if data_filter is not None: data = data_filter(self.data_pred) else: data = self.data_pred if 'vmin' not in scatter_kw.keys(): scatter_kw['vmin'] = data[color_col].min() if 'vmax' not in scatter_kw.keys(): scatter_kw['vmax'] = data[color_col].max() qax.scatter(data[quat_axes].values, c=data[color_col], s=s, colorbar=colorbar, cb_kwargs=cb_kwargs, **scatter_kw) qax.axes_ticks(size=13, corners='rbt', offset=0.08) if gridlines == True: qax.gridlines(ls=':', LW=0.6) 
qax.ax.axis('off') return qax def quat_highlight(self, ax=None, figsize=(8, 6), quat_axes=['Co', 'Fe', 'Zr', 'Y'], label_kw={}, gridlines=True, color_col=None, cb_label=None, data_filter=None, **scatter_kw): qax = QuaternaryAxes(ax=ax, figsize=figsize) qax.draw_axes() # default corner label kwargs label_kwargs = dict(offset=0.11, size=14) # update with user kwargs label_kwargs.update(label_kw) qax.label_corners(quat_axes, **label_kwargs) if color_col is None: color_col = self.prop_dim if cb_label is None: cb_label = color_col if data_filter is not None: data = data_filter(self.data_pred) else: data = self.data_pred if 'vmin' not in scatter_kw.keys(): scatter_kw['vmin'] = data[color_col].min() if 'vmax' not in scatter_kw.keys(): scatter_kw['vmax'] = data[color_col].max() inliers = data[data['outlier_flag'] == 0] outliers = data[data['outlier_flag'] == -1] qax.scatter(inliers[quat_axes].values, c=inliers[color_col], s=3, colorbar=True, cb_kwargs={ 'label': cb_label, 'cbrect': [0.8, 0.1, 0.02, 0.65], 'labelkwargs': { 'size': 14 }, 'tickparams': { 'labelsize': 13 } }, **scatter_kw) qax.scatter(outliers[quat_axes].values, c=outliers[color_col], s=6, edgecolors='r', linewidths=0.5, **scatter_kw) qax.axes_ticks() if gridlines == True: qax.gridlines() qax.ax.axis('off') return qax def quat_clusters(self, ax=None, figsize=(8, 6), quat_axes=['Co', 'Fe', 'Zr', 'Y'], label_kw={}, gridlines=True, cmap=plt.cm.plasma, s=3, colorbar=True, cb_kw={}, **scatter_kw): qax = QuaternaryAxes(ax=ax, figsize=figsize) qax.draw_axes() # default corner label kwargs label_kwargs = dict(offset=0.11, size=14) # update with user kwargs label_kwargs.update(label_kw) qax.label_corners(quat_axes, **label_kwargs) vmin = self.pred['cluster'].min() vmax = self.pred['cluster'].max() #make norm for discrete colormap clusters = list( self.cluster_name.keys()) #pred['cluster'].unique().astype(int) cluster_names = list(self.cluster_name.values()) n_clusters = len(clusters) bounds = np.arange(min(clusters) - 0.5, max(clusters) + 0.51) norm = mpl.colors.BoundaryNorm(bounds, cmap.N) # Default colorbar kwargs if self.cluster_by is None: cb_label = 'Cluster' else: cb_label = self.cluster_by cb_kwargs = { 'label': cb_label, 'norm': norm, 'ticks': clusters, 'ticklabels': cluster_names, 'cbrect': [0.8, 0.1, 0.02, 0.65], 'labelkwargs': { 'size': 14 }, 'tickparams': { 'labelsize': 13 } } # update with any user-specified kwargs cb_kwargs.update(cb_kw) qax.scatter(self.data[quat_axes].values, c=self.pred['cluster'], s=s, cmap=cmap, norm=norm, vmin=vmin, vmax=vmax, colorbar=colorbar, cb_kwargs=cb_kwargs, **scatter_kw) qax.axes_ticks(size=13, corners='rbt', offset=0.08) if gridlines == True: qax.gridlines(ls=':', LW=0.6) qax.ax.axis('off') return qax def reduce_comp_dims(self, kernel='poly', gamma=10, **kpca_kw): comp_dims = self.comp_dims.copy() if 'O' in comp_dims: comp_dims.remove('O') if 'Ba' in comp_dims: comp_dims.remove('Ba') print('Dimensions for KPCA reduction:', comp_dims) self.kpca_dims = comp_dims #self.reconstructed = self.data.copy() # reconstructed dims rc_dims = [f'{d}_kpca' for d in comp_dims] self.kpca = KernelPCA(kernel=kernel, n_components=2, fit_inverse_transform=True, gamma=gamma, **kpca_kw) # self.reduced = self.data_pred.copy() # write reduced dimensions to pred (can't write to data_pred - it is basically just a SQL view) self.pred['v1'] = 0 self.pred['v2'] = 0 self.pred[['v1', 'v2']] = self.kpca.fit_transform(self.data[comp_dims]) self.pred[rc_dims] = pd.DataFrame(self.kpca.inverse_transform( self.pred[['v1', 'v2']]), 
index=self.pred.index)
    # self.reduced[self.prop_dim] = self.data[self.prop_dim].values
    # self.reduced['outlier_flag'] = self.pred['outlier_flag'].values
    # self.reduced['cluster'] = self.pred['cluster'].values
    error = np.linalg.norm(self.data[comp_dims].values - self.pred[rc_dims].values,
                           ord=2)
    print('Reconstruction error:', error)
    # return self.reduced, error

def quat_reconstruction(self, ax=None, figsize=(8, 6), gridlines=True,
                        color_col=None, cb_label=None, s=3, data_filter=None,
                        **scatter_kw):
    """ Plot reconstructed composition data """
    rc_dims = [f'{d}_kpca' for d in self.kpca_dims]
    # pass arguments by keyword: quat_plot() takes label_kw, colorbar and
    # cb_kw between these parameters, so positional passing misaligns them
    cb_kw = {} if cb_label is None else {'label': cb_label}
    self.quat_plot(ax=ax, figsize=figsize, quat_axes=rc_dims,
                   gridlines=gridlines, color_col=color_col, cb_kw=cb_kw,
                   s=s, data_filter=data_filter, **scatter_kw)

def reduced_plot(self, ax=None, cmap=plt.cm.viridis, vmin=None, vmax=None,
                 cbar=True, cbrect=[0.88, 0.12, 0.02, 0.75], **kwargs):
    """
    scatter plot of prop_dim in reduced-dimension composition space

    Args:
    -----
    kwargs: kwargs to pass to plt.scatter
    """
    if ax is None:
        fig, ax = plt.subplots()
    else:
        fig = plt.gcf()
    if vmin is None:
        vmin = self.reduced[self.prop_dim].min()
    if vmax is None:
        vmax = self.reduced[self.prop_dim].max()
    ax.scatter(self.reduced['v1'], self.reduced['v2'],
               c=self.reduced[self.prop_dim], cmap=cmap, vmin=vmin, vmax=vmax,
               **kwargs)
    if cbar == True:
        add_colorbar(fig=fig, ax=ax, cmap=cmap, label=self.prop_dim,
                     vmin=vmin, vmax=vmax,
                     subplots_adjust=dict(left=0.1, right=0.8), cbrect=cbrect)
    ax.set_xlabel('$v_1$')
    ax.set_ylabel('$v_2$')

def reduced_highlight_plot(self, ax=None, cmap=plt.cm.viridis, vmin=None,
                           vmax=None, s=8, cbar=True,
                           cbrect=[0.88, 0.12, 0.02, 0.75], **kwargs):
    """
    scatter plot of prop_dim in reduced-dimension composition space with
    outliers highlighted in red

    Args:
    -----
    ax: axis on which to plot. if None, create new axis
    cmap: colormap
    vmin: vmin for colormap
    vmax: vmax for colormap
    s: marker size
    cbar: if True, create a colorbar
    cbrect: colorbar rectangle: [left, bottom, width, height]
    kwargs: kwargs to pass to plt.scatter
    """
    outliers = self.reduced.loc[self.reduced['outlier_flag'] == -1, :]
    inliers = self.reduced.loc[self.reduced['outlier_flag'] != -1, :]
    if ax is None:
        fig, ax = plt.subplots()
    else:
        fig = plt.gcf()
    if vmin is None:
        vmin = self.reduced[self.prop_dim].min()
    if vmax is None:
        vmax = self.reduced[self.prop_dim].max()
    ax.scatter(inliers['v1'], inliers['v2'], c=inliers[self.prop_dim],
               cmap=cmap, vmin=vmin, vmax=vmax, s=s, **kwargs)
    ax.scatter(outliers['v1'], outliers['v2'], c=outliers[self.prop_dim],
               cmap=cmap, vmin=vmin, vmax=vmax, s=s * 2, edgecolors='r',
               linewidths=0.7, **kwargs)
    if cbar == True:
        add_colorbar(fig=fig, ax=ax, cmap=cmap, label=self.prop_dim,
                     vmin=vmin, vmax=vmax,
                     subplots_adjust=dict(left=0.1, right=0.8), cbrect=cbrect)
    ax.set_xlabel('$v_1$')
    ax.set_ylabel('$v_2$')
    return ax

def reduced_inlier_plot(self, ax=None, cmap=plt.cm.viridis, vmin=None,
                        vmax=None, cbar=True,
                        cbrect=[0.88, 0.12, 0.02, 0.75], **kwargs):
    inliers = self.reduced.loc[self.reduced['outlier_flag'] != -1, :]
    if ax is None:
        fig, ax = plt.subplots()
    else:
        fig = plt.gcf()
    if vmin is None:
        vmin = self.reduced[self.prop_dim].min()
    if vmax is None:
        vmax = self.reduced[self.prop_dim].max()
    ax.scatter(inliers['v1'], inliers['v2'], c=inliers[self.prop_dim],
               cmap=cmap, vmin=vmin, vmax=vmax, **kwargs)
    if cbar == True:
        add_colorbar(fig=fig, ax=ax, cmap=cmap, label=self.prop_dim,
                     vmin=vmin, vmax=vmax,
                     subplots_adjust=dict(left=0.1, right=0.8), cbrect=cbrect)
    ax.set_xlabel('$v_1$')
    ax.set_ylabel('$v_2$')
    return ax

def reduced_outlier_plot(self, ax=None, cmap=plt.cm.viridis, vmin=None, vmax=None,
cbar=True, cbrect=[0.88, 0.12, 0.02, 0.75], **kwargs): """ scatter plot of prop_dim in reduced-dimension composition space with outliers highlighted in red Args: ----- kwargs: kwargs to pass to plt.scatter """ outliers = self.reduced.loc[self.reduced['outlier_flag'] == -1, :] if ax is None: fig, ax = plt.subplots() else: fig = plt.gcf() if vmin is None: vmin = self.reduced[self.prop_dim].min() if vmax is None: vmax = self.reduced[self.prop_dim].max() ax.scatter(outliers['v1'], outliers['v2'], c=outliers[self.prop_dim], cmap=cmap, vmin=vmin, vmax=vmax, **kwargs) if cbar == True: add_colorbar(fig=fig, ax=ax, cmap=cmap, label=self.prop_dim, vmin=vmin, vmax=vmax, subplots_adjust=dict(left=0.1, right=0.8), cbrect=cbrect) ax.set_xlabel('$v_1$') ax.set_ylabel('$v_2$') return ax def reduced_cluster_plot(self, ax=None, cmap=plt.cm.plasma, cbar=True, cbrect=[0.88, 0.12, 0.02, 0.75], **kwargs): vmin = self.pred['cluster'].min() vmax = self.pred['cluster'].max() #make norm for discrete colormap clusters = list(self.cluster_name.keys()) cluster_names = list(self.cluster_name.values()) n_clusters = len(self.pred['cluster'].unique()) bounds = np.arange(min(clusters) - 0.5, max(clusters) + 0.51) norm = mpl.colors.BoundaryNorm(bounds, cmap.N) if ax is None: fig, ax = plt.subplots() else: fig = plt.gcf() ax.scatter(self.reduced['v1'], self.reduced['v2'], c=self.reduced['cluster'], cmap=cmap, norm=norm, **kwargs) ax.set_xlabel('$v_1$') ax.set_ylabel('$v_2$') if cbar == True: add_colorbar(fig=fig, ax=ax, cmap=cmap, norm=norm, label='Cluster', ticks=clusters, ticklabels=cluster_names, subplots_adjust=dict(left=0.1, right=0.8), cbrect=cbrect) return ax def cluster_sample(self): if self.cluster_by == 'sample': return self.cluster_name if 'sample' in self.data.columns: cluster_sample = {} for cluster, cdf in self.data_pred.groupby('cluster'): cluster_sample[cluster] = list(cdf['sample'].unique()) return cluster_sample else: raise Exception('Data does not contain sample column')
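# A minimal, self-contained sketch (not part of the class above) of the
# BoundaryNorm pattern used in quat_clusters() and reduced_cluster_plot():
# integer cluster labels mapped onto a discrete colorbar. The sample data
# and labels here are hypothetical stand-ins.
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)
points = rng.random((60, 2))
labels = rng.integers(0, 3, 60)  # e.g. cluster labels without noise

bounds = np.arange(labels.min() - 0.5, labels.max() + 0.51)
norm = mpl.colors.BoundaryNorm(bounds, plt.cm.plasma.N)

fig, ax = plt.subplots()
sc = ax.scatter(points[:, 0], points[:, 1], c=labels, cmap=plt.cm.plasma, norm=norm)
fig.colorbar(sc, ax=ax, ticks=np.unique(labels), label='Cluster')
plt.show()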
"""
Created on Wed Apr 1 16:36:12 2020

@author: Niloy
"""
# DBSCAN clustering

# essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# import dataset
dataset = pd.read_csv('dataset.csv', error_bad_lines=False)
X = dataset.loc[:, ['latitude1', 'longitude1']]

# import DBSCAN
from sklearn.cluster import DBSCAN
dbscan = DBSCAN(eps=5, min_samples=5)
model = dbscan.fit(X)
labels = model.labels_

from sklearn import metrics

# mark the core samples
sample_cores = np.zeros_like(labels, dtype=bool)
sample_cores[dbscan.core_sample_indices_] = True

# number of clusters, ignoring noise (-1) if present
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
print(metrics.silhouette_score(X, labels))
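# Caveat: silhouette_score treats DBSCAN's noise label (-1) as one more
# cluster, which can distort the score. A sketch of scoring only the
# non-noise points (assuming the variables from the script above):
mask = labels != -1
if len(set(labels[mask])) > 1:
    print(metrics.silhouette_score(X[mask], labels[mask]))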
# coding: utf-8

# In[1]:
import numpy as np
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt

# In[2]:
data = np.genfromtxt('kmeans.txt', delimiter=' ')

# In[3]:
model = DBSCAN(eps=1, min_samples=4)
model.fit(data)

# In[4]:
result = model.fit_predict(data)
result

# In[5]:
mark = ['or', 'ob', 'og', 'oy', 'ok', 'om']
for i, d in enumerate(data):
    plt.plot(d[0], d[1], mark[result[i]])
plt.show()

# In[ ]:
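# The mark[result[i]] lookup above breaks if DBSCAN finds more than six
# clusters, and silently maps noise (-1) to the last marker. A sketch of a
# label-count-agnostic plot, with noise drawn as black crosses:
for label in np.unique(result):
    pts = data[result == label]
    if label == -1:
        plt.scatter(pts[:, 0], pts[:, 1], c='k', marker='x')
    else:
        plt.scatter(pts[:, 0], pts[:, 1], s=10)
plt.show()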
def clusterByDbscan(dataList, epsRadius, minSamples):
    """
    DBSCAN defines a cluster as a maximal set of density-connected points: it
    carves regions of sufficiently high density into clusters, and it can find
    clusters of arbitrary shape in noisy spatial data.
    - Density: the number of points inside a circle of radius EPS centred on a point.
    - Border point: a point whose density falls below the min_samples threshold.
    - Noise point: a point that is neither a core point nor a border point,
      i.e. a point with density 1.
    DBSCAN clusters well here because it aggregates by density reachability
    within a person's assumed activity radius, but it only partitions the
    data; it does not produce centre points by itself.

    dataList: [{"id", "lat", "lng", ...}, ...]; id, lat and lng fields are required
    epsRadius: straight-line distance within a cluster (km)
    minSamples: minimum number of points
    return: {'noiseIds': [24, 25],
             'clusterSet': [{'clusterCoreIds': [20, 21, 26],
                             'clusterCenterId': 26.0, 'clusterAroundIds': []},
                            {'clusterCoreIds': [22, 23],
                             'clusterCenterId': 22.0, 'clusterAroundIds': []}]}
    """
    if epsRadius and dataList:
        df = pd.DataFrame(dataList)
        df["lat"] = df["lat"].astype(float)
        df["lng"] = df["lng"].astype(float)
        X = df[["lat", "lng"]]
        distance_matrix = squareform(pdist(X, (lambda u, v: haversine(u, v))))
        # e.g. 0.5 km (500 m) as the density radius, using great-circle
        # distance between geographic positions; min_samples=3 means a
        # cluster needs at least three people:
        # db = DBSCAN(eps=0.5, min_samples=3, metric='precomputed')
        # With metric='precomputed', the X passed to fit must be a pairwise
        # distance matrix rather than (n_samples, n_features) vectors;
        # precomputing the radius-neighborhood graph this way saves memory.
        db = DBSCAN(epsRadius, minSamples, metric='precomputed')
        db.fit(distance_matrix)              # fit the model
        y = db.fit_predict(distance_matrix)  # predict

        # mark the indices of core samples as True
        coreSamplesMask = zeros_like(db.labels_, dtype=bool)
        coreSamplesMask[db.core_sample_indices_] = True
        # print(db.core_sample_indices_)

        # per-sample cluster labels and the number of clusters;
        # label -1 marks outliers
        clusterLabels = db.labels_
        uniqueClusterLabels = set(clusterLabels)
        nClusters = len(uniqueClusterLabels) - (-1 in clusterLabels)
        # print(nClusters)

        # noise points
        clusterInfo = {}
        offset_mask = (clusterLabels == -1)
        noiseIds = df.loc[offset_mask, ["id"]].values
        clusterInfo["noiseIds"] = noiseIds.flatten().tolist()

        clusterSet = []
        for i, clusterLabel in enumerate(uniqueClusterLabels):
            # clusterIndex is a True/False array; True marks members of this cluster
            clusterData = {}
            if clusterLabel != -1:
                clusterIndex = (clusterLabels == clusterLabel)
                # centre of this cluster's core points
                clusterDf = df.loc[clusterIndex & coreSamplesMask,
                                   ["id", "lat", "lng"]]
                clusterCorePoints = df.loc[clusterIndex & coreSamplesMask,
                                           ["id"]].values
                clusterData["clusterCoreIds"] = clusterCorePoints.flatten().tolist()
                clusterData["clusterCenterId"] = calcClusterCenter(clusterDf)[0]
                # border points
                aroundPoints = df.loc[(clusterIndex & ~coreSamplesMask),
                                      ["id"]].values
                clusterData["clusterAroundIds"] = aroundPoints.flatten().tolist()
                clusterSet.append(clusterData)
        clusterInfo["clusterSet"] = clusterSet
        return clusterInfo
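# clusterByDbscan() calls haversine(u, v) and calcClusterCenter(), which are
# not defined in this snippet. A common haversine implementation returning
# kilometres (matching the epsRadius units assumed above) looks roughly like:
from math import radians, sin, cos, asin, sqrt

def haversine(u, v):
    """Great-circle distance in km between two (lat, lng) points."""
    lat1, lng1, lat2, lng2 = map(radians, [u[0], u[1], v[0], v[1]])
    a = sin((lat2 - lat1) / 2) ** 2 + \
        cos(lat1) * cos(lat2) * sin((lng2 - lng1) / 2) ** 2
    return 2 * 6371 * asin(sqrt(a))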
'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_diff_srv_rate', 'dst_host_srv_diff_host_rate', 'service_ecr_i',
    'service_private', 'service_http', 'service_eco_i', 'service_other',
    'service_ftp_data', 'service_smtp', 'service_ftp', 'service_domain_u',
    'service_telnet', 'protocol_type_icmp', 'protocol_type_tcp', 'flag_SF',
    'flag_S0', 'protocol_type_udp', 'flag_REJ', 'flag_RSTR', 'flag_SH'
]
final_pandas_df_2 = final_pandas_df_1[impcol]
# final_pandas_df.info()
final_pandas_df = final_pandas_df_2[0:10000]
target = target_1[0:10000]

dbscan = DBSCAN(eps=3, algorithm='kd_tree', min_samples=5)
dbscan.fit(final_pandas_df)
# print(dbscan.labels_)
labels = dbscan.labels_

final_pandas_df['actual_response'] = target
final_pandas_df['predictions'] = labels

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated number of clusters: %d' % n_clusters_)
# final_pandas_df.to_csv('F:/sem3/big_data/final_project/New folder/sample_test50k.csv', sep='\t')

from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)
dff = sqlCtx.createDataFrame(final_pandas_df)
print(type(dff))
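# Since the true responses are kept alongside the predictions here, the
# clustering can also be validated externally; a sketch using the adjusted
# Rand index (noise, -1, simply counts as its own group):
from sklearn import metrics
print('ARI vs. actual response:', metrics.adjusted_rand_score(target, labels))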
[1.5, 3.75], [1.75, 3.25], [2.0, 3.5], [3.0, 2.25], [3.5, 1.75], [3.75, 8.75], [3.95, 0.9], [4.0, 1.5], [2.5, 2.75], [2.25, 2.25], [2.0, 3.5], [2.75, 1.75], [4.5, 1.1], [5.0, 9.0], [8.75, 5.15], [8.0, 2.25], [8.25, 3.0], [8.5, 4.75], [8.5, 4.25], [8.25, 3.35], [7.0, 1.75], [8.0, 3.5], [6.0, 1.25], [5.5, 1.75], [5.25, 1.25], [4.9, 1.25], [5.0, 1.5], [7.5, 2.25], [7.75, 2.75], [6.75, 2.0], [6.25, 1.75], [4.5, 1.1], [3.0, 4.5], [7.0, 4.5], [5.0, 3.0], [4.0, 3.35], [6.0, 3.35], [4.25, 3.25], [5.75, 3.25], [3.5, 3.75], [6.5, 3.75], [3.25, 4.0], [6.75, 4.0], [3.75, 3.55], [6.25, 3.55], [4.75, 3.05], [5.25, 3.05], [4.5, 3.15], [5.5, 3.15], [4.0, 6.5], [4.0, 6.75], [4.0, 6.25], [3.75, 6.5], [4.25, 6.5], [4.25, 6.75], [3.75, 6.25], [6.0, 6.5], [6.0, 6.75], [6.0, 6.25], [5.75, 6.75], [5.75, 6.25], [6.25, 6.75], [6.25, 6.25], [9.5, 9.5], [2.5, 9.5], [1.0, 8.0]] data = np.asarray(X) dbscan = DBSCAN(eps=2, min_samples=10) dbscan.fit(data) pca = PCA(n_components=2).fit(data) pca_2d = pca.transform(data) cnt1 = cnt2 = cnt3 = cnt4 = cnt5 = cnt6 = cnt7 = cnt8 = cnt9 = cnt10 = cnt11 = cnt12 = 0 for i in range(0, pca_2d.shape[0]): if dbscan.labels_[i] == -1: c1 = pl.scatter(pca_2d[i, 0], pca_2d[i, 1], c='r', marker='x') cnt1 = cnt1 + 1 elif dbscan.labels_[i] == 0: c2 = pl.scatter(pca_2d[i, 0], pca_2d[i, 1], c='g', marker=(8, 2)) cnt2 = cnt2 + 1 elif dbscan.labels_[i] == 1: c3 = pl.scatter(pca_2d[i, 0], pca_2d[i, 1], c='b', marker=(8, 2)) cnt3 = cnt3 + 1 elif dbscan.labels_[i] == 2:
def do_clustering(target_csv, cluster_method): num_cluster = 24 df_data = pd.read_csv(os.path.join(CONFIG.CSV_PATH, target_csv + '.csv'), index_col=0, header=0, encoding='utf-8-sig') df_data.index.name = 'short_code' print(df_data.iloc[:100]) print(df_data.shape) start_time = time.time() if cluster_method == 0: clustering = DBSCAN(eps=0.3, min_samples=1000) clustering.fit(df_data) csv_name = 'clustered_dbscan_' + target_csv + '.csv' elif cluster_method == 1: clustering = OPTICS(min_samples=1000, metric='cosine') clustering.fit(df_data) csv_name = 'clustered_optics_' + target_csv + '.csv' elif cluster_method == 2: clustering = AgglomerativeClustering(n_clusters=num_cluster) clustering.fit(df_data) csv_name = 'clustered_ward_' + target_csv + '.csv' elif cluster_method == 3: clustering = AgglomerativeClustering(affinity='cosine', linkage='complete', n_clusters=num_cluster) clustering.fit(df_data) csv_name = 'clustered_agglo_complete_' + target_csv + '.csv' elif cluster_method == 4: clustering = AgglomerativeClustering(affinity='cosine', linkage='single', n_clusters=num_cluster) clustering.fit(df_data) csv_name = 'clustered_agglo_single_' + target_csv + '.csv' elif cluster_method == 5: clustering = Birch(n_clusters=num_cluster) clustering.fit(df_data) csv_name = 'clustered_birch_' + target_csv + '.csv' elif cluster_method == 6: clustering = KMeans(n_clusters=num_cluster) clustering.fit(df_data) csv_name = 'clustered_kmeans_' + target_csv + '.csv' elif cluster_method == 7: clustering = SpectralClustering(n_clusters=num_cluster, random_state=42, assign_labels='discretize') clustering.fit(df_data) csv_name = 'clustered_spectral_' + target_csv + '.csv' print("time elapsed for clustering: " + str(time.time() - start_time)) print(clustering.get_params()) print(clustering.labels_) count_percentage(clustering.labels_) result_df = pd.DataFrame(data=clustering.labels_, index=df_data.index, columns=['cluster']) start_time = time.time() print("calinski_harabasz_score: ", calinski_harabasz_score(df_data, result_df['cluster'].squeeze())) print("silhouette_score: ", silhouette_score(df_data, result_df['cluster'].squeeze())) print("davies_bouldin_score: ", davies_bouldin_score(df_data, result_df['cluster'].squeeze())) print("time elapsed for scoring: " + str(time.time() - start_time)) result_df.to_csv(os.path.join(CONFIG.CSV_PATH, csv_name), encoding='utf-8-sig')
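# The if/elif chain in do_clustering() can be collapsed into a lookup table.
# A sketch of the same dispatch; the constructor settings mirror the ones
# assumed above and are otherwise arbitrary:
METHODS = {
    0: ('dbscan', lambda: DBSCAN(eps=0.3, min_samples=1000)),
    1: ('optics', lambda: OPTICS(min_samples=1000, metric='cosine')),
    6: ('kmeans', lambda: KMeans(n_clusters=24)),
}

def make_clusterer(cluster_method):
    name, factory = METHODS[cluster_method]
    return name, factory()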
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import metrics

eps = 2
minpts = 2

# Load the normalized, undersampled data and fit DBSCAN on it
df = pd.read_csv('normalized_undersampled.csv')
dbs = DBSCAN(eps=eps, min_samples=minpts)
db = dbs.fit(df)
print(db)
labels_sklearn = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels_sklearn)) - (1 if -1 in labels_sklearn else 0)

# Evaluate: rebuild the ground-truth labels from the raw file
real_label = []
with open('CencusIncomeUndersampled.csv', 'r') as f:
    lines = f.readlines()
    count = 0
    for line in lines:
        x = line.split(',')
        if x[-1] == '<=50K\n':
            real_label.append(0.0)
        elif x[-1] == '>50K\n':
            real_label.append(1.0)
print(labels_sklearn)
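# real_label is built above but never compared with the DBSCAN output; a
# sketch of that comparison (assuming the CSV rows align one-to-one with the
# rows DBSCAN was fitted on):
print('adjusted Rand index:',
      metrics.adjusted_rand_score(real_label, labels_sklearn))
print('clusters found:', n_clusters_)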
k = int(input("Which k yielded the best silhouette score? ")) kmeans = KMeans(n_clusters=k) kmeans.fit(dataset) print(kmeans.labels_) print(silhouette_score(dataset, kmeans.labels_, metric='euclidean')) print("*****") agglomerative = AgglomerativeClustering() agglomerative.fit(dataset) print(agglomerative.labels_) print(silhouette_score(dataset, agglomerative.labels_, metric='euclidean')) print("*****") scan = DBSCAN(eps=0.5, min_samples=2) scan.fit(dataset) print(scan.labels_) print(silhouette_score(dataset, scan.labels_, metric='euclidean')) print("*****") scan2 = DBSCAN(eps=0.55, min_samples=2) scan2.fit(dataset) print(scan2.labels_) print(silhouette_score(dataset, scan2.labels_, metric='euclidean')) print("*****") scan3 = DBSCAN(eps=0.6, min_samples=3) scan3.fit(dataset) print(scan3.labels_) print(silhouette_score(dataset, scan3.labels_, metric='euclidean')) print("*****")
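# Instead of hand-picking three DBSCAN settings, the same comparison can be
# swept over a grid; a sketch (silhouette_score needs at least two distinct
# labels, hence the guard):
for eps in (0.5, 0.55, 0.6):
    for min_samples in (2, 3):
        labels = DBSCAN(eps=eps, min_samples=min_samples).fit(dataset).labels_
        if len(set(labels)) > 1:
            print(eps, min_samples,
                  silhouette_score(dataset, labels, metric='euclidean'))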
def main(): # get images image_1_path = e1.get() image_2_path = e2.get() try: image_1_RGB = plt.imread(image_1_path) image_2_RGB = plt.imread(image_2_path) color_tolerance = float(e4.get()) cluster_tolerance = float(e3.get()) pass except: state.set('ERROR') lstate.config(bg='#FF7F7F') window.update_idletasks() messagebox.showinfo(title='ERROR', message='输入错误!') return None pass # update the state lstate.config(bg='#7FFF7F') window.update_idletasks() # show image state.set('显示图片中。。。') window.update_idletasks() img_open = Image.open(e1.get()) img = img_open.resize((128, 64)) img = ImageTk.PhotoImage(img) lp1.config(image=img) lp1.image = img window.update_idletasks() # show image img_open = Image.open(e2.get()) img = img_open.resize((128, 64)) img = ImageTk.PhotoImage(img) lp2.config(image=img) lp2.image = img window.update_idletasks() # resize to speed up image_1_RGB = Image.open(image_1_path) w_resize = 96 h_resize = int(w_resize * image_1_RGB.size[1] / image_1_RGB.size[0]) image_1_RGB = image_1_RGB.resize((w_resize, h_resize)) image_1_RGB = np.array(image_1_RGB) # resize to speed up image_2_RGB = Image.open(image_2_path) w_resize = 96 h_resize = int(w_resize * image_2_RGB.size[1] / image_2_RGB.size[0]) image_2_RGB = image_2_RGB.resize((w_resize, h_resize)) image_2_RGB = np.array(image_2_RGB) state.set('转换RGB为LAB中。。。') window.update_idletasks() image_1_LAB = cv2.cvtColor(image_1_RGB, cv2.COLOR_RGB2LAB) image_2_LAB = cv2.cvtColor(image_2_RGB, cv2.COLOR_RGB2LAB) # image 1 state.set('第一张图片聚类中。。。') window.update_idletasks() dbscan1 = DBSCAN(eps=cluster_tolerance, min_samples=3) h_1, w_1, c_1 = image_1_LAB.shape image_1_data = image_1_LAB.reshape((h_1 * w_1, c_1)) image_1_lab_data = [] for data in image_1_data: image_1_lab_data.append( [data[0] * 100 / 255, data[1] - 128, data[2] - 128]) pass image_1_lab_data = np.array(image_1_lab_data) dbscan1.fit(image_1_lab_data) labels = dbscan1.labels_ n_clusters_1 = len(set(labels)) - (1 if -1 in labels else 0) # find the cluster center theme_1 = [] cluster_area_1 = [] for i in range(n_clusters_1): one_cluster = image_1_lab_data[labels == i] km = KMeans(n_clusters=1, max_iter=600) km.fit(one_cluster) theme_1.append(np.squeeze(km.cluster_centers_)) cluster_area_1.append(len(one_cluster) / len(image_1_lab_data)) pass theme_1 = np.array(theme_1) # show image uint8_theme_1 = [] for theme in theme_1: uint8_theme_1.append( [theme[0] * 255 / 100, theme[1] + 128, theme[2] + 128]) pass uint8_theme_1 = np.array(uint8_theme_1) pic_array = cv2.cvtColor( np.uint8(uint8_theme_1.reshape(1, len(uint8_theme_1), 3)), cv2.COLOR_LAB2RGB) pic_array = make_image(pic_array[0]) pic = Image.fromarray(pic_array.astype('uint8')).convert('RGB') img = ImageTk.PhotoImage(pic) lp1c.config(image=img) lp1c.image = img window.update_idletasks() # image 2 state.set('第二张图片聚类中。。。') window.update_idletasks() dbscan2 = DBSCAN(eps=cluster_tolerance, min_samples=3) h_2, w_2, c_2 = image_2_LAB.shape image_2_data = image_2_LAB.reshape((h_2 * w_2, c_2)) image_2_lab_data = [] for data in image_2_data: image_2_lab_data.append( [data[0] * 100 / 255, data[1] - 128, data[2] - 128]) pass image_2_lab_data = np.array(image_2_lab_data) dbscan2.fit(image_2_lab_data) labels = dbscan2.labels_ n_clusters_2 = len(set(labels)) - (1 if -1 in labels else 0) # find the cluster center theme_2 = [] cluster_area_2 = [] for i in range(n_clusters_2): one_cluster = image_2_lab_data[labels == i] km = KMeans(n_clusters=1, max_iter=600) km.fit(one_cluster) theme_2.append(np.squeeze(km.cluster_centers_)) 
cluster_area_2.append(len(one_cluster) / len(image_2_lab_data)) pass theme_2 = np.array(theme_2) # show image uint8_theme_2 = [] for theme in theme_2: uint8_theme_2.append( [theme[0] * 255 / 100, theme[1] + 128, theme[2] + 128]) pass uint8_theme_2 = np.array(uint8_theme_2) pic_array = cv2.cvtColor( np.uint8(uint8_theme_2.reshape(1, len(uint8_theme_2), 3)), cv2.COLOR_LAB2RGB) pic_array = make_image(pic_array[0]) pic = Image.fromarray(pic_array.astype('uint8')).convert('RGB') img = ImageTk.PhotoImage(pic) lp2c.config(image=img) lp2c.image = img window.update_idletasks() state.set('聚类完成') window.update_idletasks() def calc_chromatism(lab1, lab2): deltaL = lab1[0] - lab2[0] deltaA = lab1[1] - lab2[1] deltaB = lab1[2] - lab2[2] deltaE = (deltaL**2 + deltaA**2 + deltaB**2)**0.5 return deltaE ''' # image 1 area image1_color_area = [] state.set('计算图片一各颜色面积占比中。。。'+str(0)+'%') window.update_idletasks() for i in range(n_clusters_1): num_same_pixs = 0 L1 = theme_1[i][0] A1 = theme_1[i][1] B1 = theme_1[i][2] LAB1 = [L1, A1, B1] for j in range(0, h_1*w_1): L2 = image_1_lab_data[j][0] A2 = image_1_lab_data[j][1] B2 = image_1_lab_data[j][2] LAB2 = [L2, A2, B2] deltaE = calc_chromatism(LAB1, LAB2) if deltaE <= color_tolerance: num_same_pixs += 1 pass pass area = num_same_pixs/(h_1*w_1) image1_color_area.append(area) state.set('计算图片一各颜色面积占比中。。。'+str(int(100*(i+1)/n_clusters_1))+'%') window.update_idletasks() pass #print(sum(image1_color_area)) # image 2 area image2_color_area = [] state.set('计算图片二各颜色面积占比中。。。'+str(0)+'%') window.update_idletasks() for i in range(n_clusters_2): num_same_pixs = 0 L1 = theme_2[i][0] A1 = theme_2[i][1] B1 = theme_2[i][2] LAB1 = [L1, A1, B1] for j in range(0, h_2*w_2): L2 = image_2_lab_data[j][0] A2 = image_2_lab_data[j][1] B2 = image_2_lab_data[j][2] LAB2 = [L2, A2, B2] deltaE = calc_chromatism(LAB1, LAB2) if deltaE <= color_tolerance: num_same_pixs += 1 pass pass area = num_same_pixs/(h_2*w_2) image2_color_area.append(area) state.set('计算图片二各颜色面积占比中。。。'+str(int(100*(i+1)/n_clusters_2))+'%') window.update_idletasks() pass #print(sum(image2_color_area)) state.set('面积占比计算完成') window.update_idletasks() ''' ''' image1_color_area image2_color_area cluster_area_1 cluster_area_2 ''' Image_1_Area = cluster_area_1[:] Image_2_Area = cluster_area_2[:] print(np.sum(Image_1_Area)) print(np.sum(Image_2_Area)) state.set('共同色选取中。。。') window.update_idletasks() common_color = [] common_area = [] common_color_A = [] common_color_B = [] for i in range(n_clusters_1): L1 = theme_1[i][0] A1 = theme_1[i][1] B1 = theme_1[i][2] LAB1 = [L1, A1, B1] for j in range(n_clusters_2): L2 = theme_2[j][0] A2 = theme_2[j][1] B2 = theme_2[j][2] LAB2 = [L2, A2, B2] deltaE = calc_chromatism(LAB1, LAB2) if deltaE <= color_tolerance: S1 = Image_1_Area[i] / (Image_1_Area[i] + Image_2_Area[j]) S2 = Image_2_Area[j] / (Image_1_Area[i] + Image_2_Area[j]) L3 = L1 * S1 + L2 * S2 A3 = A1 * S1 + A2 * S2 B3 = B1 * S1 + B2 * S2 L1 = round(L1, 3) A1 = round(A1, 3) B1 = round(B1, 3) L2 = round(L2, 3) A2 = round(A2, 3) B2 = round(B2, 3) L3 = round(L3, 3) A3 = round(A3, 3) B3 = round(B3, 3) LAB1 = [L1, A1, B1] LAB2 = [L2, A2, B2] LAB3 = [L3, A3, B3] common_color_A.append(LAB1) common_color_B.append(LAB2) common_color.append(LAB3) common_area.append((Image_1_Area[i], Image_2_Area[j])) pass pass pass #print(common_color) #print(common_area) state.set('共同色选取完成') window.update_idletasks() title = ' ' * 22 + 'LAB' + ' ' * ( 48 - 3) + 'A' + ' ' * 48 + 'B' + ' ' * 32 + 'Std Color' listbox.delete(0, tk.END) listbox.insert(tk.END, title) 
window.update_idletasks() result_info = [] for i in range(len(common_color)): #info = '{:4d}'.format(i+1) + ' '*4 info = '[{:3.3f} {:3.3f} {:3.3f}]'.format(common_color[i][0], \ common_color[i][1], common_color[i][2]) info += ' ' * (28 - len(info)) info += '{:3.2f}'.format(100 * common_area[i][0]) + '%' + ' ' * 4 info += '[{:3.3f} {:3.3f} {:3.3f}]'.format(common_color_A[i][0], \ common_color_A[i][1], common_color_A[i][2]) info += ' ' * (64 - len(info)) info += '{:3.2f}'.format(100 * common_area[i][1]) + '%' + ' ' * 4 info += '[{:3.3f} {:3.3f} {:3.3f}]'.format(common_color_B[i][0], \ common_color_B[i][1], common_color_B[i][2]) info += ' ' * (100 - len(info)) selected_std_color = select_std_color(common_color[i]) info += selected_std_color res = (selected_std_color, info) result_info.append(res) pass colors = [] dict_colors = {} nums = [] for i in range(len(result_info)): colors.append(result_info[i][0]) colors_set = set(colors) pass for color in colors_set: num = colors.count(color) if str(num) not in dict_colors.keys(): nums.append(num) dict_colors[str(num)] = [color] pass else: dict_colors[str(num)].append(color) pass pass #print(dict_colors) index = 0 while dict_colors != {}: num = max(nums) key = str(num) for color in dict_colors[key]: LAB1 = std_colors[color] num_same_pixs = 0 for n in range(0, h_1 * w_1): L2 = image_1_lab_data[n][0] A2 = image_1_lab_data[n][1] B2 = image_1_lab_data[n][2] LAB2 = [L2, A2, B2] deltaE = calc_chromatism(LAB1, LAB2) if deltaE <= color_tolerance: num_same_pixs += 1 pass pass area_A = num_same_pixs / (h_1 * w_1) num_same_pixs = 0 for n in range(0, h_2 * w_2): L2 = image_2_lab_data[n][0] A2 = image_2_lab_data[n][1] B2 = image_2_lab_data[n][2] LAB2 = [L2, A2, B2] deltaE = calc_chromatism(LAB1, LAB2) if deltaE <= color_tolerance: num_same_pixs += 1 pass pass area_B = num_same_pixs / (h_2 * w_2) area = [round(100 * area_A, 2), round(100 * area_B, 2)] for color_info in result_info: if color_info[0] == color: index += 1 std_color_pic = [[[ std_colors[color_info[0]][0] * 255 / 100, std_colors[color_info[0]][1] + 128, std_colors[color_info[0]][2] + 128 ]]] RGB_pic = cv2.cvtColor(np.uint8(std_color_pic), cv2.COLOR_LAB2RGB) RGB = np.squeeze(RGB_pic) listbox.insert(tk.END, ' ') c = '#' R = RGB[0] R = hex(R) R = str(R)[2:] if len(R) == 1: R += '0' pass G = RGB[1] G = hex(G) G = str(G)[2:] if len(G) == 1: G += '0' pass B = RGB[2] B = hex(B) B = str(B)[2:] if len(B) == 1: B += '0' pass c += R + G + B #print(c) listbox.itemconfig(tk.END, bg=c) info = '{:4d}'.format(index) + ' ' * 4 info += color_info[1][:] info += ' ' * 4 + '[{:3.2f}% {:3.2f}%]'.format( area[0], area[1]) listbox.insert(tk.END, info) window.update_idletasks() pass pass pass del dict_colors[key] nums.remove(num) pass scrollbar.config(command=listbox.yview) window.update_idletasks() pass
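# calc_chromatism() above is the CIE76 color difference; the per-pixel Python
# loops around it can be vectorized with NumPy. A sketch, where image_lab is
# an (N, 3) float array such as image_1_lab_data:
import numpy as np

def count_similar(lab_ref, image_lab, tolerance):
    """Fraction of pixels within CIE76 distance `tolerance` of lab_ref."""
    delta_e = np.linalg.norm(image_lab - np.asarray(lab_ref, dtype=float), axis=1)
    return np.count_nonzero(delta_e <= tolerance) / len(image_lab)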
while n <= 2:
    labImg = cv.pyrDown(labImg)
    n = n + 1

# squash the image into a feature vector, 3 channels
featureImg = np.reshape(labImg, ([-1, 3]))
row, col, ch = labImg.shape

# flower image
# db = DBSCAN(eps=5, min_samples=10, metric='euclidean', algorithm='auto')
# highway image
# db = DBSCAN(eps=5, min_samples=5, metric='euclidean', algorithm='brute')
# cars image
db = DBSCAN(eps=5, min_samples=10, metric='euclidean', algorithm='auto')
db.fit(featureImg)
labels = db.labels_
components = db.components_

# append pixel coordinates to the Lab channels (5-D features)
indices = np.dstack(np.indices(labImg.shape[:2]))
xyColors = np.concatenate((labImg, indices), axis=-1)
featureImage2 = np.reshape(xyColors, ([-1, 5]))
db.fit(featureImage2)
labels2 = db.labels_
components2 = db.components_

figureSize = 10
plt.figure(figsize=(figureSize, figureSize))  # plt.plot() takes no figsize
plt.subplot(1, 2, 1)
plt.imshow(image)
plt.subplot(1, 2, 2)
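# When pixel coordinates are appended to the Lab channels, the spatial and
# color features live on very different scales, so a single eps cannot suit
# both. A sketch that standardizes the 5-D features first (the eps value
# here is a placeholder, not a tuned setting):
from sklearn.preprocessing import StandardScaler

scaled = StandardScaler().fit_transform(featureImage2)
db_scaled = DBSCAN(eps=0.5, min_samples=10).fit(scaled)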
def cluster(X):
    # X must be a square pairwise-distance matrix, since metric='precomputed'
    dbscan = DBSCAN(metric='precomputed')
    db = dbscan.fit(X)
    print(db.labels_)
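# cluster() above expects X to already be a square pairwise-distance matrix;
# a sketch of calling it on raw features (`features` is a hypothetical
# (n_samples, n_features) array):
from sklearn.metrics import pairwise_distances

D = pairwise_distances(features)
cluster(D)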
def main(args): pnet, rnet, onet = create_network_face_detection(args.gpu_memory_fraction) with tf.Graph().as_default(), tf.device('/device:GPU:0'): with tf.Session() as sess: facenet.load_model(args.model) image_list = load_images_from_folder(args.data_dir) images = align_data(image_list, args.image_size, args.margin, pnet, rnet, onet) images_placeholder = sess.graph.get_tensor_by_name("input:0") embeddings = sess.graph.get_tensor_by_name("embeddings:0") phase_train_placeholder = sess.graph.get_tensor_by_name( "phase_train:0") feed_dict = { images_placeholder: images, phase_train_placeholder: False } emb = sess.run(embeddings, feed_dict=feed_dict) nrof_images = len(images) matrix = np.zeros((nrof_images, nrof_images)) print('') # Print distance matrix print('Distance matrix') print(' ', end='') for i in range(nrof_images): print(' %1d ' % i, end='') print('') for i in range(nrof_images): print('%1d ' % i, end='') for j in range(nrof_images): dist = np.sqrt( np.sum(np.square(np.subtract(emb[i, :], emb[j, :])))) matrix[i][j] = dist print(' %1.4f ' % dist, end='') print('') print('') # DBSCAN is the only algorithm that doesn't require the number of clusters to be defined. db = DBSCAN(eps=args.cluster_threshold, min_samples=args.min_cluster_size, metric='precomputed') db.fit(matrix) labels = db.labels_ # get number of clusters no_clusters = len(set(labels)) - (1 if -1 in labels else 0) print('No of clusters:', no_clusters) if no_clusters > 0: if args.largest_cluster_only: largest_cluster = 0 for i in range(no_clusters): print('Cluster {}: {}'.format( i, np.nonzero(labels == i)[0])) if len(np.nonzero(labels == i)[0]) > len( np.nonzero(labels == largest_cluster)[0]): largest_cluster = i print('Saving largest cluster (Cluster: {})'.format( largest_cluster)) cnt = 1 for i in np.nonzero(labels == largest_cluster)[0]: misc.imsave( os.path.join(args.out_dir, str(cnt) + '.png'), images[i]) cnt += 1 else: print('Saving all clusters') for i in range(no_clusters): cnt = 1 print('Cluster {}: {}'.format( i, np.nonzero(labels == i)[0])) path = os.path.join(args.out_dir, str(i)) if not os.path.exists(path): os.makedirs(path) for j in np.nonzero(labels == i)[0]: misc.imsave( os.path.join(path, str(cnt) + '.png'), images[j]) cnt += 1 else: for j in np.nonzero(labels == i)[0]: misc.imsave( os.path.join(path, str(cnt) + '.png'), images[j]) cnt += 1
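# The nested loop that fills `matrix` above computes plain Euclidean
# distances between the embeddings; the same matrix can be built in one call:
from sklearn.metrics import pairwise_distances

matrix = pairwise_distances(emb, metric='euclidean')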
def DBSCAN_Clusterization(X, EPS, MIN_SAMPLES): DBClusters = DBSCAN(eps=EPS, min_samples=MIN_SAMPLES, metric='euclidean', algorithm='auto') #'kd_tree') DBClusters.fit(X) #DBClusters.labels_ # Number of clusters in labels, ignoring noise if present. n_clusters_ = len(set( DBClusters.labels_)) - (1 if -1 in DBClusters.labels_ else 0) core_samples = np.zeros_like(DBClusters.labels_, dtype=bool) core_samples[DBClusters.core_sample_indices_] = True # PRINT CLUSTERS & # of CLUSTERS # print("Clusters:"+str(DBClusters.labels_)) # # print('Estimated number of clusters: %d' % n_clusters_) clusters = [X[DBClusters.labels_ == i] for i in range(n_clusters_)] outliers = X[DBClusters.labels_ == -1] if plot: plt.clf() # Plot Outliers plt.scatter(outliers[:, 0], outliers[:, 1], c="black", label="Outliers") # Plot Clusters cmap = get_cmap(len(clusters)) x_clusters = [None] * len(clusters) y_clusters = [None] * len(clusters) #colors = [0] colors = "bgrcmykw" color_index = 0 for i in range(len(clusters)): x_clusters[i] = [] y_clusters[i] = [] # print("Tamano Cluster "+ str(i) + ": " + str(len(clusters[i]))) for j in range(len(clusters[i])): x_clusters[i].append(clusters[i][j][0]) y_clusters[i].append(clusters[i][j][1]) # if plot: plt.scatter(x_clusters[i], y_clusters[i], label="Cluster %d" % i, s=8**2, c=colors[color_index]) #c=cmap(i)) if color_index == len(colors) - 1: color_index = 0 else: color_index += 1 if plot: #plot the Clusters #plt.title("Clusters Vs Serving UABS") plt.scatter(x2, y2, c="yellow", label="UABSs", s=10**2) #plot UABS new position plt.xlabel('x (meters)', fontsize=16) plt.ylabel('y (meters)', fontsize=16) plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), fancybox=True, shadow=True, ncol=5) plt.savefig( "Graph_Clustered_UOS_Scenario {}s.pdf".format(simulation_time), format='pdf', dpi=1000) plt.show() return clusters, x_clusters, y_clusters
y = np.reshape(chunk["victim_position_y"].values, (-1, 1)) if y is None else \ np.vstack((y, np.reshape(chunk["victim_position_y"].values, (-1, 1)))) chunk = data.get_chunk(CHUNKSIZE) total_data_count = X.shape[0] print(total_data_count) training_data = np.array(list(zip(X, y))).reshape(-1, 2) eps_range = list(range(3500, 4000, 200)) sample_range = list(range(100, 300, 25)) # for e in eps_range: # for s in sample_range: model = DBSCAN(eps=EPS, min_samples=MIN_SAMPLES) model.fit(training_data) # visualization group_size = max(model.labels_) # if group_size <= 10: # print("Break One s:{} e:{} max:{}".format(e, s, max(model.labels_))) # break # # if group_size > 100: # print("Continue One s:{} e:{} max:{}".format(e, s, max(model.labels_))) # continue for i in range(group_size): color = "#" + "".join([hex_range[randint(0, 15)] for _ in range(6)]) curr_x = [x for idx, x in enumerate(X) if model.labels_[idx] == i]
""" 8.6.3 基于密度的空间聚类 """ from sklearn import datasets as dss from sklearn.cluster import DBSCAN import matplotlib.pyplot as plt plt.rcParams['font.sans-serif'] = ['FangSong'] plt.rcParams['axes.unicode_minus'] = False X, y = dss.make_moons(n_samples=1000, noise=0.05) dbs_1 = DBSCAN() # 默认核心样本半径0.5,核心样本邻居5个 dbs_2 = DBSCAN(eps=0.2) # 核心样本半径0.2,核心样本邻居5个 dbs_3 = DBSCAN(eps=0.1) # 核心样本半径0.1,核心样本邻居5个 dbs_1.fit(X) dbs_2.fit(X) dbs_3.fit(X) plt.subplot(131) plt.title('eps=0.5') plt.scatter(X[:, 0], X[:, 1], c=dbs_1.labels_) plt.subplot(132) plt.title('eps=0.2') plt.scatter(X[:, 0], X[:, 1], c=dbs_2.labels_) plt.subplot(133) plt.title('eps=0.1') plt.scatter(X[:, 0], X[:, 1], c=dbs_3.labels_) plt.show()
from sklearn.cluster import KMeans, DBSCAN
from sklearn import datasets
import numpy as np
import matplotlib.pyplot as plt

X, y = datasets.make_moons(n_samples=1500, noise=.05)
x1 = X[:, 0]
x2 = X[:, 1]

print("This is the dataset we want to classify with DBSCAN!")
plt.scatter(x1, x2, s=5)
plt.show()

# results with the DBSCAN algorithm
dbscan = DBSCAN(eps=0.1)
dbscan.fit(X)
y_pred = dbscan.labels_.astype(int)  # np.int is deprecated; use the builtin
colors = np.array(['#ff0000', '#00ff00'])
print("These are the clusters with DBSCAN!")
plt.scatter(x1, x2, s=5, color=colors[y_pred])
plt.show()

# results with K-Means clustering
kmeans = KMeans(n_clusters=2)
kmeans.fit(X)
y_pred = kmeans.labels_.astype(int)
colors = np.array(['#ff0000', '#00ff00'])
    x = pd.DataFrame(preprocessing.scale(x_original))
elif preprocess == 'norm':
    x = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(x_original))
else:
    x = x_original
print('done')

# assign column names
x.columns = features
print('features:', features)

print('=-----------------------------------------=')
print('=                clustering                =')
print('=-----------------------------------------=')
for eps in range(1, 21):
    for min_samples in range(0, 201, 10):
        if min_samples == 0:
            min_samples = min_samples + 1
        print("eps:", eps)
        print("min_samples:", min_samples)
        est = DBSCAN(eps=eps * 0.1, min_samples=min_samples)
        est.fit(x)
        col_name = 'labels_' + str(eps) + '_' + str(min_samples)
        df[col_name] = est.labels_
        num_clusters = max(est.labels_) + 1
        print("number of clusters:", num_clusters)
        print('=-----------------------------------------=')
df.to_csv('datasets/clustered_data.csv')
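# The sweep above only prints cluster counts; a sketch that also scores each
# stored labelling so the best (eps, min_samples) pair can be read off
# (degenerate single-cluster labellings are skipped):
from sklearn.metrics import silhouette_score

best = None
for col in [c for c in df.columns if c.startswith('labels_')]:
    labels = df[col]
    if labels.nunique() > 1:
        score = silhouette_score(x, labels)
        if best is None or score > best[1]:
            best = (col, score)
print('best by silhouette:', best)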
# -*- coding: utf-8 -*-
"""
Created on Sat Sep 19 19:47:14 2020

@author: João Victor
"""
from sklearn import metrics
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.cluster import DBSCAN

wine = load_wine()
modelo = wine.target

dbscan = DBSCAN(eps=100, min_samples=50)
print(dbscan)
dbscan.fit(wine.data)
resultado = dbscan.labels_

print(modelo)
print(resultado)
print(metrics.adjusted_mutual_info_score(modelo, resultado))
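# With eps=100 on the raw wine features, the distance is dominated by the
# largest-magnitude variables. A sketch that standardizes first; the smaller
# eps here is a placeholder, not a tuned value:
from sklearn.preprocessing import StandardScaler

X_std = StandardScaler().fit_transform(wine.data)
labels_std = DBSCAN(eps=2.0, min_samples=5).fit(X_std).labels_
print(metrics.adjusted_mutual_info_score(modelo, labels_std))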
def _group(self, cutoff_index=64, algorithm="DBSCAN"):
    """
    Groups elementary components using clustering algorithms.

    Parameters
    ----------
    algorithm : str
        String specifying the clustering algorithm to be applied.
        Possible is ``DBSCAN`` or ``AffinityPropagation``.
    """
    print "Computing clustering with cutoff at i = {}...".format(cutoff_index)
    assert hasattr(self, "_wcorr")
    assert isinstance(algorithm, str)
    assert algorithm in ("DBSCAN", "AffinityPropagation")

    # compute distance from correlation
    X = np.abs(self._wcorr[:cutoff_index, :cutoff_index] - 1.0)

    # compare strings with ==, not "is" (string identity is not guaranteed)
    if algorithm == "DBSCAN":
        from sklearn.cluster import DBSCAN
        db = DBSCAN(min_samples=2, metric="precomputed")
        db.fit(X)
        labels = db.labels_
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        n_noise = list(labels).count(-1)
    elif algorithm == "AffinityPropagation":
        from sklearn.cluster import AffinityPropagation
        # note: affinity="precomputed" expects similarities, so a distance
        # matrix may need its sign flipped before use
        af = AffinityPropagation(affinity="precomputed")
        af.fit(X)  # the model must be fitted before labels_ is available
        labels = af.labels_
        cluster_centers_indices = af.cluster_centers_indices_
        n_clusters = len(cluster_centers_indices)

    # print user info
    print "  Estimated number of clusters: {}".format(n_clusters)
    if algorithm == "DBSCAN":
        print "  Estimated number of noise points: {}".format(n_noise)
    from sklearn import metrics
    s_score = metrics.silhouette_score(X, labels, metric="precomputed")
    print "  Silhouette Coefficient: {:0.3f}".format(s_score)

    # extract cluster indices and power
    clusters = dict()
    _, _, total_power = self._compute_powers()
    for i in range(n_clusters):
        ind = np.where(labels == i)[0]
        relative_power = (self._s[ind]**2).sum() / total_power
        if relative_power > self.__power_threshold:
            print "  Relative power of cluster {0}: {1:0.3f}".format(i, relative_power)
            clusters[i] = [tuple(ind), relative_power]

    # extract clusters from noise
    noise_ind = np.where(labels == -1)[0]
    if algorithm == "DBSCAN":
        assert noise_ind.size == n_noise
    j = n_clusters
    for i, ind in enumerate(noise_ind):
        relative_power = self._s[ind]**2 / total_power
        if relative_power > self.__power_threshold:
            print "  Relative power of noise {0}: {1:0.3f}".format(i, relative_power)
            clusters[j] = [ind, relative_power]
            j += 1

    # set final number of clusters
    self._n_groups = len(clusters)

    # sort according to power
    powers = np.array(zip(*clusters.values())[1])
    sort_ind = np.argsort(powers)[::-1]
    self._group_power = powers[sort_ind]

    # compute groups
    if self._ndim == 1:
        self._ts_groups = np.zeros((self._n, self._n_groups))
    else:
        self._ts_groups = np.zeros(self._n + (self._n_groups, ))
    for i, ind in enumerate(sort_ind):
        indices = clusters[ind][0]
        if isinstance(indices, (tuple, list, np.ndarray)):
            if len(indices) > 1:
                self._ts_groups[..., i] = self._ts_components[..., indices].sum(axis=-1)
            else:
                self._ts_groups[..., i] = self._ts_components[..., indices]
        else:
            self._ts_groups[..., i] = self._ts_components[..., indices]