def _fit_dbscan(self, x):
    # clustering
    for r in xrange(self.repeats):
        # fit and evaluate model
        model = DBSCAN(eps=1.0, min_samples=100)
        model.fit_predict(x)
        k = len(set(model.labels_)) - (1 if -1 in model.labels_ else 0)
        self._labels[r] = model.labels_
        self._parameters[r] = model.core_sample_indices_

        # info (k is known only after the fit)
        if self.debug is True:
            print '\t[%s][c:%d][r:%d]' % (self.clus_type, k, r + 1),

        # build equivalent gmm; core_sample_indices_ are row indices, not
        # coordinates, so use the per-cluster means as component means
        model_gmm = GMM(n_components=k, covariance_type="full")
        model_gmm.means_ = sp.array(
            [x[self._labels[r] == i].mean(axis=0) for i in xrange(k)])
        model_gmm.covars_ = sp.ones((k, self.input_dim)) * self.sigma_factor
        weights = sp.array(
            [(self._labels[r] == i).sum() for i in xrange(k)], dtype=float)
        model_gmm.weights_ = weights / weights.sum()

        # evaluate goodness of fit
        self._ll[r] = model_gmm.score(x).sum()
        if self.gof_type == 'aic':
            self._gof[r] = model_gmm.aic(x)
        if self.gof_type == 'bic':
            self._gof[r] = model_gmm.bic(x)

        # debug info
        if self.debug is True:
            print self._gof[r]
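# The snippet above targets the legacy sklearn.mixture.GMM API. A minimal sketch of the
# same DBSCAN-then-mixture goodness-of-fit idea with the current GaussianMixture class
# might look as follows; the data, eps and min_samples values are illustrative
# assumptions, and EM is allowed to refine the DBSCAN-seeded components.
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs
from sklearn.mixture import GaussianMixture

X_demo, _ = make_blobs(n_samples=500, centers=3, random_state=0)
demo_labels = DBSCAN(eps=0.7, min_samples=10).fit_predict(X_demo)
k_demo = len(set(demo_labels)) - (1 if -1 in demo_labels else 0)

if k_demo > 0:
    # seed one Gaussian component per DBSCAN cluster
    seed_means = np.array([X_demo[demo_labels == i].mean(axis=0) for i in range(k_demo)])
    gmm = GaussianMixture(n_components=k_demo, covariance_type="full",
                          means_init=seed_means, random_state=0).fit(X_demo)
    print("log-likelihood: %.2f  BIC: %.2f" % (gmm.score(X_demo) * len(X_demo), gmm.bic(X_demo)))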
def dbscan_outliers(data, genes, eps, min_samples, max_samples=1, as_json=True): db = DBSCAN(eps=eps, min_samples=min_samples) # sd_scaler = StandardScaler() res = dr.get_dataset_ensembl_info() outliers_id = [] for g in genes: # scaled = sd_scaler.fit(data.loc[g, :]) fit = db.fit(np.reshape(data.loc[g, :], (196, 1))) candidates = itemfreq(fit.labels_) try: class_zero = candidates[0][1] class_one = candidates[1][1] support = min(class_one, class_zero) if min_samples < support <= max_samples: info = [gene for gene in res if gene.ensemblgeneid == g][0] formatted_info = {"id": g, "name": info.genename, "type": info.genetype, "samples": str(support), "distance": "NA"} jinfo = json.dumps(formatted_info) jinfo += "," outliers_id.append(g) print("outlier found :" + g) if as_json: yield (jinfo) else: yield (formatted_info) except: pass
def dbscan_outliers(df):
    """
    Find outliers (noise points) using DBSCAN.

    Parameters
    ----------
    df: A pandas.DataFrame

    Returns
    -------
    A tuple of (a sklearn.DBSCAN instance, a pandas.DataFrame)
    """

    scaler = StandardScaler()
    scaler.fit(df)
    scaled = scaler.transform(df)

    dbs = DBSCAN()
    outliers = dbs.fit_predict(scaled)

    # DBSCAN marks noise points with the label -1
    df_o = df.loc[outliers == -1]

    return dbs, df_o
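# A small usage sketch for dbscan_outliers above, assuming numpy, pandas and the
# sklearn imports used by that function are available; the synthetic columns and
# the default DBSCAN parameters are illustrative only.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
inlier_pts = rng.normal(0, 1, size=(200, 2))
noise_pts = rng.uniform(-8, 8, size=(10, 2))
frame = pd.DataFrame(np.vstack([inlier_pts, noise_pts]), columns=["x", "y"])

model, outlier_rows = dbscan_outliers(frame)
print("flagged %d of %d rows as noise" % (len(outlier_rows), len(frame)))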
def on_squaremsg_received(self, msg): detected_squares = [] for square_msg in msg.squares: detected_squares.append(TrackedSquare.from_msg(square_msg)) self._prev_squares.append(detected_squares) all_squares = list(itertools.chain.from_iterable(self._prev_squares)) square_centers = [list(s.center) + [s.hue] for s in all_squares] data = np.array(square_centers) ms = DBSCAN(eps=64, min_samples=3) ms.fit(data) labels = ms.labels_ ts_msg = TrackedSquares() for i, s in enumerate(all_squares): label = np.int0(labels[i]) if label < 0: continue s.tracking_colour = TrackedSquare.TRACKING_COLOURS[label % len(TrackedSquare.TRACKING_COLOURS)] s.tracking_detected = True ts_msg.squares.append(s.to_msg()) self._squares_pub.publish(ts_msg)
def _cluster(params): cls = None method = sh.getConst('method') if method=='kmedoid': assert False # from kmedoid import kmedsoid # cls = kmedoid elif method=='dbscan': from sklearn.cluster import DBSCAN cls = DBSCAN(eps=params['eps'],min_samples=params['min_samples'], metric='precomputed') else: assert False, 'FATAL: unknown cluster method' ## mat = sh.getConst('mat') labels = cls.fit_predict(mat) nLabels = len(set(labels)) ## sil = None; cal = None if (nLabels >= 2)and(nLabels <= len(labels)-1): sil = met.silhouette_score(mat,labels,'precomputed') cal = met.calinski_harabaz_score(mat,labels) perf = dict(silhouette_score=sil,calinski_harabaz_score=cal) return (labels,perf)
def clusterMalwareNames(malwareNames): # strictly lexical clustering over malware-names wordCount = {} # create a distance matrix matrix = np.zeros((len(malwareNames), len(malwareNames))) for i in range(len(malwareNames)): for j in range(len(malwareNames)): if matrix[i, j] == 0.0: matrix[i, j] = computeSimilarity(malwareNames[i], malwareNames[j]) matrix[j, i] = matrix[i, j] # Scikit-Learn's DBSCAN implementation to cluster the malware-names clust = DBSCAN(eps=0.1, min_samples=5, metric="precomputed") clust.fit(matrix) preds = clust.labels_ clabels = np.unique(preds) # create Word-Count Map for i in range(clabels.shape[0]): if clabels[i] < 0: continue cmem_ids = np.where(preds == clabels[i])[0] cmembers = [] for cmem_id in cmem_ids: cmembers.append(malwareNames[cmem_id]) wordCount[", ".join(uniqueList(cmembers))] = len(cmem_ids) return wordCount
def cluster_with_dbscan(vectors, epsilon=0.5, min_samples=5, distances=None, metric="euclidean"): # precomputing our distances will be faster as we can use multiple cores if distances is None: distances = pairwise_distances(vectors, n_jobs=-1, metric=metric) dbscan = DBSCAN(eps=epsilon, min_samples=min_samples, metric="precomputed") return dbscan.fit_predict(distances)
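# Usage sketch for cluster_with_dbscan: pre-computing a cosine distance matrix for a
# small random feature set. The vectors, epsilon and metric here are placeholder
# assumptions, not values from the original code.
import numpy as np

rng = np.random.RandomState(42)
demo_vectors = rng.rand(300, 20)
demo_labels = cluster_with_dbscan(demo_vectors, epsilon=0.4, min_samples=5, metric="cosine")
print("clusters found:", len(set(demo_labels)) - (1 if -1 in demo_labels else 0))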
def cluster_tweets(tweets): #TODO get TFIDF vector #do clustering ner_tags = [get_ner_tags(tweet).tolist() for tweet in tweets['tweet']] vectorizer = TfidfVectorizer(preprocessor=_dummy_preprocess, tokenizer=lambda x:x, binary=True, min_df=0, use_idf=True, smooth_idf=True) tfidf = vectorizer.fit_transform(ner_tags) #ner_tags = [get_ner_tags(tweet) for tweet in tweets['tweet']] print "clustering started" t0 = time() #cluster = AgglomerativeClustering(n_clusters=3, affinity="cosine" ) #cluster = MiniBatchKMeans(n_clusters=10, max_iter=100, batch_size=100) #metric=sklearn.metrics.pairwise.cosine_distances cluster = DBSCAN(min_samples=2, eps=0.5) clustered = cluster.fit(tfidf.todense()) #clustered = cluster.fit(ner_tags) labels = clustered.labels_ n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) print "clustering finished in %.3f seconds"%(time()-t0) print "%d clusters detected"%n_clusters_ tweets['cluster'] = labels tweets['ner'] = ner_tags return tweets
def DBScan_Flux(phots, ycenters, xcenters, dbsClean=0, useTheForce=False):
    """Flag photometry samples that fall in the "clean" DBSCAN cluster.

    Args:
        phots: 1D (or flattenable) array of flux measurements.
        ycenters, xcenters: centroid positions matching `phots`.
        dbsClean: DBSCAN label treated as the clean cluster (default 0).
        useTheForce: unused here; kept for interface compatibility.

    Returns:
        Boolean array, True where a sample belongs to the `dbsClean` cluster.
    """
    dbsPhots = DBSCAN()  # n_jobs=-1
    stdScaler = StandardScaler()

    phots = np.copy(phots.ravel())
    phots[~np.isfinite(phots)] = np.median(phots[np.isfinite(phots)])

    featuresNow = np.transpose([stdScaler.fit_transform(ycenters[:, None]).ravel(),
                                stdScaler.fit_transform(xcenters[:, None]).ravel(),
                                stdScaler.fit_transform(phots[:, None]).ravel()])

    # print(featuresNow.shape)
    dbsPhotsPred = dbsPhots.fit_predict(featuresNow)

    return dbsPhotsPred == dbsClean
def plot_dbscan():
    X, y = make_blobs(random_state=0, n_samples=12)

    dbscan = DBSCAN()
    clusters = dbscan.fit_predict(X)

    fig, axes = plt.subplots(3, 4, figsize=(11, 8),
                             subplot_kw={'xticks': (), 'yticks': ()})
    # Plot clusters as red, green and blue, and outliers (-1) as white
    colors = ['r', 'g', 'b']
    markers = ['o', '^', 'v']

    # iterate over settings of min_samples and eps
    for i, min_samples in enumerate([2, 3, 5]):
        for j, eps in enumerate([1, 1.5, 2, 3]):
            # instantiate DBSCAN with a particular setting
            dbscan = DBSCAN(min_samples=min_samples, eps=eps)
            # get cluster assignments
            clusters = dbscan.fit_predict(X)
            print("min_samples: %d eps: %f  cluster: %s"
                  % (min_samples, eps, clusters))
            if np.any(clusters == -1):
                c = ['w'] + colors
                m = ['o'] + markers
            else:
                c = colors
                m = markers
            discrete_scatter(X[:, 0], X[:, 1], clusters, ax=axes[i, j], c=c,
                             s=8, markers=m)
            inds = dbscan.core_sample_indices_
            # visualize core samples and clusters
            if len(inds):
                discrete_scatter(X[inds, 0], X[inds, 1], clusters[inds],
                                 ax=axes[i, j], s=15, c=colors, markers=markers)
            axes[i, j].set_title("min_samples: %d eps: %.1f"
                                 % (min_samples, eps))
    fig.tight_layout()
def search_charges(self, data, z=0, threshold = 30):
    A = deriv(data,z)
    print 'Searching charges...'
    time0 = time.time()

    det = A[3]*A[5]-A[4]**2
    dx = -(A[1]*A[5]-A[2]*A[4])/det
    dy = -(A[2]*A[3]-A[1]*A[4])/det
    datamax = A[0]+A[1]*dx+A[2]*dy+A[3]*dx**2/2+A[4]*dx*dy+A[5]*dy**2/2

    t = np.where((np.abs(dx) < 1)*(np.abs(dy) < 1)*(np.abs(datamax) > threshold)*(det > 0))
    x = np.array([t[1]+dx[t], t[0]+dy[t]]).T
    db = DBSCAN(min_samples = 1, eps = 1)
    db.fit_predict(x)
    n_charges = np.max(db.labels_)+1
    qi = np.zeros(n_charges)
    xi = np.zeros((3,n_charges))

    for i in range(0, n_charges):
        xi[0:2,i] = np.mean(x[db.labels_ == i,:], axis=0)
        qi[i] = np.mean(datamax[t][db.labels_ == i])

    self.set_charges(qi,xi)
    print 'Done! Elapsed time: '+str(time.time()-time0)
    return self
def cluster(): eps_set = 0.5 * np.arange(1, 7) npt_set = np.arange(1, 6) scores = [] global res res = [] for eps in eps_set: for npt in npt_set: est = DBSCAN(eps=eps, min_samples=npt) est.fit(x) ari = metrics.adjusted_rand_score(y, est.labels_) scores.append(ari) n_noise = len([ l for l in est.labels_ if l == -1]) res.append((ari, np.max(est.labels_) + 1 , n_noise)) print ari max_score = np.max(scores) max_idx = scores.index(max_score) max_eps = eps_set[max_idx / len(npt_set)] max_npt = npt_set[max_idx % len(npt_set)] print max_score, max_eps, max_npt scores = np.array(scores).reshape(len(eps_set), len(npt_set)) pl.imshow(scores, interpolation='nearest', cmap=pl.cm.spectral) pl.colorbar() pl.xticks(np.arange(len(npt_set)), npt_set) pl.yticks(np.arange(len(eps_set)), eps_set) pl.ylabel('eps') pl.xlabel('min_samples') pl.show()
def current_datapoints_dbscan(self): """ Method clusters points-outliers (after current_datapoints_threshold_filter and current_datapoints_outliers_filter) into slice-clusters using DBSCAN. Returns dict of slice-clusters - base for event-candidates. Uses self.eps attribute to estimate cluster boundaries. """ nets = self.current_datapoints.keys() ids = concatenate([self.current_datapoints[x]['ids'] for x in nets]) coords = concatenate([self.current_datapoints[x]['array'] for x in nets]) weights = concatenate([self.current_datapoints[x]['weights'] for x in nets]) if len(ids) > 0: clustering = DBSCAN(eps=self.eps, min_samples=5) labels = clustering.fit_predict(coords) core_ids = ids[clustering.core_sample_indices_] ids = ids[labels > -1] coords = coords[labels > -1] weights = weights[labels > -1] labels = labels[labels > -1] ret_tab = {} for i in range(len(labels)): try: ret_tab[labels[i]].append({'id':ids[i], 'lng':coords[i,0], 'lat':coords[i,1], 'weight':weights[i], 'is_core':ids[i] in core_ids}) except KeyError: ret_tab[labels[i]] = [{'id':ids[i], 'lng':coords[i,0], 'lat':coords[i,1], 'weight':weights[i], 'is_core':ids[i] in core_ids}] return ret_tab else: return {}
def get_clusters(tracks): neighbors = g.m.neighborsSpin.value() dist = g.m.neighborDistanceSpin.value() data = np.array([[tr['mean_x'], tr['mean_y']] for tr in tracks]) scanner = DBSCAN(eps=dist, min_samples=neighbors) ids = scanner.fit_predict(data) return ids
def cluster_dbscan(matrix, distance_measure="sts", eps=1): """Clusters the distance matrix for a given epsilon value, if distance measure is sts. Other distance measures are: [‘cityblock’, ‘cosine’, ‘euclidean’, ‘l1’, ‘l2’, ‘manhattan’, ‘braycurtis’, ‘canberra’, ‘chebyshev’, ‘correlation’, ‘dice’, ‘hamming’, ‘jaccard’, ‘kulsinski’, ‘mahalanobis’, ‘matching’, ‘minkowski’, ‘rogerstanimoto’, ‘russellrao’, ‘seuclidean’, ‘sokalmichener’, ‘sokalsneath’, ‘sqeuclidean’, ‘yule’] Parameters ---------- matrix: np.matrix The input matrix. If distance measure is sts, this should be the sts distance matrix. If other distance, this should be the time-series matrix of size ngenes x nsamples. distance_measure: str The distance measure, default is sts, short time-series distance. Any distance measure available in scikit-learn is available here. Note: multiple time-series is NOT supported for distances other than "sts". Returns ------- cluster_labels: list of int A list of size ngenes that defines cluster membership. """ if (distance_measure == "sts"): dbs = DBSCAN(eps=eps, metric='precomputed', min_samples=2) else: dbs = DBSCAN(eps=eps, metric=distance_measure, min_samples=2) cluster_labels = dbs.fit_predict(matrix) return cluster_labels
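# Usage sketch for cluster_dbscan above: with the default "sts" setting the function
# expects a precomputed, symmetric distance matrix. The matrix values here are made up
# for illustration, and the sklearn import used by the function is assumed available.
import numpy as np

demo_dist = np.array([[0.0, 0.2, 0.9, 1.0],
                      [0.2, 0.0, 0.8, 0.9],
                      [0.9, 0.8, 0.0, 0.3],
                      [1.0, 0.9, 0.3, 0.0]])
demo_labels = cluster_dbscan(demo_dist, distance_measure="sts", eps=0.5)
print(demo_labels)  # two tight pairs -> two clusters with min_samples=2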
def cluster_mappings(vector_inpath, do_pca=False, target_dim=100, indices_inpath=None, epsilon=2.5, min_s=20): # TODO: CLustering parameters # TODO: Metric cosine similarity or euclidian distance print alt("Load mappings...") indices, model = load_mappings_from_model(vector_inpath) X = numpy.array([model[key] for key in indices]) # del model if do_pca: print alt("Truncate vectors with PCA to %i dimensions..." %(target_dim)) pca = PCA(n_components=target_dim) pca.fit(X) X = pca.transform(X) print alt("Cluster points...") # k = 2 * X[0].shape[0] - 1 # min_pts = k + 1 #dbscan = DBSCAN(eps=0.1, min_samples=20, metric='cosine',algorithm='brute') dbscan = DBSCAN(eps=epsilon, min_samples=min_s) dbscan.fit(X) labels = dbscan.labels_ print get_cluster_size(labels) print alt("Finished clustering!") sscore = silhouette_score(X, labels) print("Silhouette Coefficient: %0.3f" %(sscore)) if indices_inpath: resolve_indices(indices, labels, indices_inpath, model)
def test(): global est est = DBSCAN(eps=1, min_samples=1) est.fit(x) print est.labels_ ari = metrics.adjusted_rand_score(y, est.labels_) print ari
def train_dbscan():
    print "starting dbscan clustering..."
    model = DBSCAN(eps=dbs_eps, min_samples=dbs_min_samples, metric=dbs_metric, algorithm='auto')
    model.fit(X)

    core_points = model.core_sample_indices_
    if output_core_points:
        print "core points data index"
        print core_points
    print "num of core points %d" %(len(core_points))

    print "all points cluster index"
    cluster_index = model.labels_
    if output_cluster_members:
        #print cluster_index
        cluster_members = {}
        for i,c in enumerate(cluster_index):
            index_list = cluster_members.get(c, list())
            index_list.append(i)
            cluster_members[c] = index_list

        for cl, indx_list in cluster_members.iteritems():
            # DBSCAN labels noise points -1; anything >= 0 is a real cluster
            if cl >= 0:
                print "cluster index %d size %d" %(cl, len(indx_list))
            else:
                print "noise points size %d" %(len(indx_list))
            print indx_list

    print "num of clusters %d" %(cluster_index.max() + 1)
def cluster_DBSCAN(args): """ Clustering with Ward hierarchical clustering: constructs a tree and cuts it. """ #load data g_it = node_link_data.node_link_data_to_eden(input = args.input_file, input_type = "file") vec = graph.Vectorizer(r = args.radius,d = args.distance, nbits = args.nbits) logger.info('Vectorizer: %s' % vec) X = vec.transform(g_it, n_jobs = args.n_jobs) logger.info('Instances: %d Features: %d with an avg of %d features per instance' % (X.shape[0], X.shape[1], X.getnnz() / X.shape[0])) #project to lower dimensional space to use clustering algorithms transformer = TruncatedSVD(n_components=args.n_components) X_dense=transformer.fit_transform(X) #log statistics on data logger.info('Dimensionality reduction Instances: %d Features: %d with an avg of %d features per instance' % (X_dense.shape[0], X_dense.shape[1], X.getnnz() / X.shape[0])) #clustering clustering_algo = DBSCAN(eps = args.eps) y = clustering_algo.fit_predict(X_dense) msg = 'Predictions statistics: ' msg += util.report_base_statistics(y) logger.info(msg) #save model for vectorizer out_file_name = "vectorizer" eden_io.dump(vec, output_dir_path = args.output_dir_path, out_file_name = out_file_name) logger.info("Written file: %s/%s",args.output_dir_path, out_file_name) #save result out_file_name = "labels" eden_io.store_matrix(matrix = y, output_dir_path = args.output_dir_path, out_file_name = out_file_name, output_format = "text") logger.info("Written file: %s/%s",args.output_dir_path, out_file_name)
def find_tracks(data, eps=20, min_samples=20): """Applies the DBSCAN algorithm from scikit-learn to find tracks in the data. Parameters ---------- data : array-like An array of (x, y, z, hits) data points eps : number, optional The minimum distance between adjacent points in a cluster min_samples : number, optional The min number of points in a cluster Returns ------- tracks : list A list of tracks. Each track is an ndarray of points. """ xyz = data[:, 0:3] dbs = DBSCAN(eps=eps, min_samples=min_samples) dbs.fit(xyz) tracks = [] for track in (np.where(dbs.labels_ == n)[0] for n in np.unique(dbs.labels_) if n != -1): tracks.append(data[track]) return tracks
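# Usage sketch for find_tracks: two well-separated synthetic "tracks" in (x, y, z, hits)
# format. The fake data and the eps/min_samples values are assumptions for illustration;
# numpy and the DBSCAN import used by the function are assumed available.
import numpy as np

rng = np.random.RandomState(0)
track_a = np.column_stack([rng.normal(0, 5, (100, 3)), rng.poisson(10, 100)])
track_b = np.column_stack([rng.normal(200, 5, (100, 3)), rng.poisson(10, 100)])
events = np.vstack([track_a, track_b])

demo_tracks = find_tracks(events, eps=20, min_samples=20)
print("found %d tracks with sizes %s" % (len(demo_tracks), [len(t) for t in demo_tracks]))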
def classify_core(self, N_CLUSTERS, clusterType, data_for_trial_type, begin_time, end_time):

    BEGIN_TIME_FRAME = begin_time*self.griddy.TIME_GRID_SPACING
    END_TIME_FRAME = end_time*self.griddy.TIME_GRID_SPACING

    data = data_for_trial_type[:,BEGIN_TIME_FRAME:END_TIME_FRAME,self.griddy.VEL_X]

    labels = None
    if clusterType == 'kmeans':
        kmeans = KMeans(n_clusters=N_CLUSTERS)
        kmeans.fit(data)
        labels = kmeans.labels_
    elif clusterType == 'affinity_propagation':
        ap = AffinityPropagation(damping=0.75)
        ap.fit(data)
        labels = ap.labels_
        N_CLUSTERS = np.max(labels)+1
    elif clusterType == 'DBSCAN':
        dbscan = DBSCAN()
        dbscan.fit(data)
        labels = dbscan.labels_
        N_CLUSTERS = np.max(labels)+1
        print 'N_CLUSTERS=' + str(N_CLUSTERS)
    elif clusterType == 'AgglomerativeClustering':
        ac = AgglomerativeClustering(n_clusters=N_CLUSTERS)
        ac.fit(data)
        labels = ac.labels_
    else:
        print 'ERROR: clusterType: ' + clusterType + ' is not recognized'

    return (labels, N_CLUSTERS)
def dbscan(similarity, concepts=2, euclid=False): if euclid: model = DBSCAN(eps=0.6, min_samples=10, algorithm='auto', leaf_size=30) return model.fit_predict(similarity) else: model = DBSCAN(eps=0.6, min_samples=10, metric='precomputed', algorithm='auto', leaf_size=30) return model.fit_predict(1 - similarity)
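# Usage sketch for the dbscan function above (the one taking a similarity matrix): a toy
# block-structured similarity matrix in [0, 1]; with euclid=False it clusters on
# 1 - similarity as a precomputed distance. The block sizes and values are made up so
# that each group exceeds the function's min_samples=10.
import numpy as np

rng = np.random.RandomState(1)
block = lambda n: 0.8 + 0.2 * rng.rand(n, n)    # high within-group similarity
cross = lambda n, m: 0.1 * rng.rand(n, m)       # low cross-group similarity
sim = np.block([[block(12), cross(12, 12)],
                [cross(12, 12), block(12)]])
sim = (sim + sim.T) / 2.0                       # symmetrise
np.fill_diagonal(sim, 1.0)

print(dbscan(sim, euclid=False))                # two dense blocks -> two cluster labels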
def cluster_lvl1(self, data): db = DBSCAN(eps=2., min_samples=2, metric='precomputed') processed = np.float32(np.vstack([ np.mgrid[:self.map_height, :self.map_width].reshape(2, -1), data.ravel() ])).T dist = self.distances_for_lvl1(processed) return db.fit_predict(dist).reshape(self.map_height, self.map_width)
def regroup(self, maxdistance, minsize, algo = 'auto'):
    self.__loginfo('Regrouping')
    dbsfit = DBSCAN(eps=maxdistance, min_samples=minsize, algorithm=algo).fit(self.primarylist)
    # labels_ already holds the assignment from fit(); no second fit is needed
    dbsresult = dbsfit.labels_
    grouplist = []
    for grouplabel in dbsresult:
        if not grouplabel in grouplist:
            grouplist.append(grouplabel)
    self.__loginfo('Group label count: %s' % len(grouplist))
def cluster_dbscan(self, calpha=False, cluster_diameter=6, cluster_min_size=10): ''' cluster the residues using the DBSCAN method. The parameters here are neighborhood diameter (eps) and neighborhood connectivity (min_samples). Returns a list of cluster labels, in which label ``-1`` means an outlier point, which doesn't belong to any cluster. ''' if not self.positive_residues: return {} if calpha: data_atoms = self.positive_residues.select('ca') else: data_atoms = self.positive_residues.select('sidechain or ca').copy() assert ( data_atoms.getHierView().numResidues() == self.positive_residues.getHierView().numResidues() ) OUTLIER_LABEL = -1 db_clust = DBSCAN(eps=cluster_diameter, min_samples=cluster_min_size) db_clust.fit(data_atoms.getCoords()) db_labels = db_clust.labels_.astype(int) #print db_labels, len(db_labels) if calpha: residue_labels = db_labels else: residues = list(data_atoms.getHierView().iterResidues()) residue_labels = np.zeros(len(residues), dtype=int) def most_common(lst): lst = list(lst) return max(set(lst) or [OUTLIER_LABEL], key=lst.count) data_atoms.setBetas(db_labels) for i, res in enumerate(residues): atom_labels = res.getBetas() residue_labels[i] = most_common(atom_labels[atom_labels!=OUTLIER_LABEL]) assert len(residue_labels) == self.positive_residues.getHierView().numResidues() residue_numbers = self.positive_residues.ca.getResnums() clusters = sorted( [residue_numbers[residue_labels==i] for i in set(residue_labels) if i!=-1], key=self.conf_sum, reverse=True, ) return dict(enumerate(clusters))
def main(datafile, feature1, feature2, normalize, clusteroutput, percentile, copula): X, features = read_sah_h5(datafile, just_good=False) if 'id' not in features: ids = np.arange(len(X)) else: ids = X[:, features.index('id')] x = X[:, features.index(feature1)] y = X[:, features.index(feature2)] D = np.column_stack([x, y]) idx = np.random.randint(len(X), size=10000) D = D[idx] ids = ids[idx] if normalize: mean = np.average(D, axis=0) std = np.std(D, axis=0) std[np.nonzero(std == 0.0)] = 1.0 # Avoid NaNs Dnorm = (D - mean) / std elif copula: Dnorm = np.column_stack([copula_transform(f) for f in D.T]) else: Dnorm = D kmeans = MiniBatchKMeans(n_clusters=50) gmm = GMM(n_components=200, covariance_type='full', verbose=True) #C = gmm.fit_predict(Dnorm) dbscan = DBSCAN(eps=100.0, min_samples=1) C = dbscan.fit_predict(Dnorm) print C with open(clusteroutput, 'w+') as f: for c, i in zip(C, ids): f.write('%d,%d\n' % (i, c)) pl.scatter(D[:, 0], D[:, 1], color=pl.cm.spectral(C.astype(float) / np.max(C))) # for c in np.unique(C): # pl.bar(0, 0, lw=0, ec='none', # fc=pl.cm.spectral(float(c) / np.max(C)), label='Cluster %d' % c) # pl.legend(loc='upper left') if percentile > 0: pl.xlim( scoreatpercentile(x, percentile), scoreatpercentile(x, 100-percentile) ) pl.ylim( scoreatpercentile(y, percentile), scoreatpercentile(y, 100-percentile) ) pl.xlabel(feature1) pl.ylabel(feature2) pl.show()
def dbscan(self, eps=0.75, min_samples=3):
    """
    :param eps: maximum distance between two points in the same neighbourhood
    :param min_samples: number of points required in a neighbourhood for a core point
    :return: Partition built from the DBSCAN cluster labels
    """
    est = DBSCAN(metric='precomputed', eps=eps, min_samples=min_samples)
    est.fit(self.get_dm(False))
    return Partition(est.labels_)
def fit(fvecs, params): eps_ = int(params[0]) min_s = int(params[1]) metric_=params[2] # affinity : “euclidean”, “l1”, “l2”, “manhattan”, “cosine”, or ‘precomputed’ model = DBSCAN(eps=eps_, min_samples=min_s, metric=metric_) model.fit(fvecs) print len(set(model.labels_)) return model.labels_
def mode_cluster(mode,eps,sam): mode_change_pnts=[] # print(tran_mat) query = {"$and": [{'type': 'move'},\ {'confirmed_mode':mode}]} # print(Sections.find(query).count()) logging.debug("Trying to find cluster locations for %s trips" % (Sections.find(query).count())) for section in Sections.find(query).sort("section_start_datetime",1): try: mode_change_pnts.append(section['section_start_point']['coordinates']) mode_change_pnts.append(section['section_end_point']['coordinates']) except: logging.warn("Found trip %s with missing start and/or end points" % (section['_id'])) pass # print(user_change_pnts) # print(len(mode_change_pnts)) if len(mode_change_pnts) == 0: logging.debug("No points found in cluster input, nothing to fit..") return np.zeros(0) if len(mode_change_pnts)>=1: # print(mode_change_pnts) np_points=np.array(mode_change_pnts) # print(np_points[:,0]) # fig, axes = plt.subplots(1, 1) # axes.scatter(np_points[:,0], np_points[:,1]) # plt.show() else: pass utm_x = [] utm_y = [] for row in mode_change_pnts: # GEOJSON order is lng, lat utm_loc = utm.from_latlon(row[1],row[0]) utm_x = np.append(utm_x,utm_loc[0]) utm_y = np.append(utm_y,utm_loc[1]) utm_location = np.column_stack((utm_x,utm_y)) db = DBSCAN(eps=eps,min_samples=sam) db_fit = db.fit(utm_location) db_labels = db_fit.labels_ #print db_labels new_db_labels = db_labels[db_labels!=-1] new_location = np_points[db_labels!=-1] # print len(new_db_labels) # print len(new_location) # print new_information label_unique = np.unique(new_db_labels) cluster_center = np.zeros((len(label_unique),2)) for label in label_unique: sub_location = new_location[new_db_labels==label] temp_center = np.mean(sub_location,axis=0) cluster_center[int(label)] = temp_center # print cluster_center return cluster_center
def done(self):
    matrix = [[0]*self.count for i in range(self.count)]
    for keys,distance in self.matrixDict.iteritems():
        matrix[keys[0]][keys[1]] = distance
        matrix[keys[1]][keys[0]] = distance

    db = DBSCAN(eps=args.epsilon, metric='precomputed', min_samples=args.min_samples)
    output = db.fit(matrix)

    for label,i in self.labelToPos.iteritems():
        # write() expects a string, so serialise the row fields before writing
        row = list(self.tup) + [label, output.labels_[i]]
        args.outfile.write("\t".join(map(str, row)) + "\n")
"Import libraries" import numpy as np import pandas as pd import matplotlib.pyplot as plt from sklearn.cluster import DBSCAN from sklearn import datasets from sklearn.decomposition import PCA "Import Datasets" #data = pd.DataFrame(datasets.load_iris().data) #y = list(datasets.load_iris().target) data = pd.read_csv("glass.csv", sep=",") data.drop(['Type'], axis=1, inplace=True) "Apply DBSCAN model and its parameters" model = DBSCAN(eps=0.5, min_samples=5, metric="euclidean", leaf_size=30) "Fit data in given model" model.fit(data) "classified clusters" model.labels_ "PCA decomposition for plotting" pca = PCA(n_components=2).fit(data) pca_2D = pca.transform(data) "Plot clusters and Noise" for i in np.arange(pca_2D.shape[0]): if model.labels_[i] == 0: c1 = plt.scatter(pca_2D[i, 0], pca_2D[i, 1], c="r", marker='+')
def main_worker(args): global start_epoch, best_mAP cudnn.benchmark = True sys.stdout = Logger(osp.join(args.logs_dir, 'log.txt')) print("==========\nArgs:{}\n==========".format(args)) # Create data loaders iters = args.iters if (args.iters > 0) else None dataset_target = get_data(args.dataset_target, args.data_dir) ori_train = dataset_target.train if not args.no_source: dataset_source = get_data(args.dataset_source, args.data_dir) test_loader_target = get_test_loader(dataset_target, args.height, args.width, args.batch_size, args.workers) # Create model model_1, model_1_ema = create_model(args) # Evaluator evaluator_1_ema = Evaluator(model_1_ema) best_mAP = 0 for nc in range(args.epochs): cluster_loader = get_test_loader(dataset_target, args.height, args.width, args.batch_size, args.workers, testset=dataset_target.train) dict_f, _ = extract_features(model_1_ema, cluster_loader, print_freq=50) cf_1 = torch.stack(list(dict_f.values())) if not args.no_source: cluster_loader_source = get_test_loader( dataset_source, args.height, args.width, args.batch_size, args.workers, testset=dataset_source.train) dict_f_source, _ = extract_features(model_1_ema, cluster_loader_source, print_freq=50) cf_1_source = torch.stack(list(dict_f_source.values())) # DBSCAN cluster if args.no_source: rerank_dist = compute_jaccard_dist(cf_1, lambda_value=0, source_features=None, use_gpu=False).numpy() else: rerank_dist = compute_jaccard_dist(cf_1, lambda_value=args.lambda_value, source_features=cf_1_source, use_gpu=False).numpy() tri_mat = np.triu(rerank_dist, 1) # tri_mat.dim=2 tri_mat = tri_mat[np.nonzero(tri_mat)] # tri_mat.dim=1 tri_mat = np.sort(tri_mat, axis=None) top_num = np.round(args.rho * tri_mat.size).astype(int) eps = tri_mat[:top_num].mean() print('eps in cluster: {:.3f}'.format(eps)) print('Clustering and labeling...') cluster = DBSCAN(eps=eps, min_samples=4, metric='precomputed', n_jobs=-1) labels = cluster.fit_predict(rerank_dist) num_ids = len(set(labels)) - 1 print('Epoch {} have {} training ids'.format(nc, num_ids)) # generate new dataset labeled_ind, unlabeled_ind = [], [] for ind, label in enumerate(labels): if label == -1: unlabeled_ind.append(ind) else: labeled_ind.append(ind) # print('Epoch {} have {} labeled samples and {} unlabeled samples'.format(nc + 1, len(labeled_ind), len(unlabeled_ind))) cf_1 = cf_1.numpy() centers = [] for id in range(num_ids): centers.append(np.mean(cf_1[labels == id], axis=0)) centers = np.stack(centers, axis=0) # print(centers.shape) if args.features == 0: model_1.module.classifier = nn.Linear(2048, num_ids, bias=False).cuda() model_1_ema.module.classifier = nn.Linear(2048, num_ids, bias=False).cuda() model_1.module.classifier_max = nn.Linear(2048, num_ids, bias=False).cuda() model_1_ema.module.classifier_max = nn.Linear(2048, num_ids, bias=False).cuda() model_1.module.classifier.weight.data.copy_( torch.from_numpy(normalize(centers[:, :2048], axis=1)).float().cuda()) model_1_ema.module.classifier.weight.data.copy_( torch.from_numpy(normalize(centers[:, :2048], axis=1)).float().cuda()) model_1.module.classifier_max.weight.data.copy_( torch.from_numpy(normalize(centers[:, 2048:], axis=1)).float().cuda()) model_1_ema.module.classifier_max.weight.data.copy_( torch.from_numpy(normalize(centers[:, 2048:], axis=1)).float().cuda()) else: model_1.module.classifier = nn.Linear(1024, num_ids, bias=False).cuda() model_1_ema.module.classifier = nn.Linear(1024, num_ids, bias=False).cuda() model_1.module.classifier_max = nn.Linear(1024, num_ids, bias=False).cuda() 
model_1_ema.module.classifier_max = nn.Linear(1024, num_ids, bias=False).cuda() model_1.module.classifier.weight.data.copy_( torch.from_numpy(normalize(centers[:, :1024], axis=1)).float().cuda()) model_1_ema.module.classifier.weight.data.copy_( torch.from_numpy(normalize(centers[:, :1024], axis=1)).float().cuda()) model_1.module.classifier_max.weight.data.copy_( torch.from_numpy(normalize(centers[:, 1024:], axis=1)).float().cuda()) model_1_ema.module.classifier_max.weight.data.copy_( torch.from_numpy(normalize(centers[:, 1024:], axis=1)).float().cuda()) target_label = labels for i in range(len(dataset_target.train)): dataset_target.train[i] = list(dataset_target.train[i]) dataset_target.train[i][1] = int(target_label[i]) dataset_target.train[i] = tuple(dataset_target.train[i]) # Optimizer params = [] for key, value in model_1.named_parameters(): if not value.requires_grad: continue params += [{ "params": [value], "lr": args.lr, "weight_decay": args.weight_decay }] optimizer = torch.optim.Adam(params) # Trainer trainer = ABMTTrainer(model_1, model_1_ema, num_cluster=num_ids, alpha=args.alpha) epoch = nc # # DBSCAN dataset_target.train = [ori_train[i] for i in labeled_ind] print(len(dataset_target.train), 'are labeled.') labeled_loader_target = get_train_loader(dataset_target, args.height, args.width, args.batch_size, args.workers, args.num_instances, iters, mutual=True) labeled_loader_target.new_epoch() trainer.train(epoch, labeled_loader_target, optimizer, print_freq=args.print_freq, train_iters=len(labeled_loader_target)) def save_model(model_ema, is_best, best_mAP, mid): save_checkpoint( { 'state_dict': model_ema.state_dict(), 'epoch': epoch + 1, 'best_mAP': best_mAP, }, is_best, fpath=osp.join(args.logs_dir, 'model' + str(mid) + '_checkpoint.pth.tar')) if ((epoch + 1) % args.eval_step == 0 or (epoch == args.epochs - 1)): print('Evaluating teacher net:') cmc, mAP_1 = evaluator_1_ema.evaluate(test_loader_target, dataset_target.query, dataset_target.gallery, cmc_flag=True) is_best = (mAP_1 > best_mAP) best_mAP = max(mAP_1, best_mAP) save_model(model_1_ema, is_best, best_mAP, 1) dataset_target.train = ori_train print('Test on the best model.') checkpoint = load_checkpoint(osp.join(args.logs_dir, 'model_best.pth.tar')) model_1_ema.load_state_dict(checkpoint['state_dict']) evaluator_1_ema.evaluate(test_loader_target, dataset_target.query, dataset_target.gallery, cmc_flag=True)
def dbscan_cluster(data: np.ndarray, eps: float = 0.05, min_samples: int = 5) -> np.ndarray: return DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X=data)
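# Usage sketch for dbscan_cluster: the classic two-moons set, where DBSCAN's density
# criterion separates shapes that k-means cannot. The eps value is chosen by eye and is
# an assumption, not part of the original wrapper.
from sklearn.datasets import make_moons

moons, _ = make_moons(n_samples=300, noise=0.05, random_state=0)
moon_labels = dbscan_cluster(moons, eps=0.2, min_samples=5)
print("labels found:", sorted(set(moon_labels)))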
def test(file_name): data_frame = pd.read_csv('/home/mytrah-pc/Mytrah_Adithya/data_turbine/' + file_name) num_rows = data_frame.shape[0] filter_data_frame = data_frame.copy()[['ActivePower', 'WindSpeed']] filter_data_frame['set_in'] = -2 min_active_power = filter_data_frame['ActivePower'].min() max_active_power = filter_data_frame['ActivePower'].max() min_wind_speed = filter_data_frame['WindSpeed'].min() max_wind_speed = filter_data_frame['WindSpeed'].max() global_max_p = max_active_power global_min_p = min_active_power """ Subract all active power by min_active_power Subract all wind speed by min_wind_speed Divide all active power by max_active_power - min_active_power Divide all wind speed by max_wind_speed - min_wind_speed """ filter_data_frame['ActivePowerScaled'] = ( (filter_data_frame['ActivePower'] - min_active_power) * 15) / (max_active_power - min_active_power) filter_data_frame['WindSpeedScaled'] = ( (filter_data_frame['WindSpeed'] - min_wind_speed) * 20) / (max_wind_speed - min_wind_speed) scan = DBSCAN(eps=0.28, min_samples=15).fit_predict( filter_data_frame[['ActivePowerScaled', 'WindSpeedScaled']]) filter_data_frame['set_in'] = scan import random r = lambda: random.randint(0, 255) import matplotlib.pyplot as plt plt.figure(figsize=(10, 10)) num_of_groups = 0 static_compare = -1 static_max = -2 for group in filter_data_frame.groupby('set_in'): num_of_groups = num_of_groups + 1 if len(group[1]) > static_compare: static_compare = len(group[1]) static_max = group[0] plt.scatter(group[1]['WindSpeed'], group[1]['ActivePower'], s=np.pi * 2 * 2, c='#c0c0c0') loop_list = list(range(9) - np.ones(9)) del loop_list[loop_list.index(static_max)] temp_frame = pd.concat([ filter_data_frame[filter_data_frame['set_in'] == i] for i in loop_list ]) data_frame = temp_frame[(temp_frame['ActivePower'] < (global_max_p) * 0.8) & (temp_frame['ActivePower'] > (global_max_p) * 0.2)] num_rows = data_frame.shape[0] filter_data_frame = data_frame.copy()[['ActivePower', 'WindSpeed']] filter_data_frame['set_in'] = -2 min_active_power = filter_data_frame['ActivePower'].min() max_active_power = filter_data_frame['ActivePower'].max() min_wind_speed = filter_data_frame['WindSpeed'].min() max_wind_speed = filter_data_frame['WindSpeed'].max() """ Subract all active power by min_active_power Subract all wind speed by min_wind_speed Divide all active power by max_active_power - min_active_power Divide all wind speed by max_wind_speed - min_wind_speed """ filter_data_frame['ActivePowerScaled'] = ( (filter_data_frame['ActivePower'] - min_active_power) * 250) / (max_active_power - min_active_power) filter_data_frame['WindSpeedScaled'] = ( (filter_data_frame['WindSpeed'] - min_wind_speed) * 20) / (max_wind_speed - min_wind_speed) scan = DBSCAN(eps=2.5, min_samples=15).fit_predict( filter_data_frame[['ActivePowerScaled', 'WindSpeedScaled']]) filter_data_frame['set_in'] = scan import random r = lambda: random.randint(0, 255) num_of_groups = 0 static_compare = -1 static_max = -2 for group in filter_data_frame.groupby('set_in'): num_of_groups = num_of_groups + 1 if len(group[1]) > static_compare: static_compare = len(group[1]) static_max = group[0] if (group[0] == -1): continue plt.scatter( group[1]['WindSpeed'], group[1]['ActivePower'], s=np.pi * 2 * 2, c='#000000' #'#%02X%02X%02X' % (r(),r(),r()) ) plt.show()
def plot_cluster_map_multi( df_filtered_dict, # quantile_group, range_axis, data_column_i, OUTPUT_CHARTS_DIR, number_to_name_dict={}, eps=4, min_samples=10, save_plot=True): # print('start_cluster_map') import numpy as np import pandas as pd import os from sklearn.cluster import DBSCAN from sklearn import metrics from sklearn.preprocessing import StandardScaler quantile_group = df_filtered_dict['quantile_group'] range_low = df_filtered_dict['range_low'] range_high = df_filtered_dict['range_high'] df_filtered = df_filtered_dict['df_filtered'] X = df_filtered[['X', 'Y']].values # ############################################################################# # Compute DBSCAN # print(range_low) db = DBSCAN(eps=eps, min_samples=min_samples).fit(X) # print(range_high) core_samples_mask = np.zeros_like(db.labels_, dtype=bool) core_samples_mask[db.core_sample_indices_] = True labels = db.labels_ # Number of clusters in labels, ignoring noise if present. n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) n_noise_ = list(labels).count(-1) n_clusters_points_ = len(X) - n_noise_ if n_clusters_ > 0: print('Data Column Name: %s' % data_column_i) print('Quantile Group: %s' % quantile_group) print('Estimated number of clusters: %d' % n_clusters_) print('Estimated number of clusters points: %d' % n_clusters_points_) print('Estimated number of noise points: %d' % n_noise_) print('Range between : %s and %s' % (range_low, range_high)) # ############################################################################# # Plot result if save_plot: chart_name = data_column_i print('chartname {0}'.format(chart_name)) if bool(number_to_name_dict): print('dict exists') if str(data_column_i) in number_to_name_dict: print('keyexists') chart_name = number_to_name_dict[str(data_column_i)] print('chartname after : {0}'.format(chart_name)) import matplotlib.pyplot as plt # Black removed and is used for noise instead. unique_labels = set(labels) colors = [ plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels)) ] for k, col in zip(unique_labels, colors): if k == -1: # Black used for noise. col = [0, 0, 0, 1] class_member_mask = (labels == k) xy = X[class_member_mask & core_samples_mask] plt.plot(xy[:, 0], xy[:, 1], '.', color=tuple(col), markersize=3) # xy = X[class_member_mask & ~core_samples_mask] # plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col), # markeredgecolor='k', markersize=1) plt.title('Between %s and %s : %d clusters and % d points' % (round(range_low, 3), round( range_high, 3), n_clusters_, n_clusters_points_)) plt.axis(range_axis) F = plt.gcf() F.savefig(os.path.join( OUTPUT_CHARTS_DIR, '{0}_quantile_group_{1}.png'.format(chart_name, quantile_group)), dpi=(500)) plt.show() plt.clf() df_filtered['CLUSTER'] = labels df_filtered = df_filtered.loc[df_filtered['CLUSTER'] != -1] df_filtered['CLUSTER_COUNT'] = df_filtered.groupby( 'CLUSTER')['CLUSTER'].transform('count') df_filtered['PERCENTILE'] = [range_low] * len(df_filtered.index) return df_filtered
def getClusters(positions, distanceKM, min_samples=5): """ Returns the clusters from the points based on provided data to no. of clusters based on DBScan Algorithm Parameters ---------- positions : Geodataframe object Geodataframe with positions to be clustered distanceKM : Float Epsilon parameters fo dbscan algorithm in km. or, distance for clustering of points min_samples : Integer, optional DESCRIPTION. Minimum no. of points required to form cluster. If 1 is set,each individual will form their own cluster The default is 5. Returns ------- Dataframe The dataframe with cluster centres co-ordinates and no. of points on the cluster. """ def get_centermost_point(cluster): centroid = (MultiPoint(cluster).centroid.x, MultiPoint(cluster).centroid.y) centermost_point = min( cluster, key=lambda point: great_circle(point, centroid).m) return tuple(centermost_point) df = positions.to_crs({'init': 'epsg:4326'}) lon = df.geometry.x lat = df.geometry.y origin_pt = pd.DataFrame() # Populate lat lon to dataframe origin_pt['lat'] = lat origin_pt['lon'] = lon # add index to data coords = origin_pt.to_numpy() origin_pt.index = [i for i in range(len(lat))] # # Convert Data to projected and perform clustering kms_per_radian = 6371.0088 epsilon = distanceKM / kms_per_radian db = DBSCAN(eps=epsilon, min_samples=min_samples, algorithm='ball_tree', metric='haversine').fit(np.radians(coords)) cluster_labels = db.labels_ validClusters = [] for cluster in cluster_labels: if cluster != -1: validClusters.append(cluster) num_clusters = len(set(validClusters)) clusters = pd.Series( [coords[cluster_labels == n] for n in range(num_clusters)]) # Assigining clusterId to each point origin_pt['clusterId'] = cluster_labels # Identify cluster Centres centermost_points = clusters.map(get_centermost_point) # Create Geodataframe with attributes for cluster centroids clusterId = [i for i in range(len(centermost_points))] centroidLat = [ centermost_points[i][0] for i in range(len(centermost_points)) ] centroidLon = [ centermost_points[i][1] for i in range(len(centermost_points)) ] clusterSize = [ len(origin_pt[origin_pt['clusterId'] == i]) for i in range(len(centermost_points)) ] # Create dataframe for cluster centers clusterCentres_df = pd.DataFrame({ 'clusterId': clusterId, 'clusterLat': centroidLat, 'clusterLon': centroidLon, 'clusterSize': clusterSize }) clusterCentres = gpd.GeoDataFrame(clusterCentres_df, geometry=gpd.points_from_xy( clusterCentres_df.clusterLon, clusterCentres_df.clusterLat)) return clusterCentres
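# The key detail in getClusters is the eps unit conversion: scikit-learn's haversine
# metric works in radians on (lat, lon), so a threshold in kilometres is divided by the
# Earth radius. A stripped-down sketch of just that step; the coordinates and the 5 km
# threshold are made-up illustrations.
import numpy as np
from sklearn.cluster import DBSCAN

coords_deg = np.array([[27.700, 85.320],   # three nearby points
                       [27.710, 85.330],
                       [27.705, 85.325],
                       [28.600, 83.820]])  # one far-away point
kms_per_radian = 6371.0088
eps_km = 5.0
db_geo = DBSCAN(eps=eps_km / kms_per_radian, min_samples=2,
                algorithm='ball_tree', metric='haversine').fit(np.radians(coords_deg))
print(db_geo.labels_)  # the three nearby points share a label; the last one is noise (-1)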
def dbscan_dados(self, topics, epsilon=0.5, num_topics=5, n_words=15): X = self.topics_to_vectorspace(topics, num_topics=num_topics, n_words=n_words) clustering = DBSCAN(eps=epsilon).fit(X.toarray()) return clustering.labels_
""" from sklearn.cluster import DBSCAN import numpy as np import pandas as pd import matplotlib.pyplot as plt df = pd.read_csv('week4event.csv',header=None) X = df.to_numpy() X = X[:20000,[2, 6]] db = DBSCAN(eps=3, min_samples=2).fit(X) core_samples_mask = np.zeros_like(db.labels_, dtype=bool) core_samples_mask[db.core_sample_indices_] = True labels = db.labels_ n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) n_noise_ = list(labels).count(-1) print('Estimated number of clusters: %d' % n_clusters_) print('Estimated number of noise points: %d' % n_noise_) ########## unique_labels = set(labels) colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))] for k, col in zip(unique_labels, colors): if k == -1: # Black used for noise.
X_scaled = scaler.transform(X)

fig, axes = plt.subplots(1, 4, figsize=(15, 3),
                         subplot_kw={'xticks': (), 'yticks': ()})
plt.subplots_adjust(left=0.05, right=0.95)

# list the algorithms to compare
algorithms = [KMeans(n_clusters=2), AgglomerativeClustering(n_clusters=2), DBSCAN()]

# create a random cluster assignment as a reference
random_state = np.random.RandomState(seed=0)
random_clusters = random_state.randint(low=0, high=2, size=len(X))

# plot the random assignment
axes[0].scatter(X_scaled[:, 0], X_scaled[:, 1], c=random_clusters,
                cmap=mglearn.cm3, s=60)
axes[0].set_title("Random assignment - ARI: {:.2f}".format(
    adjusted_rand_score(y, random_clusters)))
def cluster_torsions_DBSCAN(file_list, cgmodel, min_samples=5, eps=0.5, frame_start=0, frame_stride=1, frame_end=-1, output_format="pdb", output_dir="cluster_output", backbone_torsion_type="bb_bb_bb_bb", core_points_only=True, filter=True, filter_ratio=0.25, plot_silhouette=True, plot_distance_hist=True): """ Given PDB or DCD trajectory files and coarse grained model as input, this function performs DBSCAN clustering on the poses in the trajectory, and returns a list of the coordinates for the medoid pose of each cluster. :param file_list: A list of PDB or DCD files to read and concatenate :type file_list: List( str ) :param cgmodel: A CGModel() class object :type cgmodel: class :param min_samples: minimum of number of samples in neighborhood of a point to be considered a core point (includes point itself) :type min_samples: int :param eps: DBSCAN parameter neighborhood distance cutoff :type eps: float :param frame_start: First frame in trajectory file to use for clustering. :type frame_start: int :param frame_stride: Advance by this many frames when reading trajectories. :type frame_stride: int :param frame_end: Last frame in trajectory file to use for clustering. :type frame_end: int :param output_format: file format extension to write medoid coordinates to (default="pdb"), dcd also supported :type output_format: str :param output_dir: directory to write clustering medoid and plot files :type output_dir: str :param backbone_torsion_type: particle sequence of the backbone torsions (default="bb_bb_bb_bb") - for now only single sequence permitted :type backbone_torsion_type: str :param core_points_only: use only core points to calculate medoid structures (default=True) :type core_points_only: boolean :param filter: option to apply neighborhood radius filtering to remove low-density data (default=True) :type filter: boolean :param filter_ratio: fraction of data points which pass through the neighborhood radius filter (default=0.05) :type filter_ratio: float :param plot_silhouette: option to create silhouette plot of clustering results (default=True) :type plot_silhouette: boolean :param plot_torsion_hist: option to plot a histogram of torsion euclidean distances (post-filtering) :type plot_torsion_hist: boolean :returns: - medoid_positions ( np.array( float * unit.angstrom ( n_clusters x num_particles x 3 ) ) ) - A 3D numpy array of poses corresponding to the medoids of all trajectory clusters. 
- medoid torsions ( np.array ( float * unit.degrees ( n_clusters x n_torsion ) - A 2D numpy array of the backbone torsion angles for each cluster medoid - cluster_sizes ( List ( int ) ) - A list of number of members in each cluster - cluster_rmsd( np.array ( float ) ) - A 1D numpy array of rmsd (in cluster distance space) of samples to cluster centers - n_noise ( int ) - number of points classified as noise - silhouette_avg - ( float ) - average silhouette score across all clusters """ if not os.path.exists(output_dir): os.mkdir(output_dir) torsion_val_array, traj_all = get_torsion_matrix(file_list, cgmodel, frame_start, frame_stride, frame_end, backbone_torsion_type) # We need to precompute the euclidean distance matrix, accounting for periodic boundaries total = 0 angle_range = np.full(torsion_val_array.shape[1], 360) powers = np.full(torsion_val_array.shape[1], 2) torsion_distances = np.zeros( (torsion_val_array.shape[0], torsion_val_array.shape[0])) for i in range(torsion_val_array.shape[0]): for j in range(torsion_val_array.shape[0]): delta = np.abs(torsion_val_array[i, :] - torsion_val_array[j, :]) delta = np.where(delta > 0.5 * angle_range, delta - angle_range, delta) torsion_distances[i, j] = np.sqrt(np.power(delta, powers).sum()) if filter: # Filter distances: torsion_distances, dense_indices, filter_ratio_actual = \ filter_distances(torsion_distances, filter_ratio=filter_ratio) traj_all = traj_all[dense_indices] if plot_distance_hist: distances_row = np.reshape( torsion_distances, (torsion_distances.shape[0] * torsion_distances.shape[1], 1)) # Remove the diagonal 0 elements: distances_row = distances_row[distances_row != 0] figure = plt.figure() n_out, bin_edges_out, patch = plt.hist(distances_row, bins=1000, density=True) plt.xlabel('rmsd') plt.ylabel('probability density') plt.savefig(f'{output_dir}/torsion_distances_hist.pdf') plt.close() # Cluster with sklearn DBSCAN dbscan = DBSCAN(min_samples=min_samples, eps=eps, metric='precomputed').fit(torsion_distances) # The produces a cluster labels from 0 to n_clusters-1, and assigns -1 to noise points # Get labels labels = dbscan.labels_ # Get core sample indices: core_sample_indices = dbscan.core_sample_indices_ # Number of clusters: n_clusters = len(set(labels)) - (1 if -1 in labels else 0) # Number of noise points: n_noise = list(labels).count(-1) # Get indices of frames in each cluster: cluster_indices = {} cluster_indices_core = {} cluster_sizes = [] cluster_sizes_core = [] for k in range(n_clusters): cluster_indices[k] = np.argwhere(labels == k)[:, 0] cluster_indices_core[k] = [] for elem in cluster_indices[k]: if elem in core_sample_indices: cluster_indices_core[k].append(elem) cluster_sizes.append(len(cluster_indices[k])) cluster_sizes_core.append(len(cluster_indices_core[k])) # Get indices of frames classified as noise: noise_indices = np.argwhere(labels == -1)[:, 0] # Find the structure closest to each center (medoid): # OPTICS/DBSCAN does not have a built-in function to transform to cluster-distance space, # as the centroids of the clusters are not physically meaningful in general. However, as # RMSD between structures is our only clustering feature, the cluster centers (regions of # high density) will likely be representative structures of each cluster. 
# Following the protocol outlined in MDTraj example: # http://mdtraj.org/1.9.3/examples/centroids.html # Create distance matrices within each cluster: torsion_distances_k = {} if core_points_only: for k in range(n_clusters): torsion_distances_k[k] = np.zeros( (cluster_sizes_core[k], cluster_sizes_core[k])) for i in range(cluster_sizes_core[k]): for j in range(cluster_sizes_core[k]): torsion_distances_k[k][i, j] = torsion_distances[ cluster_indices_core[k][i], cluster_indices_core[k][j]] # Compute medoid based on similarity scores: medoid_index = [] # Global index intra_cluster_medoid_index = [] # Index within cluster for k in range(n_clusters): intra_cluster_medoid_index.append( np.exp(-torsion_distances_k[k] / torsion_distances_k[k].std()).sum(axis=1).argmax()) # Here we need to use the global sample index to find the medoid structure: medoid_index.append( cluster_indices_core[k][intra_cluster_medoid_index[k]]) else: for k in range(n_clusters): torsion_distances_k[k] = np.zeros( (cluster_sizes[k], cluster_sizes[k])) for i in range(cluster_sizes[k]): for j in range(cluster_sizes[k]): torsion_distances_k[k][i, j] = torsion_distances[ cluster_indices[k][i], cluster_indices[k][j]] # Compute medoid based on similarity scores: medoid_index = [] # Global index intra_cluster_medoid_index = [] # Index within cluster for k in range(n_clusters): intra_cluster_medoid_index.append( np.exp(-torsion_distances_k[k] / torsion_distances_k[k].std()).sum(axis=1).argmax()) # Here we need to use the global sample index to find the medoid structure: medoid_index.append( cluster_indices[k][intra_cluster_medoid_index[k]]) medoid_xyz = np.zeros([n_clusters, traj_all.n_atoms, 3]) for k in range(n_clusters): medoid_xyz[k, :, :] = traj_all[medoid_index[k]].xyz[0] # Write medoids to file write_medoids_to_file(cgmodel, medoid_xyz, output_dir, output_format) medoid_positions = medoid_xyz * unit.nanometer # Get medoid torsions: medoid_torsions = np.zeros([n_clusters, torsion_val_array.shape[1]]) for k in range(n_clusters): medoid_torsions[k, :] = torsion_val_array[medoid_index[k], :] # Compute intra-cluster rmsd of samples to medoid based on structure rmsd cluster_rmsd = np.zeros(n_clusters) for k in range(n_clusters): cluster_rmsd[k] = np.sqrt(( (torsion_distances_k[k][intra_cluster_medoid_index[k]]**2).sum()) / len(cluster_indices[k])) # Get silhouette scores try: silhouette_sample_values = silhouette_samples(torsion_distances, labels) silhouette_avg = np.mean(silhouette_sample_values[labels != -1]) if plot_silhouette: # Plot silhouette analysis plotfile = f"{output_dir}/silhouette_dbscan_min_sample_{min_samples}_eps_{eps}.pdf" make_silhouette_plot(dbscan, silhouette_sample_values, silhouette_avg, n_clusters, cluster_rmsd, cluster_sizes, plotfile) except ValueError: print( "There are either no clusters, or no noise points identified. Try adjusting DBSCAN min_samples, eps parameters." ) silhouette_avg = None return medoid_positions, medoid_torsions, cluster_sizes, cluster_rmsd, n_noise, silhouette_avg
knn.fit(X_train, y_train) y_pred_class = knn.predict(X_test) from sklearn import metrics print metrics.accuracy_score(y_test, y_pred_class) # KNN accuracy on scaled data knn.fit(X_train_scaled, y_train) y_pred_class = knn.predict(X_test_scaled) print metrics.accuracy_score(y_test, y_pred_class) ''' DB Scan Clustering ''' # DBSCAN with eps=1 and min_samples=3 from sklearn.cluster import DBSCAN db = DBSCAN(eps=1, min_samples=3) db.fit(X_scaled) # review the cluster labels db.labels_ # save the cluster labels and sort by cluster NHL['cluster'] = db.labels_ NHL.sort('cluster') # review the cluster centers NHL.groupby('cluster').mean() # scatter plot matrix of DBSCAN cluster assignments (0=red, 1=green, 2=blue, -1=yellow) pd.scatter_matrix(X, c=colors[NHL.cluster], figsize=(10,10), s=100)
def show_dbscan():
    centers = [[1, 1], [-1, -1], [1, -1]]
    # create test samples
    X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4,
                                random_state=0)
    X = StandardScaler().fit_transform(X)

    # main DBSCAN parameters:
    # eps: maximum distance between two samples in the same neighbourhood
    # min_samples: minimum number of samples in a neighbourhood for a core point
    # algorithm: one of 'auto', 'ball_tree', 'kd_tree', 'brute'
    # leaf_size: leaf size used by the BallTree or cKDTree
    # n_jobs: number of parallel jobs
    db = DBSCAN(eps=0.3, min_samples=10).fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    print('Estimated number of clusters: %d' % n_clusters_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f"
          % metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f"
          % metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, labels))

    # Black removed and is used for noise instead.
    unique_labels = set(labels)
    colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
    for k, col in zip(unique_labels, colors):
        if k == -1:
            # Black used for noise.
            col = 'k'

        class_member_mask = (labels == k)

        xy = X[class_member_mask & core_samples_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=14)

        xy = X[class_member_mask & ~core_samples_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=6)

    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.show()
def do_dbs(ft): return DBSCAN(eps=0.3, min_samples=10).fit(ft).labels_
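# Usage sketch for do_dbs: the fixed eps=0.3 / min_samples=10 above roughly suit
# standardised blob data, as in the scikit-learn DBSCAN demo; the data below is an
# illustrative assumption.
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

blob_X, _ = make_blobs(n_samples=600, centers=3, cluster_std=0.4, random_state=0)
blob_labels = do_dbs(StandardScaler().fit_transform(blob_X))
print("clusters:", len(set(blob_labels)) - (1 if -1 in blob_labels else 0))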
print("Lower Bound:", lower_bound) upper_bound = Q3 + 1.5 * IQR print("Upper Bound:", upper_bound) df_clean = df[(df['V13'] > lower_bound) & (df['V13'] < upper_bound)] sns.boxplot(y=df_clean['V13']) plt.show() sns.scatterplot(df['V13'], df['V14']) from sklearn.cluster import DBSCAN X_train = df[['V13', 'V14']] model = DBSCAN() model.fit(X_train) cluster_labels = model.labels_ plt.scatter(df["V13"], df["V14"], c=cluster_labels) plt.show() df['labels'] = cluster_labels df_cluster_clean = df[df['labels'] != -1] plt.scatter(df_cluster_clean["V13"], df_cluster_clean["V14"], c='r') plt.xlabel('V13') plt.ylabel('V14') plt.show()
def main_worker(args): global start_epoch, best_mAP cudnn.benchmark = True sys.stdout = Logger(osp.join(args.logs_dir, 'log.txt')) print("==========\nArgs:{}\n==========".format(args)) # Create data loaders iters = args.iters if (args.iters > 0) else None ncs = [int(x) for x in args.ncs.split(',')] # ncs_dbscan=ncs.copy() dataset_target, label_dict = get_data(args.dataset_target, args.data_dir, len(ncs)) test_loader_target = get_test_loader(dataset_target, args.height, args.width, args.batch_size, args.workers) tar_cluster_loader = get_test_loader(dataset_target, args.height, args.width, args.batch_size, args.workers, testset=dataset_target.train) dataset_source, _ = get_data(args.dataset_source, args.data_dir, len(ncs)) sour_cluster_loader = get_test_loader(dataset_source, args.height, args.width, args.batch_size, args.workers, testset=dataset_source.train) train_loader_source = get_train_loader(dataset_source, args.height, args.width, 0, args.batch_size, args.workers, args.num_instances, args.iters, dataset_source.train) source_classes = dataset_source.num_train_pids distribution, _ = write_sta_im(dataset_source.train) fc_len = 3500 model_1, _, model_1_ema, _ = create_model( args, [fc_len for _ in range(len(ncs))]) # print(model_1) epoch = 0 target_features_dict, _ = extract_features(model_1_ema, tar_cluster_loader, print_freq=100) target_features = F.normalize(torch.stack( list(target_features_dict.values())), dim=1) # Calculate distance print('==> Create pseudo labels for unlabeled target domain') rerank_dist = compute_jaccard_distance(target_features, k1=args.k1, k2=args.k2) del target_features if (epoch == 0): # DBSCAN cluster eps = 0.6 # 0.6 print('Clustering criterion: eps: {:.3f}'.format(eps)) cluster = DBSCAN(eps=eps, min_samples=4, metric='precomputed', n_jobs=-1) # select & cluster images as training set of this epochs pseudo_labels = cluster.fit_predict(rerank_dist) # num_ids = len(set(pseudo_labels)) - (1 if -1 in pseudo_labels else 0) plabel = [] new_dataset = [] for i, (item, label) in enumerate(zip(dataset_target.train, pseudo_labels)): if label == -1: continue plabel.append(label) new_dataset.append((item[0], label, item[-1])) target_label = [plabel] ncs = [len(set(plabel)) + 1] print('new class are {}, length of new dataset is {}'.format( ncs, len(new_dataset))) model_1.module.classifier0_3500 = nn.Linear(2048, ncs[0] + source_classes, bias=False).cuda() model_1_ema.module.classifier0_3500 = nn.Linear(2048, ncs[0] + source_classes, bias=False).cuda() model_1.module.classifier3_0_3500 = nn.Linear(1024, ncs[0] + source_classes, bias=False).cuda() model_1_ema.module.classifier3_0_3500 = nn.Linear(1024, ncs[0] + source_classes, bias=False).cuda() print(model_1.module.classifier0_3500) # if epoch !=0: # model_1.module.classifier0_3500.weight.data.copy_(torch.from_numpy(normalize(target_centers,axis=1)).float().cuda()) # model_1_ema.module.classifier0_3500.weight.data.copy_(torch.from_numpy(normalize(target_centers,axis=1)).float().cuda()) # Initialize source-domain class centroids print("==> Initialize source-domain class centroids in the hybrid memory") source_features, _ = extract_features(model_1, sour_cluster_loader, print_freq=50) sour_fea_dict = collections.defaultdict(list) print("==> Ending source-domain class centroids in the hybrid memory") for f, pid, _ in sorted(dataset_source.train): sour_fea_dict[pid].append(source_features[f].unsqueeze(0)) source_centers = [ torch.cat(sour_fea_dict[pid], 0).mean(0) for pid in sorted(sour_fea_dict.keys()) ] source_centers = 
torch.stack(source_centers, 0) source_centers = F.normalize(source_centers, dim=1) del sour_fea_dict, source_features, sour_cluster_loader # Evaluator evaluator_1 = Evaluator(model_1) evaluator_1_ema = Evaluator(model_1_ema) clusters = [args.num_clusters] * args.epochs # TODO: dropout clusters k_memory = 8192 contrast = onlinememory(2048, len(new_dataset), sour_numclass=source_classes, K=k_memory + source_classes, index2label=target_label, choice_c=args.choice_c, T=0.07, use_softmax=True).cuda() contrast.index_memory = torch.cat( (torch.arange(source_classes), -1 * torch.ones(k_memory).long()), dim=0).cuda() contrast.memory = torch.cat((source_centers, torch.rand(k_memory, 2048)), dim=0).cuda() tar_selflabel_loader = get_test_loader(dataset_target, args.height, args.width, args.batch_size, args.workers, testset=new_dataset) o = Optimizer(target_label, dis_gt=distribution, m=model_1, ncl=ncs, t_loader=tar_selflabel_loader, N=len(new_dataset), fc_len=fc_len) uncertainty = collections.defaultdict(list) print("Training begining~~~~~~!!!!!!!!!") for epoch in range(len(clusters)): iters_ = 300 if epoch % 1 == 0 else iters if epoch % 6 == 0 and epoch != 0: target_features_dict, _ = extract_features(model_1_ema, tar_cluster_loader, print_freq=50) target_features = torch.stack(list(target_features_dict.values())) target_features = F.normalize(target_features, dim=1) print('==> Create pseudo labels for unlabeled target domain with') rerank_dist = compute_jaccard_distance(target_features, k1=args.k1, k2=args.k2) # select & cluster images as training set of this epochs pseudo_labels = cluster.fit_predict(rerank_dist) plabel = [] new_dataset = [] for i, (item, label) in enumerate( zip(dataset_target.train, pseudo_labels)): if label == -1: continue plabel.append(label) new_dataset.append((item[0], label, item[-1])) target_label = [plabel] ncs = [len(set(plabel)) + 1] tar_selflabel_loader = get_test_loader(dataset_target, args.height, args.width, args.batch_size, args.workers, testset=new_dataset) o = Optimizer(target_label, dis_gt=distribution, m=model_1, ncl=ncs, t_loader=tar_selflabel_loader, N=len(new_dataset), fc_len=fc_len) contrast.index_memory = torch.cat( (torch.arange(source_classes), -1 * torch.ones(k_memory).long()), dim=0).cuda() model_1.module.classifier0_3500 = nn.Linear(2048, ncs[0] + source_classes, bias=False).cuda() model_1_ema.module.classifier0_3500 = nn.Linear(2048, ncs[0] + source_classes, bias=False).cuda() model_1.module.classifier3_0_3500 = nn.Linear(1024, ncs[0] + source_classes, bias=False).cuda() model_1_ema.module.classifier3_0_3500 = nn.Linear( 1024, ncs[0] + source_classes, bias=False).cuda() print(model_1.module.classifier0_3500) # if epoch !=0: # model_1.module.classifier0_3500.weight.data.copy_(torch.from_numpy(normalize(target_centers,axis=1)).float().cuda()) # model_1_ema.module.classifier0_3500.weight.data.copy_(torch.from_numpy(normalize(target_centers,axis=1)).float().cuda()) target_label_o = o.L target_label = [ list(np.asarray(target_label_o[0].data.cpu()) + source_classes) ] contrast.index2label = [[i for i in range(source_classes)] + target_label[0]] # change pseudo labels for i in range(len(new_dataset)): new_dataset[i] = list(new_dataset[i]) for j in range(len(ncs)): new_dataset[i][j + 1] = int(target_label[j][i]) new_dataset[i] = tuple(new_dataset[i]) cc = args.choice_c #(args.choice_c+1)%len(ncs) train_loader_target = get_train_loader(dataset_target, args.height, args.width, cc, args.batch_size, args.workers, args.num_instances, iters_, new_dataset) # 
Optimizer params = [] flag = 1.0 # if 20<epoch<=40 or 60<epoch<=80 or 120<epoch: # flag=0.1 # else: # flag=1.0 for key, value in model_1.named_parameters(): if not value.requires_grad: print(key) continue params += [{ "params": [value], "lr": args.lr * flag, "weight_decay": args.weight_decay }] optimizer = torch.optim.Adam(params) # Trainer trainer = DbscanBaseTrainer(model_1, model_1_ema, contrast, num_cluster=ncs, alpha=args.alpha, fc_len=fc_len) train_loader_target.new_epoch() train_loader_source.new_epoch() trainer.train(epoch, train_loader_target, train_loader_source, optimizer, args.choice_c, print_freq=args.print_freq, train_iters=iters_) def save_model(model_ema, is_best, best_mAP, mid): save_checkpoint( { 'state_dict': model_ema.state_dict(), 'epoch': epoch + 1, 'best_mAP': best_mAP, }, is_best, fpath=osp.join(args.logs_dir, 'model' + str(mid) + '_checkpoint.pth.tar')) if epoch == 20: args.eval_step = 2 elif epoch == 40: args.eval_step = 1 if ((epoch + 1) % args.eval_step == 0 or (epoch == args.epochs - 1)): mAP_1 = 0 #evaluator_1.evaluate(test_loader_target, dataset_target.query, dataset_target.gallery, # cmc_flag=False) mAP_2 = evaluator_1_ema.evaluate(test_loader_target, dataset_target.query, dataset_target.gallery, cmc_flag=False) is_best = (mAP_1 > best_mAP) or (mAP_2 > best_mAP) best_mAP = max(mAP_1, mAP_2, best_mAP) save_model(model_1, (is_best), best_mAP, 1) save_model(model_1_ema, (is_best and (mAP_1 <= mAP_2)), best_mAP, 2) print( '\n * Finished epoch {:3d} model no.1 mAP: {:5.1%} model no.2 mAP: {:5.1%} best: {:5.1%}{}\n' .format(epoch, mAP_1, mAP_2, best_mAP, ' *' if is_best else ''))
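The epoch-zero block above boils down to one step: run DBSCAN over a precomputed Jaccard distance matrix and keep only the non-noise samples as pseudo-labelled training data. A minimal sketch of just that step, assuming the distance matrix comes from compute_jaccard_distance as above; pseudo_label and train_items are illustrative names, not part of the original code.

# Sketch of the pseudo-labeling step, assuming `rerank_dist` is an (n x n)
# precomputed Jaccard distance matrix and `train_items` mirrors
# dataset_target.train as (path, pid, camid) tuples.
import numpy as np
from sklearn.cluster import DBSCAN

def pseudo_label(rerank_dist, train_items, eps=0.6, min_samples=4):
    cluster = DBSCAN(eps=eps, min_samples=min_samples,
                     metric='precomputed', n_jobs=-1)
    labels = cluster.fit_predict(rerank_dist)
    # keep only clustered samples; -1 marks noise and is dropped
    new_dataset = [(item[0], int(lab), item[-1])
                   for item, lab in zip(train_items, labels) if lab != -1]
    num_ids = len(set(labels)) - (1 if -1 in labels else 0)
    return new_dataset, num_ids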
import matplotlib.pyplot as plt
import numpy
from matplotlib.colors import ListedColormap
from sklearn.cluster import KMeans, DBSCAN

# Load the 2-D points and their ground-truth class from dataset3.txt.
f = open("dataset3.txt", 'r', encoding='utf-8')
data = []
labels_true = []
for line in f.readlines():
    item = line.split(',')
    x = float(item[0])
    y = float(item[1])
    labels_true.append(int(item[2]) - 1)
    data.append([x, y])
f.close()

X = numpy.array(data)

# KMeans clustering
k = KMeans(n_clusters=5).fit(X)
label = k.labels_
print(label)
print(labels_true)

colors = ListedColormap(
    ['#FF0000', '#00FF00', '#0000FF', '#000000', '#ffcb00'])
plt.scatter(X[:, 0], X[:, 1], c=label, cmap=colors)
plt.show()

# Classes that come with the data
plt.scatter(X[:, 0], X[:, 1], c=labels_true, cmap=colors)
plt.show()

# DBSCAN algorithm
y_pred = DBSCAN(eps=0.1, min_samples=15).fit_predict(X)
plt.scatter(X[:, 0], X[:, 1], c=y_pred, cmap=colors)
plt.show()
from sklearn.cluster import DBSCAN
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

df_125 = pd.read_csv(
    r'C:\Users\Max\Desktop\Hackathon_Data\Task 2 - Leadec - StarterKit\IU\Time-series\1_rms_125_2_test.csv'
)
df_125['date'] = pd.to_datetime(df_125['timestamp'])
del df_125['timestamp']

clustering1 = DBSCAN(eps=0.05, min_samples=3).fit(
    np.array(df_125['max_audio']).reshape(-1, 1))
labels = clustering1.labels_
outlier_pos = np.where(labels == -1)[0]

n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print(labels)
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

x = []
y = []
for pos in outlier_pos:
    x.append(np.array(df_125['max_audio'])[pos])
    y.append(df_125['max_audio'].index[pos])
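The eps=0.05 above is hand-picked. A common sanity check, sketched here on the same max_audio column, is a k-distance plot: sort each point's distance to its k-th neighbour and look for the elbow. This plot is an addition, not part of the original script; k is chosen to match min_samples.

# Optional k-distance plot to sanity-check the hand-picked eps.
from sklearn.neighbors import NearestNeighbors

values = np.array(df_125['max_audio']).reshape(-1, 1)
k = 3  # match min_samples above
nn = NearestNeighbors(n_neighbors=k).fit(values)
distances, _ = nn.kneighbors(values)
plt.plot(np.sort(distances[:, -1]))
plt.ylabel('distance to %d-th neighbour' % k)
plt.title('k-distance plot for choosing eps')
plt.show()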
def runClusterer(clusterer_name, params, data, param_scale='', metricstring=''):
    # print('S2 runClusterer>>>')
    from time import time

    # ---------------------------------- s1: load the data
    # If data[0] is a string, it points at the training data and label locations,
    # so load the pictures from disk.
    if isinstance(data[0], str):
        X, y, size = loadPictureData(data[0], data[1], data[2])
        SX = X
    # Otherwise the data is already a usable vector; store its components directly.
    else:
        X, SX, y, size = data
    # print('S2 data load done')

    # ---------------------------------- s2: parameter scaling
    # params: (5, 10,)  param_scale: (1, 100,)
    # true params: (5, 0.1,)
    # Suggested for meanshift / dbscan: divide eps by 10.
    if param_scale != '':
        params = list(params)
        for i in range(0, len(params)):
            params[i] /= param_scale[i]

    # s2: choose the clusterer
    # kmeans: k must be specified
    if clusterer_name == 'kmeans':
        from sklearn.cluster import KMeans
        clusterer = KMeans(init='k-means++', n_clusters=int(params[0]), n_init=10)
        ms = 'sc'
    elif clusterer_name == 'dbscan':
        from sklearn.cluster import DBSCAN
        # 0.5, 10 -- note!! eps has been scaled down by one scale!!
        clusterer = DBSCAN(eps=params[0], min_samples=params[1])
        ms = 'sc'
    # birch: k must be specified
    elif clusterer_name == 'birch':
        # None, 0.5, 50
        from sklearn.cluster import Birch
        clusterer = Birch(n_clusters=params[0], threshold=params[1],
                          branching_factor=params[2])
        ms = 'sc'
    # optics
    elif clusterer_name == 'optics':
        from sklearn.cluster import OPTICS
        clusterer = OPTICS(min_samples=int(params[0]))  # , xi=params[1], min_cluster_size=params[2])
        # OPTICS(min_samples = 10, xi = 0.05, min_cluster_size = 0.05)
        ms = 'sc'
    # spectral: k must be specified
    elif clusterer_name == 'spectral':
        pass
        # clusterer = SpectralClustering(n_clusters=params[0], assign_labels=params[1], random_state=params[2])
    elif clusterer_name == 'hierarch':
        from sklearn.cluster import AgglomerativeClustering
        # clusterer = AgglomerativeClustering(n_clusters=params[0], affinity=params[1], linkage=params[2])  # 'canberra', linkage='complete')
        clusterer = AgglomerativeClustering(n_clusters=int(params[0]),
                                            affinity='euclidean', memory=None,
                                            connectivity=None,
                                            compute_full_tree='auto',
                                            linkage='average')  # , distance_threshold=None)
        ms = 'sc'
    elif clusterer_name == 'meanshift':
        from sklearn.cluster import MeanShift, estimate_bandwidth
        # 0.2, 500
        bandwidth = estimate_bandwidth(X, quantile=params[0], n_samples=params[1])
        clusterer = MeanShift(bandwidth=bandwidth, bin_seeding=True)
        ms = 'sc'
    else:
        print('no cluster name specified')
        import sys
        sys.exit(0)

    if metricstring == '':
        metricstring = ms

    # s3: run the clustering
    t0 = time()
    clusterer.fit(X)
    t1 = time()

    # clusterer, its construction info, and the metric string
    infoDict = {'clusterer': clusterer, 'clusterer_name': clusterer_name,
                'params': params, 'metricstring': metricstring}
    # dictionary holding all the data
    dataDict = {'X': X, 'SX': SX, 'y': y, 'size': size}
    # dictionary holding performance: runtime and number of clusters for now
    performanceDict = {'time': t1 - t0, 'clusters_num': max(clusterer.labels_) + 1}
    clusterer_container = {'info': infoDict, 'data': dataDict,
                           'performance': performanceDict}
    # print('S4 done.<<<')
    return clusterer_container
def clustering(feature, eps, minPoints):
    dbscan = DBSCAN(eps=eps, min_samples=minPoints)
    dbscan.fit(feature)
    pred = dbscan.labels_
    return pred
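A hypothetical usage of this wrapper, with standardization added as an assumption (so a single eps applies to all feature dimensions) and DBSCAN imported as in the other snippets.

# Hypothetical usage of the wrapper above; the random features are placeholders.
import numpy as np
from sklearn.preprocessing import StandardScaler

features = np.random.rand(200, 4)
scaled = StandardScaler().fit_transform(features)
labels = clustering(scaled, eps=0.5, minPoints=5)
print('clusters:', len(set(labels)) - (1 if -1 in labels else 0))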
def touchdowns(image, n):
    """
    Obtain the locations of the touchdown passes from the image of the pass
    chart using k-means, with DBSCAN to account for difficulties in extracting
    touchdown passes, since they are the same color as both the line of
    scrimmage and the attached touchdown trajectory lines.
    Input:
        image: image from the folder 'Cleaned_Pass_Charts'
        n: number of touchdowns, from the corresponding data of the image
    Return:
        call to map_pass_locations:
            centers: list of pass locations in pixels
            col: width of image from which the pass locations were extracted
            pass_type: "TOUCHDOWN"
    """
    im = Image.open(image)
    pix = im.load()
    col, row = im.size
    img = Image.new('RGB', (col, row), 'black')
    p = img.load()
    for i in range(col):
        for j in range(row):
            r = pix[i, j][0]
            g = pix[i, j][1]
            b = pix[i, j][2]
            # Blank out the line-of-scrimmage band, then recolor touchdown
            # pixels (close to the reference blue) to yellow.
            if (col < 1370) and (j < row - 105) and (j > row - 111):
                if (b > 2 * g) and (b > 60):
                    p[i, j] = (0, 0, 0)
            elif (col > 1370) and (j < row - 81) and (j > row - 86):
                if (b > 2 * g) and (b > 60):
                    p[i, j] = (0, 0, 0)
            else:
                p[i, j] = pix[i, j]
                r = p[i, j][0]
                g = p[i, j][1]
                b = p[i, j][2]
                f = ((r - 20)**2 + (g - 80)**2 + (b - 200)**2)**0.5
                if f < 32 and b > 100:
                    p[i, j] = (255, 255, 0)
    # Save via PIL directly (scipy.misc.imsave has been removed from SciPy).
    img.save('temp.jpg')
    imag = cv2.imread('temp.jpg')
    os.remove('temp.jpg')
    hsv = cv2.cvtColor(imag, cv2.COLOR_BGR2HSV)
    lower = np.array([20, 100, 100])
    upper = np.array([30, 255, 255])
    mask = cv2.inRange(hsv, lower, upper)
    res = cv2.bitwise_and(imag, imag, mask=mask)
    res = cv2.cvtColor(res, cv2.COLOR_HSV2RGB)
    res = cv2.cvtColor(res, cv2.COLOR_BGR2GRAY)
    res = cv2.fastNlMeansDenoising(res, h=10)
    x = np.where(res != 0)[0]
    y = np.where(res != 0)[1]
    # Materialize the iterators so the code also runs under Python 3.
    pairs = list(zip(x, y))
    X = [list(pair) for pair in pairs]
    if len(pairs) != 0:
        db = DBSCAN(eps=10, min_samples=n).fit(X)
        labels = db.labels_
        coords = pd.DataFrame([x, y, labels]).T
        coords.columns = ['x', 'y', 'label']
        clusters = Counter(labels).most_common(n)
        td_labels = np.array([clust[0] for clust in clusters])
        km_coords = coords.loc[coords['label'].isin(td_labels)]
        km = [list(t) for t in zip(km_coords.iloc[:, 0], km_coords.iloc[:, 1])]
        kmeans = KMeans(n_clusters=n, random_state=0).fit(km)
        centers = kmeans.cluster_centers_
        return map_pass_locations(centers, col, "TOUCHDOWN")
    else:
        return map_pass_locations([], col, "TOUCHDOWN", n)
def cps(img, points, lines): """ Chessboard position search in the given image. :param img: Image to search. :param points: Points obtained in laps. :param lines: Lines detected by slid. :return: The four inner points of the detected chessboard. """ ptp_cache = {} def ptp_distance(a, b): """ Distance from point to point with a cache to avoid multiple calculations. """ idx = hash("__dis" + str(a) + str(b)) if idx in ptp_cache: return ptp_cache[idx] ptp_cache[idx] = math.sqrt((a[0] - b[0])**2 + (a[1] - b[1])**2) return ptp_cache[idx] points = __check_correctness(__normalize(points), img.shape) # Clustering __points = {} points = __sort_points(points) __max = 0 __points_max = [] alfa = math.sqrt(cv2.contourArea(np.array(points)) / 49) X = DBSCAN(eps=alfa * 4).fit(points) for i in range(len(points)): __points[i] = [] for i in range(len(points)): if X.labels_[i] != -1: __points[X.labels_[i]].append(points[i]) for i in range(len(points)): if len(__points[i]) > __max: __max = len(__points[i]) __points_max = __points[i] if len(__points) > 0 and len(points) > 49 / 2: points = __points_max n = len(points) beta = n * (5 / 100) # beta = n * (100 - (CPS efectiveness)) alfa = math.sqrt(cv2.contourArea(np.array(points)) / 49) # We are looking for the focal point of the cluster x = [p[0] for p in points] y = [p[1] for p in points] centroid = (sum(x) / len(points), sum(y) / len(points)) def __v(l): y_0, x_0 = l[0][0], l[0][1] y_1, x_1 = l[1][0], l[1][1] x_2 = 0 t = (x_0 - x_2) / (x_0 - x_1 + 0.0001) a = [int((1 - t) * x_0 + t * x_1), int((1 - t) * y_0 + t * y_1)][::-1] x_2 = img.shape[0] t = (x_0 - x_2) / (x_0 - x_1 + 0.0001) b = [int((1 - t) * x_0 + t * x_1), int((1 - t) * y_0 + t * y_1)][::-1] poly1 = __sort_points([[0, 0], [0, img.shape[0]], a, b]) s1 = __polyscore(np.array(poly1), points, centroid, alfa / 2, beta) poly2 = __sort_points( [a, b, [img.shape[1], 0], [img.shape[1], img.shape[0]]]) s2 = __polyscore(np.array(poly2), points, centroid, alfa / 2, beta) return [a, b], s1, s2 def __h(l): x_0, y_0 = l[0][0], l[0][1] x_1, y_1 = l[1][0], l[1][1] x_2 = 0 t = (x_0 - x_2) / (x_0 - x_1 + 0.0001) a = [int((1 - t) * x_0 + t * x_1), int((1 - t) * y_0 + t * y_1)] x_2 = img.shape[1] t = (x_0 - x_2) / (x_0 - x_1 + 0.0001) b = [int((1 - t) * x_0 + t * x_1), int((1 - t) * y_0 + t * y_1)] poly1 = __sort_points([[0, 0], [img.shape[1], 0], a, b]) s1 = __polyscore(np.array(poly1), points, centroid, alfa / 2, beta) poly2 = __sort_points( [a, b, [0, img.shape[0]], [img.shape[1], img.shape[0]]]) s2 = __polyscore(np.array(poly2), points, centroid, alfa / 2, beta) return [a, b], s1, s2 pregroup = [[], []] # Division into 2 groups (for the frame) for l in lines: # We will review all of the lines # We reject lines that pass through the center of the cluster if __ptl_distance(l, centroid, ptp_distance(*l)) > alfa * 2.5: for p in points: # We check that the line passes near a good point if __ptl_distance(l, p, ptp_distance(*l)) < alfa: # The line belongs to the ring tx, ty = l[0][0] - l[1][0], l[0][1] - l[1][1] if abs(tx) < abs(ty): ll, s1, s2 = __v(l) orientation = 0 else: ll, s1, s2 = __h(l) orientation = 1 if s1 == 0 and s2 == 0: continue pregroup[orientation].append(ll) pregroup[0] = __remove_duplicates(pregroup[0]) pregroup[1] = __remove_duplicates(pregroup[1]) if debug.DEBUG: # We create an outer ring def convex_approx(points, alfa=0.01): points = np.array(points) hull = ConvexHull(points).vertices cnt = points[hull] approx = cv2.approxPolyDP(cnt, alfa * cv2.arcLength(cnt, True), True) return 
__normalize(itertools.chain(*approx)) ring = convex_approx(__sort_points(points)) debug.DebugImage(img) \ .lines(lines, color=(0, 0, 255)) \ .points(points, color=(0, 0, 255)) \ .points(ring, color=(0, 255, 0)) \ .points([centroid], color=(255, 0, 0)) \ .save("cps_debug") debug.DebugImage(img) \ .lines(pregroup[0], color=(0, 0, 255)) \ .lines(pregroup[1], color=(255, 0, 0)) \ .save("cps_pregroups") score = {} # Frame ranking with the result for v in itertools.combinations(pregroup[0], 2): # Horizontal for h in itertools.combinations(pregroup[1], 2): # Vertical poly = [ __intersection(v[0], v[1]), __intersection(v[0], h[0]), __intersection(v[0], h[1]), __intersection(v[1], h[0]), __intersection(v[1], h[1]), __intersection(h[0], h[1]) ] poly = __check_correctness(poly, img.shape) if len(poly) != 4: continue poly = np.array(__sort_points(__normalize(poly))) if not cv2.isContourConvex(poly): continue score[-__polyscore(poly, points, centroid, alfa / 2, beta)] = poly score = collections.OrderedDict(sorted(score.items())) K = next(iter(score)) inner_points = __normalize(score[K]) inner_points = __order_points(inner_points) debug.DebugImage(img) \ .points(points, color=(0, 255, 0)) \ .points(inner_points, color=(0, 0, 255)) \ .points([centroid], color=(255, 0, 0)) \ .lines([[inner_points[0], inner_points[1]], [inner_points[1], inner_points[2]], [inner_points[2], inner_points[3]], [inner_points[3], inner_points[0]]], color=(255, 255, 255)) \ .save("cps_debug_2") return __padcrop(img, inner_points)
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn import datasets

x, y = datasets.make_moons(n_samples=1500, noise=0.09)
plt.scatter(x[:, 0], x[:, 1], s=5)

cores = np.array(['red', 'blue'])

# KMeans
kmeans = KMeans(n_clusters=2)
previsoes = kmeans.fit_predict(x)
plt.scatter(x[:, 0], x[:, 1], color=cores[previsoes])

# Hierarchical (agglomerative)
hc = AgglomerativeClustering(n_clusters=2, affinity='euclidean', linkage='ward')
previsoes = hc.fit_predict(x)
plt.scatter(x[:, 0], x[:, 1], color=cores[previsoes])

# DBSCAN
dbscan = DBSCAN(eps=0.1)
previsoes = dbscan.fit_predict(x)
plt.scatter(x[:, 0], x[:, 1], color=cores[previsoes])
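With only two colours in cores, DBSCAN's noise label -1 silently wraps onto the last colour. A small sketch, not part of the original comparison, that reserves a separate colour for noise; extra clusters simply cycle through the two cluster colours.

# Give DBSCAN noise (-1) its own colour instead of reusing a cluster colour.
previsoes = DBSCAN(eps=0.1).fit_predict(x)
palette = np.array(['red', 'blue', 'gray'])            # gray reserved for noise
cores_dbscan = np.where(previsoes == -1, 2, previsoes % 2)
plt.scatter(x[:, 0], x[:, 1], s=5, color=palette[cores_dbscan])
plt.show()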
def plot_cluster_map(X, quantile_group, range_low, range_high, range_axis, data_column_i, OUTPUT_CHARTS_DIR, number_to_name_dict={}, eps=4, min_samples=10, save_plot=True): import numpy as np import pandas as pd from sklearn.cluster import DBSCAN import os from sklearn import metrics from sklearn.preprocessing import StandardScaler # ############################################################################# # Compute DBSCAN db = DBSCAN(eps=eps, min_samples=min_samples).fit(X) core_samples_mask = np.zeros_like(db.labels_, dtype=bool) core_samples_mask[db.core_sample_indices_] = True labels = db.labels_ # Number of clusters in labels, ignoring noise if present. n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) n_noise_ = list(labels).count(-1) n_clusters_points_ = len(X) - n_noise_ if n_clusters_ > 0: print('Data Column Name: %s' % data_column_i) print('Quantile Group: %s' % quantile_group) print('Estimated number of clusters: %d' % n_clusters_) print('Estimated number of clusters points: %d' % n_clusters_points_) print('Estimated number of noise points: %d' % n_noise_) print('Range between : %s and %s' % (range_low, range_high)) # ############################################################################# # Plot result if save_plot: import matplotlib.pyplot as plt # Black removed and is used for noise instead. unique_labels = set(labels) colors = [ plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels)) ] for k, col in zip(unique_labels, colors): if k == -1: # Black used for noise. col = [0, 0, 0, 1] class_member_mask = (labels == k) xy = X[class_member_mask & core_samples_mask] plt.plot(xy[:, 0], xy[:, 1], '.', color=tuple(col), markersize=3) # xy = X[class_member_mask & ~core_samples_mask] # plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col), # markeredgecolor='k', markersize=1) plt.title('Between %s and %s : %d clusters and % d points' % (round(range_low, 3), round( range_high, 3), n_clusters_, n_clusters_points_)) plt.axis(range_axis) F = plt.gcf() chart_name = data_column_i if bool(number_to_name_dict): if str(data_column_i) in number_to_name_dict: chart_name = number_to_name_dict[str(data_column_i)] F.savefig(os.path.join( OUTPUT_CHARTS_DIR, '{0}_quantile_group_{1}.png'.format(chart_name, quantile_group)), dpi=(500)) # plt.show() plt.clf() return (labels)
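A hypothetical call to plot_cluster_map, with made-up data and ranges, just to show the expected argument shapes; save_plot=False skips the chart output.

# Hypothetical usage of plot_cluster_map; inputs are placeholders.
import numpy as np

X_demo = np.random.rand(500, 2) * 10
labels = plot_cluster_map(X_demo, quantile_group=1, range_low=0.0,
                          range_high=10.0, range_axis=[0, 10, 0, 10],
                          data_column_i='demo_column', OUTPUT_CHARTS_DIR='.',
                          eps=0.5, min_samples=10, save_plot=False)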
def ji_cluster_dbscan(df_trades, cccy, cdate, alltime, allorderIdNum, wstart=3, wend=8, max_dist=0.25, showplot=1, showtext=0): """Clustering by time and orderIdNum(converted to AscII). Converting to AcsII will make to clusters that have similar IDs on the left-side. - Density-Based Spatial Clustering and Application with Noise (DBSCAN) is used since it is appropriate for finding dense trades clusters. Clan is defined around the center based on distance btwn samples. So it will cluster samples that are close each other. - It seems to work well with small and large sample sizes, thus more robust. """ res = matlab_like() # alltime = df_trades.orderstart.apply(ji_time_str2sec).values # Apply outside once for efficiency allorderId = df_trades.orderid.values # allorderIdNum = df_trades.orderid.apply(ji_nlp_word2asc,n=wend-wstart+1).values # Particular currency and date mask = (df_trades.ccy == cccy) & (df_trades.trdate == cdate) masked_orders = allorderIdNum[mask] corderIds = [s[wstart:wend] for s in allorderId[mask]] # for visualization masked_time = alltime[mask] X = [[i, j] for i, j in zip(masked_time, masked_orders)] # seconds, Asc(orderId) # ############################################################################# # Generate sample data # centers = [[1, 1], [-1, -1], [1, -1]] # X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4, # random_state=0) # Standardize do_scale = 1 X0 = np.asarray(X) if do_scale: X = StandardScaler().fit_transform(X) else: X = np.asarray(X) # ############################################################################# # Compute DBSCAN db = DBSCAN(eps=max_dist, min_samples=3).fit(X) core_samples_mask = np.zeros_like(db.labels_, dtype=bool) core_samples_mask[db.core_sample_indices_] = True labels = db.labels_ # Number of clusters in labels, ignoring noise if present. n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) # Store res.labels = labels res.label_info = '-1 is for the noise, not counted as cluster' res.n_clusters = n_clusters_ res.db = db if showplot: print( '******************* DBSCAN using feats: time, orderIdNum ********************' ) # ############################################################################# # Plot result # Black removed and is used for noise instead. unique_labels = set(labels) # colors = [plt.cm.Spectral(each) # Sample colormaps: 'PiYG', 'PRGn', 'BrBG', 'PuOr', 'RdGy', 'RdBu', # 'RdYlBu', 'RdYlGn', 'Spectral', 'coolwarm', 'bwr', 'seismic' colors = [ plt.cm.RdYlBu(each) for each in np.linspace(0, 1, len(unique_labels)) ] plt.figure(figsize=(20, 10)) cont = 0 mycolors = [ '#ff2ff0', '#ff880f', '#ff0000', '#00ff00', '#0000ff', '#ff00ff', '#00ffff', '#ff0088', '#ff8800', '#0088ff' ] for k, col in zip(unique_labels, colors): if k == -1: # Black used for noise. 
col = [0, 0, 0, 1] class_member_mask = (labels == k) # k>=0) standard clusters xy = X[class_member_mask & core_samples_mask] xy0 = X0[class_member_mask & core_samples_mask] # plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col), # markeredgecolor='k', markersize=14) if showtext: plt.plot(xy0[:, 0] / 3600, xy[:, 1], '.', markerfacecolor=mycolors[cont % 10], markeredgecolor='y', markersize=1) # add annotation # print('k:',k,xy0[:, 0]/3600, xy[:, 1]) for cx, cy in zip(xy0[:, 0] / 3600, xy[:, 1]): plt.annotate(k, (cx, cy), horizontalalignment='center', verticalalignment='center', fontsize=20, color=mycolors[cont % 10]) else: plt.plot(xy0[:, 0] / 3600, xy[:, 1], 'o', markerfacecolor=mycolors[cont % 10], markeredgecolor='k', markersize=14) # -1) noise clusters xy = X[class_member_mask & ~core_samples_mask] xy0 = X0[class_member_mask & ~core_samples_mask] plt.plot(xy0[:, 0] / 3600, xy[:, 1], 'o', markerfacecolor=tuple(col), markeredgecolor='k', markersize=6) # plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=mycolors[cont%8], # markeredgecolor='k', markersize=6) cont += 1 # plt.title('Estimated number of clusters: %d' % n_clusters_) plt.xlabel('time(hours)', size=20) plt.ylabel('orderId Group (ascII)', size=20) plt.title('%s: %s (%d trades, %d clusters) - DBSCAN(time,orderId)' % (cccy, cdate, len(masked_time), n_clusters_), size=20) plt.show() print('******************') for i, j, k in zip(corderId, labels, masked_time): print('%s: %d (%1.1f hours)' % (i, j, k / 3600)) print('****************** (END) ********************** \n') return res
readCSV = csv.reader(csvdata, delimiter=',')
next(readCSV)  # Skipping header content
for row in readCSV:
    for i in entityIndex:
        if row[i] not in Attributes[str(i)]:
            Attributes[str(i)][row[i]] = len(Attributes[str(i)].keys())
    datarow = []
    try:
        for i in entityIndex:
            datarow.append(Attributes[str(i)][row[i]])
        dataAttributes.append(numpy.asarray(datarow))
    except ValueError:
        pass

feats = ["Age", "Number of Cigarettes per Day"]
print("Features: ", feats)
print("Done Loading Data\n")

# separating test and training data
dataAttributes = numpy.asarray(dataAttributes)

print("DBSCAN Clustering Started")
DBSCluster = DBSCAN(eps=1, min_samples=150).fit(dataAttributes)
print("DBSCAN Clustering Finished\n")

# adding 10, as the first key it will take is 0
pyplot.scatter(dataAttributes.T[0] + 10, dataAttributes.T[1],
               c=DBSCluster.labels_)
pyplot.show()
data_scale = 50
gm_centers = np.random.rand(class_num, feat_dim) * data_scale  # uniform, (0, 1)
gm_stds = np.random.rand(class_num) * 0.5 + 1
gm_colors = np.random.rand(class_num * 3, 3) * 0.8 + 0.2

gm_X, gm_y = make_blobs(n_samples=data_amout, n_features=feat_dim,
                        centers=gm_centers, cluster_std=gm_stds,
                        random_state=9)
gm_X = gm_X[:, :select_feat_dim]
print("[info] data amount: %04d  data dim: %04d" % (gm_X.shape[0], gm_X.shape[1]))

plt.scatter(gm_X[:, 0], gm_X[:, 1], marker='o', s=10)
plt.scatter(gm_centers[:, 0], gm_centers[:, 1], marker='x', s=25, c="r")
plt.title("Original Blob Dist (first 2 dims).")
plt.show()

gm_cluster = DBSCAN(eps=9.5, min_samples=10)
# gm_cluster.fit(gm_X)  # training
y_pred = gm_cluster.fit_predict(gm_X)  # or gm_cluster.labels_
n_clusters_pred = len(np.unique(y_pred))
# center_pred = gm_cluster.cluster_centers_

plt.scatter(gm_X[:, 0], gm_X[:, 1], c=gm_colors[y_pred.tolist()])
plt.scatter(gm_centers[:, 0], gm_centers[:, 1], marker='x', s=25, c="r")
# plt.scatter(center_pred[:, 0], center_pred[:, 1], marker='x', s=25, c="k")
plt.title("DBSCAN Cluster. use feat_dim = %d, eps = %.1f, #class = %d"
          % (select_feat_dim, gm_cluster.eps, n_clusters_pred))
plt.show()
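DBSCAN exposes no cluster_centers_ (hence the commented-out lines above). If centre markers are wanted anyway, per-cluster means can stand in for them; this sketch is an addition that reuses gm_X and y_pred from above.

# Derive pseudo-centers as per-cluster means of the DBSCAN labels (noise excluded).
pseudo_centers = np.array([gm_X[y_pred == lab].mean(axis=0)
                           for lab in np.unique(y_pred) if lab != -1])
if len(pseudo_centers) > 0:
    plt.scatter(gm_X[:, 0], gm_X[:, 1], c=y_pred, s=10)
    plt.scatter(pseudo_centers[:, 0], pseudo_centers[:, 1], marker='x', s=40, c='k')
    plt.title("Per-cluster means as pseudo-centers")
    plt.show()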
]
to_revert = test.groupby(['Labels'])[['1stPolYear', 'BirthYear',
                                      'GrossMthSalary', 'GeoLivArea',
                                      'HasChild']].mean()
# to_revert = to_revert.loc[:, -'Index']
my_scaler.inverse_transform(X=to_revert)

test['Labels'].value_counts()

#### DBSCAN
from sklearn.cluster import DBSCAN
from sklearn import metrics

db = DBSCAN(eps=0.2, min_samples=5).fit(test)
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

unique_clusters, counts_clusters = np.unique(db.labels_, return_counts=True)
print(np.asarray((unique_clusters, counts_clusters)))

from sklearn.decomposition import PCA
pca = PCA(n_components=None).fit(test)
pca_2d = pca.transform(test)
explained_variance = pca.explained_variance_ratio_

from sklearn.decomposition import PCA
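The metrics import above is otherwise unused; a sketch, assuming test and labels from the DBSCAN fit above, that puts it to work by scoring the clustering with noise points excluded (silhouette is undefined with fewer than two clusters, hence the guard).

# Score the DBSCAN result on non-noise points only.
core_mask = labels != -1
if len(set(labels[core_mask])) > 1:
    sil = metrics.silhouette_score(test[core_mask], labels[core_mask])
    print('Silhouette (noise excluded): %.3f' % sil)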
plt.scatter(X[:, 0], X[:, 1], marker='o', c=label_color, s=25, edgecolor='k')
plt.show()

print('### KMEANS CORRECTNESS')
f.benchmark(labels, y)

NN = NearestNeighbors(n_neighbors=int(np.log(len(X)))).fit(X)
distances, indices = NN.kneighbors(X)

fig = plt.figure()
plt.plot(np.sort(distances[:, distances.shape[1] - 1]), color='red')

###### S1 DBSCAN ######
eps = 27000
min_samples = np.log(len(X))
db = DBSCAN(eps=eps, min_samples=min_samples).fit(X)
labels = db.labels_
label_color = [f.LABEL_COLOR_MAP[l] for l in labels]

unique, counts = np.unique(labels, return_counts=True)
print('#FINAL:' + str(dict(zip(unique, counts))))

fig = plt.figure()
plt.scatter(X[:, 0], X[:, 1], marker='o', c=label_color, s=25, edgecolor='k')
plt.show()

print('### DBSCAN CORRECTNESS')
f.benchmark(labels, y)

###### S1 MYALG #####
n = 25
labels = f.squareBFS(X, n)
print('### MYALG CORRECTNESS')
print("*** DBSCAN clustering ***")
print("---------------------------------")
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN

print("Image shape: ", np.shape(the_image))
the_image_list = the_image

print("Image code got from autoencoder")
image_autoencoded = [
    my_net.getCode(torch.Tensor(point)).detach().numpy()
    for point in the_image_list
]

print("Running fit function for DBSCAN clustering")
clust = DBSCAN(eps=3, min_samples=2).fit(image_autoencoded)

print("Creating list for clustered data")
clustered_data = np.zeros((100, 100))
print("Clustered data shape: ", np.shape(clustered_data))

x = 0
y = 0
for i in range(np.shape(clustered_data)[0] * np.shape(clustered_data)[1]):
    clustered_data[x][y] = clust.labels_[i]
    x = x + 1
    if x == 100:
        x = 0
        y = y + 1
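The x/y bookkeeping above can be replaced by a single reshape, since DBSCAN returns labels in input order; order='F' reproduces the column-major filling of the loop (the original advances x before y). A sketch assuming the same 100x100 grid.

# Equivalent to the x/y loop above, in one call.
clustered_data = np.array(clust.labels_).reshape((100, 100), order='F')
print("Clustered data shape: ", np.shape(clustered_data))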
             title="Points classified by sklearn's KMeans")

# Classify the dots with the hand-built kmeans model
pred_tags = my_kmeans(dots, class_num)
scatter_dots(dots, pred_tags, new_plot=False, subplot=223,
             title="Points classified by the hand-written kmeans")
plt.show()

dots1, tags1 = generate_dots2(200)
scatter_dots(dots1, tags1, new_plot=True, subplot=221, title="Original points")

# Classify the dots with sklearn's DBSCAN
model1 = DBSCAN(eps=2, min_samples=1)
db_pred_tags = model1.fit_predict(dots1)
scatter_dots(dots1, db_pred_tags, new_plot=False, subplot=222,
             title="Points classified by sklearn's DBSCAN")

# Classify the dots with the hand-written dbscan
mydb_pred_tags = my_dbscan(dots1, epsilon=2, minSamples=1)
scatter_dots(dots1, mydb_pred_tags, new_plot=False, subplot=223,
             title="Points classified by the hand-written dbscan")
plt.show()
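An optional cross-check (an addition, assuming the arrays produced above): compare the sklearn DBSCAN labels against the hand-written my_dbscan labels with the adjusted Rand index.

# Label-agreement check between the library and the hand-written implementation.
from sklearn.metrics import adjusted_rand_score

print("ARI sklearn DBSCAN vs my_dbscan: %.3f"
      % adjusted_rand_score(db_pred_tags, mydb_pred_tags))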