def hierarchicalClusteringScipy(self, vectorLayer, attributesList, normalize, clusterThreshold,
                                linkageMethod, criterion, metric, depth, max_clust, outputFieldName):
    """Cluster the features of a vector layer and write the cluster id to a field.

    One observation is built per feature from the attributes named by the
    first element of each entry of ``attributesList``; empty/zero/NULL values
    are stored as 0. The observations are passed to
    ``scipy.cluster.hierarchy.fclusterdata`` and the resulting cluster id is
    written to ``outputFieldName`` (created as an integer field if missing).

    :param vectorLayer: layer providing ``getFeatures()``, ``fields()``,
        ``dataProvider()`` and the editing API used below.
    :param attributesList: sequence whose items carry the attribute name at index 0.
    :param normalize: when true, every column is divided by its largest
        absolute value before clustering.
    :param clusterThreshold: ``t`` used for every criterion except ``'maxclust'``.
    :param linkageMethod: ``method`` argument of ``fclusterdata``.
    :param criterion: ``criterion`` argument of ``fclusterdata``.
    :param metric: ``metric`` argument of ``fclusterdata``.
    :param depth: ``depth`` argument of ``fclusterdata``.
    :param max_clust: ``t`` used when ``criterion == 'maxclust'``.
    :param outputFieldName: name of the field receiving the cluster ids.
    """
    import scipy.cluster.hierarchy as hcluster
    from numpy import array

    # One row per feature; falsy attribute values are replaced by 0.
    fullObjectsList = []
    for feature in vectorLayer.getFeatures():
        row = []
        for attribute in attributesList:
            value = feature[attribute[0]]
            row.append(value if value else 0)
        fullObjectsList.append(row)

    if normalize:
        maxValues = [max(abs(row[col]) for row in fullObjectsList)
                     for col in range(len(attributesList))]
        for row in fullObjectsList:
            for col, value in enumerate(row):
                # A column that is zero everywhere has max 0; dividing by it
                # would raise ZeroDivisionError, so such a column is left at 0.
                scale = maxValues[col] if maxValues[col] else 1.0
                row[col] = (value * 1.0) / (scale * 1.0)

    data = array(fullObjectsList)
    # 'maxclust' interprets t as a cluster count, every other criterion as a threshold.
    t = max_clust if criterion == 'maxclust' else clusterThreshold
    clusters = hcluster.fclusterdata(data, t=t, criterion=criterion, method=linkageMethod,
                                     metric=metric, depth=depth)

    vectorLayerDataProvider = vectorLayer.dataProvider()
    # Create the output field if it does not exist yet.
    if vectorLayer.fields().indexFromName(outputFieldName) == -1:
        vectorLayerDataProvider.addAttributes([QgsField(outputFieldName, QVariant.Int)])
        # NOTE(review): the original had vectorLayer.updateFields() commented out
        # here; confirm the new field is visible to fields() without it.

    vectorLayer.startEditing()
    attrIdx = vectorLayer.fields().indexFromName(outputFieldName)
    # Assumes getFeatures() yields features in the same order as the first pass,
    # so that clusters[i] belongs to the i-th feature.
    for i, feature in enumerate(vectorLayer.getFeatures()):
        vectorLayer.changeAttributeValue(feature.id(), attrIdx, int(clusters[i]))
    vectorLayer.commitChanges()
def cluster_peaks_by_lane(peak_pos, hdist=8.0, return_sorted=True):
    """Group peaks into lanes by single-linkage clustering of their x-position.

    :param peak_pos: sequence of peak positions indexed as ``[y, x]``
        (``[row, col]``); only ``pos[1]`` is used for clustering.
    :param hdist: distance threshold passed to ``fclusterdata`` with
        ``criterion='distance'``; ``None`` selects the default of 8.0.
    :param return_sorted: when true, return an ``OrderedDict`` ordered by the
        mean x-position of each lane; otherwise a ``defaultdict``.
    :return: mapping ``lane_id -> ndarray`` of the peak positions in that lane.
        Empty input gives an empty mapping; a single peak gives one lane with id 1.

    Refs:
        http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.fclusterdata.html
        http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.fcluster.html
        http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html
        https://web.archive.org/web/20100619134310/http://www.plantbio.ohiou.edu/epb/instruct/multivariate/Week7Lectures.PDF

    Linkage methods:
        single linkage - produces "chains"
        complete linkage - produces "spherical" clusters
        intermediate linkage -
    Other clustering methods:
        UPGMA, WPGMA, UPGMC, WPGMC
        K-means - cluster into exactly K number of clusters
    """
    # Resolve the default *before* converting: float(None) raises TypeError.
    hdist = 8.0 if hdist is None else float(hdist)

    if len(peak_pos) >= 2:
        # fclusterdata(X, t) expects N observations with M variables each,
        # hence the one-element inner lists.
        xpos = np.array([[pos[1]] for pos in peak_pos])
        lane_clusters = fclusterdata(xpos, t=hdist, criterion='distance',
                                     metric='euclidean', depth=2, method='single')
    else:
        # fclusterdata cannot build a linkage from fewer than two observations.
        lane_clusters = [1] * len(peak_pos)

    # Group lane-clustered peaks: lane_id -> array of peak positions.
    peaks_by_lane = defaultdict(list)
    for lane_id, pos in zip(lane_clusters, peak_pos):
        peaks_by_lane[lane_id].append(list(pos))
    for lane_id in peaks_by_lane:
        peaks_by_lane[lane_id] = np.array(peaks_by_lane[lane_id])

    if return_sorted:
        # Sort by mean x-position (indexing as [y, x] aka [row, col]).
        peaks_by_lane = OrderedDict(sorted(peaks_by_lane.items(),
                                           key=lambda kv: kv[1][:, 1].mean()))
    return peaks_by_lane
def pcaCode():
    """Print the number of clusters found with and without a 10-component PCA.

    Reads the module-level observation matrix ``X``, clusters it with
    ``hier.fclusterdata`` at threshold 1.15 (default criterion), then repeats
    the clustering on the PCA-transformed data and prints both cluster counts.
    """
    ##Question: PCA descriptors, or PCA final profiles?
    # Principal Component Analysis
    pca = deco.PCA(n_components=10)
    Xp = pca.fit_transform(X)
    #Z = hier.linkage(X)
    Y = hier.fclusterdata(X, 1.15)
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("Num. Clusters (no PCA): %s" % max(Y))
    Yp = hier.fclusterdata(Xp, 1.15)
    print("Num. Clusters (with PCA): %s" % max(Yp))
def cluster_qs(qs, k=None, threshold=1.5):
    """Assign each q vector to a cluster.

    Two strategies are available, selected by the arguments:

    * ``k`` given: the vectors are split into exactly ``k`` groups with
      k-means, seeded by ``kmeans_plus_plus``.
    * ``k`` omitted: the vectors are split hierarchically, cutting the tree at
      cophenetic distance ``threshold``.

    :param qs: list of q vectors to cluster. Each element should be a numpy
        array of length three.
    :param k: number of clusters to use (optional).
    :param threshold: cophenetic distance cut-off for new clusters (optional).
    :returns: tuple ``(clusters, k)`` where ``clusters`` holds the cluster
        index of every q vector and ``k`` is the number of clusters used.
    :raises ValueError: if k-means does not produce exactly ``k`` clusters.
    """
    if k is None:
        labels = hcluster.fclusterdata(qs, threshold, criterion="distance")
        return labels, len(set(labels))

    seeds = kmeans_plus_plus(qs, k)
    labels = kmeans2(qs, seeds, minit='matrix')[1]
    if len(set(labels)) != k:
        raise ValueError("Could not group the satellite reflections "
                         "into {} clusters. Please check that you have "
                         "at least {} satellites.".format(k,k))
    return labels, len(set(labels))
def identify(image, colors):
    """Locate clusters of pixels whose first three channels match a fixed range.

    Every 128th pixel (in row-major visiting order) is inspected. A pixel is
    kept when channel 0 is in ``[0, 10)``, channel 1 is above 150 and channel 2
    is above 50. The kept pixel coordinates are clustered with a distance
    threshold of 30.

    NOTE(review): the original never allocated ``data`` (the allocation was
    commented out) and carried a stray second ``continue``; this version
    assumes the intent was to collect and cluster the sampled pixels — confirm.

    :param image: array indexed as ``image[x, y, channel]`` with at least three channels.
    :param colors: unused.
    :return: ``(data, clusters)`` where ``data`` is an ``(n, 2)`` float array
        of ``[x, y]`` coordinates and ``clusters`` the cluster id of each row,
        or ``(None, None)`` when fewer than two pixels match.
    """
    import numpy  # local import: the module-level name is not guaranteed in this file

    points = []
    visited = 0
    for x in range(0, image.shape[0]):
        for y in range(0, image.shape[1]):
            visited += 1
            # Sub-sample: only every 128th visited pixel is examined.
            if visited & 0b1111111 != 0:
                continue
            hue = image[x, y, 0]
            sat = image[x, y, 1]
            val = image[x, y, 2]
            if hue >= 0 and hue < 10 and sat > 150 and val > 50:
                points.append((x, y))

    # Clustering needs at least two observations.
    if len(points) < 2:
        return (None, None)

    t = 30
    data = numpy.array(points, dtype=float)
    clusters = hcluster.fclusterdata(data, t, criterion="distance")
    return (data, clusters)
def calc_best_result(coords, threshold=0.01):
    """
    Calculates the most likely result based on clustering of the provided
    coordinates.

    The largest cluster is assumed to represent the value most agents agreed
    on; its mean position is returned. Clustering uses
    ``scipy.cluster.hierarchy.fclusterdata`` with its default criterion, and
    the threshold is scaled by the standard deviation of the x coordinates.

    Parameters
    ----------
    coords : list of two-element tuples
        coordinates to guess result from
    threshold : float
        multiplied by the x standard deviation to form the ``t`` argument of
        ``fclusterdata``

    Returns
    -------
    x : float
        x coordinate of the result
    y : float
        y coordinate of the result
    """
    coords = np.array(coords)
    t = coords[:, 0].std()
    idx = hierarchy.fclusterdata(coords, threshold * t)
    # Most frequent cluster id. bincount/argmax picks the smallest id on ties
    # (as scipy.stats.mode does) and does not depend on the return shape of
    # stats.mode, which changed between SciPy releases.
    best = int(np.bincount(idx).argmax())
    ans = coords[idx == best]
    return namedtuple('Ans', 'x, y')(ans[:, 0].mean(), ans[:, 1].mean())
def cluster(points, thresh):
    """Cluster a flat list of x, y, z values by distance.

    ``points`` carries two leading entries that are skipped, followed by
    consecutive ``x, y, z`` triples (any trailing partial triple is ignored).

    :param points: flat sequence ``[_, _, x0, y0, z0, x1, y1, z1, ...]``.
    :param thresh: distance threshold for ``fclusterdata`` (``criterion="distance"``).
    :return: ``[data, clusterlist, clusters]`` where ``data`` is a ``(3, npts)``
        array, ``clusterlist`` the 1-based cluster id of each point and
        ``clusters`` a list with the ``[x, y, z]`` points of each cluster.
    """
    # The x, y, z values must first be separated out.
    ndata = [[], [], []]
    # Floor division: range() below needs an int on Python 3 as well.
    npts = (len(points) - 2) // 3
    for j in range(0, npts):
        ndata[0].append(float(points[2 + 3 * j]))
        ndata[1].append(float(points[3 + 3 * j]))
        ndata[2].append(float(points[4 + 3 * j]))
    data = np.asarray(ndata)
    clusterlist = hcluster.fclusterdata(np.transpose(data), thresh, criterion="distance")
    nclusters = findLargest(clusterlist)
    # One empty bucket per cluster, see
    # http://stackoverflow.com/questions/7745562/appending-to-2d-lists-in-python
    clusters = [[] for i in range(nclusters)]
    # Assign points to the correct cluster (ids are 1-based).
    for i in range(0, npts):
        clusters[clusterlist[i] - 1].append([ndata[0][i], ndata[1][i], ndata[2][i]])
    return [data, clusterlist, clusters]
def searchForColorPoints(im, criteria):
    """Find centroids of image regions whose colour matches one of ``criteria``.

    The image is converted to HSV and sampled on a 10-pixel grid. For each
    grid point an 8-sized block is taken with ``block``; blocks whose first
    channel has a standard deviation above 25 are skipped. The mean colour of
    the block is matched with ``matchColor`` and matching grid points are
    clustered with a distance threshold of 10.

    :param im: BGR image (it is passed to ``cv2.cvtColor`` with ``COLOR_BGR2HSV_FULL``).
    :param criteria: colour criteria; ``matchColor`` returns an index into it, or a negative value.
    :return: list with one entry per criterion, each a list of cluster
        centroids (arrays of two values) assigned to that criterion.
    """
    points = []
    pointColors = []
    hsvIm = cv2.cvtColor(im, cv2.COLOR_BGR2HSV_FULL)
    for i in range(11, im.shape[1] - 11, 10):
        for j in range(11, im.shape[0] - 11, 10):
            b = block(hsvIm, (i, j), 8)
            if b[:, :, 0].std() > 25:
                continue
            color = (b[:, :, 0].mean(), b[:, :, 1].mean(), b[:, :, 2].mean())
            matchedColor = matchColor(color, criteria)
            if matchedColor >= 0:
                points.append((i, j))
                pointColors.append(matchedColor)

    centroids = []
    for i in range(len(criteria)):
        centroids.append([])

    # Nothing matched: fclusterdata would raise on an empty observation set.
    if not points:
        return centroids

    points = np.array(points, np.float16)
    if len(points) == 1:
        # A single observation cannot be linked; it forms cluster 1 by itself.
        cluster = np.array([1])
    else:
        cluster = fclusterdata(points, 10, "distance")

    for i in range(1, cluster.max() + 1):
        b = cluster == i
        c = np.zeros((1, 2), np.int16)
        # argsort of the boolean mask puts the members (True) last, so the
        # trailing sum(b) indices are the members of cluster i.
        for p in points[b.argsort()[len(b) - sum(b):]]:
            c = c + p / sum(b)
        # The cluster is filed under the colour of one of its members.
        centroids[pointColors[b.argsort()[len(b) - sum(b)]]].append(c[0])
    return centroids
def merge_paths(rides):
    """Snap nearby waypoints of several rides onto shared positions.

    Waypoints are grouped by country, pre-partitioned with k-means into
    miniclusters, and each minicluster is clustered hierarchically on the
    coordinates returned by ``latlng_to_xyz`` (distance threshold 0.2). Every
    waypoint's ``lat``/``lng`` is overwritten in place with the centroid that
    ``cluster_centroids`` returns for its cluster.

    :param rides: iterable of rides exposing ``ride.route.waypoints``; each
        waypoint needs ``country``, ``lat`` and ``lng`` attributes.
    """
    waypoints = list(itertools.chain(*[ride.route.waypoints for ride in rides]))
    waypoints = sorted(waypoints, key=lambda x: x.country)
    logger.info("Merging {} rides with {} total waypoints".format(len(rides), len(waypoints)))

    for country, group in itertools.groupby(waypoints, key=lambda x: x.country):
        country_waypoints = list(group)
        country_lat_lng_points = [(x.lat, x.lng) for x in country_waypoints]
        country_xyz_points = [latlng_to_xyz(lat, lng) for lat, lng in country_lat_lng_points]
        logger.debug("Processing {} with {} waypoints".format(country, len(country_xyz_points)))

        wh = whiten(country_xyz_points)
        # Floor division keeps the cluster count an integer on Python 3 too.
        k_guess = max(1, len(country_xyz_points) // BEARABLE_CLUSTER_SIZE)
        k_centroids = kmeans(wh, k_guess)[0]
        k_labels = vq(wh, k_centroids)[0]
        # groupby needs its input sorted by the grouping key (the k-means label).
        k_labeled = sorted(zip(country_xyz_points, country_lat_lng_points, country_waypoints, k_labels),
                           key=lambda x: x[3])
        logger.debug("Got {} miniclusters".format(len(k_centroids)))

        for key, gr in itertools.groupby(k_labeled, key=lambda x: x[3]):
            gr = list(gr)
            k_waypoints = [x[2] for x in gr]
            k_lat_lng_points = [x[1] for x in gr]
            k_xyz_points = [x[0] for x in gr]
            logger.debug("Running {} minicluster with {} waypoints".format(key, len(k_waypoints)))

            if len(k_xyz_points) < 2:
                # fclusterdata cannot link a single observation; it is its own cluster.
                cluster_labels = np.array([1])
            else:
                cluster_labels = fclusterdata(np.array(k_xyz_points), 0.2,
                                              criterion="distance", metric="euclidean")
            # list(): hand over a concrete sequence rather than a one-shot iterator.
            centroids = cluster_centroids(list(zip(k_lat_lng_points, cluster_labels)))
            logger.debug("Got {} hierarhical clusters".format(len(set(cluster_labels))))

            for i in range(0, len(k_waypoints)):
                # Cluster labels are 1-based.
                new_lat, new_lng = centroids[cluster_labels[i] - 1]
                k_waypoints[i].lat = new_lat
                k_waypoints[i].lng = new_lng
def cluster_lane_peaks_to_bands(lane_peaks, vdist=5.0, img=None):
    """Split the peaks of one lane into bands.

    Peaks are clustered with single linkage and a Euclidean distance cut at
    ``vdist``. A lane with fewer than two peaks is returned as one band with
    id 0, because there is nothing to cluster.

    :param lane_peaks: peak positions of a single lane.
    :param vdist: distance threshold (converted to ``float``).
    :param img: unused.
    :return: mapping ``band_id -> ndarray`` of the peaks in that band.
    """
    cutoff = float(vdist)  # ensure float/numeric input

    if len(lane_peaks) < 2:
        bands = {0: lane_peaks}
    else:
        labels = fclusterdata(lane_peaks, t=cutoff, criterion='distance',
                              metric='euclidean', depth=2, method='single')
        bands = defaultdict(list)
        for label, peak in zip(labels, lane_peaks):
            bands[label].append(peak)

    # Turn each band's peaks into an ndarray.
    for label in bands:
        bands[label] = np.array(bands[label])
    return bands
def magic_fragmentation(self):
    """Split ``self.dimer`` into two fragments by hierarchical clustering.

    Atom positions are clustered with ``fclusterdata`` using
    ``self.magic_cutoff`` as distance threshold. ``self.frag1`` keeps the
    atoms labelled 1 and ``self.frag2`` those labelled 2; atoms with any other
    label end up in neither fragment.

    Always check the result before relying on these fragmentations!
    """
    positions = self.dimer.get_positions()
    labels = fclusterdata(positions, self.magic_cutoff, criterion="distance")

    # Both fragments start as full copies; foreign atoms are removed below.
    self.frag1 = deepcopy(self.dimer)
    self.frag2 = deepcopy(self.dimer)

    foreign_to_first = [atom.index for pos, atom in enumerate(self.frag1)
                        if labels[pos] != 1]
    del self.frag1[foreign_to_first]

    foreign_to_second = [atom.index for pos, atom in enumerate(self.frag2)
                         if labels[pos] != 2]
    del self.frag2[foreign_to_second]

    print("Finished automatic fragmentation, please remember to check the result!")
    self.__check_fragments__()
    self.__set_charges__()
    self.__get_frontiers__()
def _agglomerative_cluster_encounters(X_data, seconds_thresh): """ Agglomerative encounter clustering algorithm Input: Length N array of data to cluster Output: Length N array of cluster indexes """ label_arr = hier.fclusterdata(X_data, seconds_thresh, criterion='distance') return label_arr
def compute_encounters(hs, back, seconds_thresh=15):
    '''
    Clusters images into encounters (by time, not space).

    An encounter is a meeting, localized in time and space between a camera and
    a group of animals. Animals are identified within each encounter.

    The EXIF 'DateTime' of every valid image is converted to a unix time and
    clustered with ``fclusterdata`` (``criterion='distance'``,
    threshold ``seconds_thresh``). Encounters are renumbered so that a higher
    index means more images, stored on ``hs.tables`` and shown in the GUI
    image table through ``back``.

    NOTE: the function returns ``locals()``, so every local variable name
    below is part of the returned dictionary; renaming one changes the result.
    '''
    # NOTE(review): seconds_thresh is a parameter, so it is always in vars()
    # and this fallback never runs.
    if not 'seconds_thresh' in vars():
        seconds_thresh = 15
    gx_list = hs.get_valid_gxs()
    datetime_list = hs.gx2_exif(gx_list, tag='DateTime')
    unixtime_list = [io.exiftime_to_unixtime(datetime_str) for datetime_str in datetime_list]
    unixtime_list = np.array(unixtime_list)
    # Two columns (time, 0) so each observation is a 2-D point.
    X = np.vstack([unixtime_list, np.zeros(len(unixtime_list))]).T
    print('[scripts] clustering')
    # Build a mapping from clusterxs to member gxs
    gx2_clusterid = fclusterdata(X, seconds_thresh, criterion='distance')
    clusterx2_gxs = [[] for _ in xrange(gx2_clusterid.max())]
    for gx, clusterx in enumerate(gx2_clusterid):
        clusterx2_gxs[clusterx - 1].append(gx)  # IDS are 1 based
    # NOTE(review): map() must return a list here (Python 2 behaviour) for
    # np.array to build a 1-D array of sizes.
    clusterx2_nGxs = np.array(map(len, clusterx2_gxs))
    print('cluster size stats: %s' % helpers.printable_mystats(clusterx2_nGxs))
    # Change IDs such that higher number = more gxs
    gx2_ex = [None] * len(gx2_clusterid)
    gx2_eid = [None] * len(gx2_clusterid)
    # argsort over the sizes: encounter index -> original cluster index, smallest first.
    ex2_clusterx = clusterx2_nGxs.argsort()
    ex2_gxs = [None] * len(ex2_clusterx)
    for ex in xrange(len(ex2_clusterx)):
        clusterx = ex2_clusterx[ex]
        gxs = clusterx2_gxs[clusterx]
        ex2_gxs[ex] = gxs
        for gx in gxs:
            nGx = len(gxs)
            USE_STRING_ID = True
            if USE_STRING_ID:
                # String ID
                eid = 'ex=%r_nGxs=%d' % (ex, nGx)
            else:
                # Float ID (unreachable while USE_STRING_ID is True)
                eid = ex + (nGx / 10 ** np.ceil(np.log(nGx) / np.log(10)))
            gx2_eid[gx] = eid
            gx2_ex[gx] = ex
    hs.tables.gx2_ex = np.array(gx2_ex)
    hs.tables.gx2_eid = np.array(gx2_eid)
    # Give info to GUI
    extra_cols = {'eid': lambda gx_list: [gx2_eid[gx] for gx in iter(gx_list)]}
    back.append_header('gxs', 'eid')
    back.populate_image_table(extra_cols=extra_cols)
    return locals()
def clust(fp_list):
    """Cluster fingerprints hierarchically with a distance threshold of 6.5.

    Each fingerprint is converted to a numpy array with
    ``DataStructs.ConvertToNumpyArray`` before clustering.

    :param fp_list: iterable of fingerprints accepted by ``ConvertToNumpyArray``.
    :return: array with the cluster id of each fingerprint.
    """
    def _to_array(fp):
        # ConvertToNumpyArray fills the array it is given.
        vec = numpy.zeros((1,))
        DataStructs.ConvertToNumpyArray(fp, vec)
        return vec

    vectors = [_to_array(fp) for fp in fp_list]
    return hcluster.fclusterdata(vectors, 6.5, criterion="distance")
def clusterization(data, clastersNum = 2, thresh=1.5):
    """Cluster observations hierarchically with a distance cut.

    :param data: sequence of observations (rows), converted with ``np.array``.
    :param clastersNum: unused; kept for backward compatibility.
    :param thresh: distance threshold passed to ``fclusterdata`` with
        ``criterion="distance"``. Defaults to 1.5, the value that used to be
        hard-coded.
    :return: numpy array with the cluster id of each observation.
    """
    import scipy.cluster.hierarchy as hcluster

    data = np.array(data)
    clusters = hcluster.fclusterdata(data, thresh, criterion="distance")
    return np.array(clusters)
def clusterAndPlotAverages(distmatrix, labeldates, data, noOfClusters=0, cutoff=0, clustersize=0):
    '''Run UPGMA hierarchical clustering and plot the average day of each cluster.

    Exactly one of ``noOfClusters`` (criterion 'maxclust') or ``cutoff``
    (criterion 'distance') must be non-zero. ``averageday`` is called for
    every cluster that has more than ``clustersize`` members.

    NOTE(review): ``distmatrix`` is passed to ``fclusterdata`` as an
    observation matrix with a Euclidean metric, i.e. its rows are treated as
    feature vectors — confirm this is intended.

    :raises ValueError: if neither or both of ``noOfClusters`` and ``cutoff`` are given.
    '''
    # Validate before doing any work; with both options set the original
    # never assigned the clustering and failed with a NameError.
    if (noOfClusters == 0) == (cutoff == 0):
        raise ValueError('Call clusterAndPlotAverages with specifying either cutoff or noOfClusters')

    # method="average" == UPGMA
    if cutoff == 0:
        clusters = hierarchy.fclusterdata(distmatrix, noOfClusters, criterion='maxclust',
                                          metric='euclidean', method='average')
    else:
        clusters = hierarchy.fclusterdata(distmatrix, cutoff, criterion='distance',
                                          metric='euclidean', method='average')

    # Cluster ids are 1-based: bucket the observation indices per cluster.
    groupedDays = [[] for _ in range(max(clusters))]
    for i, label in enumerate(clusters):
        groupedDays[label - 1].append(i)

    for group in groupedDays:
        if len(group) > clustersize:
            averageday(data, group, labeldates)
def add_band_product_id_annotation(df, vdist=5.0):
    """Add a ``product_id`` column that groups rows with similar ``ypos``.

    The ``ypos`` values are clustered with single linkage and a distance cut
    at ``vdist``; the cluster id of each row is stored in ``df['product_id']``.
    The frame is modified in place and nothing is returned.

    :param df: DataFrame with a numeric ``ypos`` column.
    :param vdist: distance threshold for ``fclusterdata``.
    """
    # fclusterdata (rather than fcluster) because fcluster takes a
    # pre-calculated linkage matrix Z.
    # Convert to a plain array first: indexing a Series with [:, np.newaxis]
    # is not supported by current pandas.
    ypos = np.asarray(df['ypos'], dtype=float)[:, np.newaxis]
    if len(ypos) < 2:
        # Nothing to link: zero or one row forms (at most) a single product.
        product_clusters_ids = np.ones(len(ypos), dtype=int)
    else:
        product_clusters_ids = fclusterdata(
            ypos, t=vdist, criterion='distance', metric='euclidean', depth=2, method='single'
        )
    df['product_id'] = product_clusters_ids
def Counting_Clusters(col,row,tot):
    """Cluster hit pixels and return a list of cluster-size entries.

    Pixels are built from ``col``/``row`` and clustered with single linkage
    and a distance cut at sqrt(2). ``sizes`` receives entries taken from the
    per-cluster hit counts ``y``.

    NOTE(review): the running index ``j`` advances independently of which
    hits belong to the current cluster, so the bookkeeping presumably assumes
    that hits of one cluster are adjacent in the input — confirm.
    NOTE(review): the nesting of the ``else`` after ``break`` could not be
    determined with certainty from the source; verify against the original file.

    :param col: column index of each hit.
    :param row: row index of each hit.
    :param tot: per-hit value; only zipped/indexed here, not accumulated.
    :return: list ``sizes``; stays empty when there are fewer than two hits.
    """
    sizes = []
    pixels = [[col[i],row[i]] for i,x in enumerate(col)]
    if(len(pixels)>1):
        results=fclusterdata(pixels,sqrt(2.),criterion="distance",method="single")
        # y[k] = number of hits carrying cluster id k
        y = numpy.bincount(results)
        ii = numpy.nonzero(y)[0]  # unused
        j = 0
        previous = 0
        for result, hit, TOT in zip(results,pixels,tot) :
            i = 0
            # multi-hit cluster
            if y[result]>1:
                # only act when the cluster id differs from the previous hit's
                if previous != result :
                    while i <= y[result]-1:
                        if j < len(results) :
                            j+=1
                            i+=1
                        if j == len(results) :
                            break
                        else :
                            sizes.append(y[result])
                previous = result
            # single-hit cluster
            if y[result]==1:
                if j < len(results) :
                    sizes.append(y[result])
                    j+=1
    else :
        # NOTE(review): raises IndexError for empty input; the value is never used.
        oneHitClusters = [[pixels[0][0],pixels[0][1],tot[0]]]
    return sizes
def display_hmax():
    """Draw the points of a coordinate file as spheres coloured by cluster.

    Reads space-separated integer coordinates from the file named by
    ``sys.argv[1]`` (a missing third value is filled with 0), clusters them
    into at most ``int(sys.argv[3])`` clusters with complete linkage and
    renders one sphere per point at a tenth of its coordinates.
    """
    num_clusters = int(sys.argv[3])

    # Retrieve coordinates; the context manager closes the file again.
    with open(sys.argv[1], 'r') as f:
        coords = f.read().splitlines()
    for i, coord in enumerate(coords):
        coords[i] = coord.split(' ')
        if len(coords[i]) < 3:
            coords[i].append(str(0))

    # Convert coordinates.
    for coord in coords:
        coord[0] = int(coord[0])
        coord[1] = int(coord[1])
        coord[2] = int(coord[2])
    features = np.array(coords)

    glClear(GL_COLOR_BUFFER_BIT|GL_DEPTH_BUFFER_BIT)
    X = np.array(features, np.int32)
    clusters = fclusterdata(X, int(num_clusters), criterion='maxclust',
                            metric='euclidean', method='complete')
    print(clusters)

    # RGBA colour per cluster id (1-based). Ids beyond the palette wrap
    # around, so the colour is always defined.
    palette = [
        [1.0, 0.0, 0.0, .5],
        [0.0, 1.0, 0.0, .5],
        [0.0, 0.0, 1.0, .5],
        [1.0, 1.0, 0.0, .5],
        [0.0, 1.0, 1.0, .5],
        [1.0, 0.0, 1.0, .5],
    ]
    for i, feature in enumerate(features):
        color = palette[(int(clusters[i]) - 1) % len(palette)]
        glPushMatrix()
        glMaterialfv(GL_FRONT,GL_DIFFUSE,color)
        glTranslatef(float(feature[0])/10, float(feature[1])/10, float(feature[2])/10)
        glutSolidSphere(.03,20,20)
        glPopMatrix()
    glutSwapBuffers()
    return
def generate_linkage_clusters(bags_of_words_file,t=0.,method='single',metric='braycurtis'):
    """Performs clustering on the bags of words listed in the input file.

    This function reads a list of bags of words and performs hierarchical
    clustering with ``fclusterdata``, without using the metrics defined in
    this module. The bags of words are listed one per line in the input file.

    :param bags_of_words_file: seekable file object, one bag of words per
        line; it is read three times.
    :param t: threshold passed to ``fclusterdata`` (default criterion).
    :param method: linkage method.
    :param metric: distance metric.
    :return: dict mapping cluster id to the list of stripped lines in that cluster.
    """
    # First read the vocabulary and count the lines.
    voc = set()
    n_of_bags = 0
    bags_of_words_file.seek(0)
    for line in bags_of_words_file:
        voc.update(line.split())
        n_of_bags += 1

    # This is the space inside which the multiset vectors live.
    space = sorted(voc)

    # Store the bags as row vectors.
    data = np.zeros((n_of_bags,len(space)))
    bags_of_words_file.seek(0)
    for i, line in enumerate(bags_of_words_file):
        data[i] = Multiset(line.split()).to_vector(space)

    clusters = fclusterdata(data,t,metric=metric,method=method)

    # Collect the lines of each cluster.
    clusters_dict = dict()
    bags_of_words_file.seek(0)
    for i, line in enumerate(bags_of_words_file):
        clusters_dict.setdefault(clusters[i], []).append(line.strip())
    return clusters_dict
def run_cosine_clustering(self, method="greedy", th_clustering=0.55):
    """Cluster the columns of ``self.docdf`` by the similarity of their profiles.

    Every column is scaled to unit Euclidean length (NaNs count as 0).

    * ``"hierarchical"``: single-linkage clustering of the normalised columns
      with a Euclidean distance cut at ``th_clustering``.
    * ``"greedy"``: repeatedly take the unassigned column with the largest
      total intensity, open a new cluster for it and add every unassigned
      column whose cosine similarity to it exceeds ``th_clustering``.

    NOTE(review): a column that is zero everywhere has norm 0 and yields NaN
    after normalisation.

    :param method: ``"greedy"`` or ``"hierarchical"`` (case-insensitive).
    :param th_clustering: distance cut (hierarchical) or similarity threshold (greedy).
    :return: ``(peak_names, clustering)``: column names and the 1-based
        cluster id of each column.
    :raises ValueError: if ``self`` has no ``topic_word`` attribute or
        ``method`` is unknown.
    """
    if not hasattr(self, "topic_word"):
        raise ValueError("Thresholding not done yet.")

    # Swap the NaNs for zeros. Turn into a numpy array and grab the column names.
    data = self.docdf.fillna(0)
    data_array = np.array(data)
    peak_names = list(data.columns.values)

    # Normalise so that each column has magnitude 1.
    l = np.sqrt((data_array ** 2).sum(axis=0))
    norm_data = np.divide(data_array, l)

    mode = method.lower()
    if mode == "hierarchical":
        clustering = hierarchy.fclusterdata(
            norm_data.transpose(), th_clustering, criterion="distance",
            metric="euclidean", method="single"
        )
    elif mode == "greedy":
        cosine_sim = np.dot(norm_data.transpose(), norm_data)
        # sum() returns a fresh array, so zeroing entries does not touch data_array.
        total_intensity = data_array.sum(axis=0)
        n_parents = data_array.shape[1]
        # Builtin int: the np.int alias no longer exists in current NumPy.
        clustering = np.zeros((n_parents,), int)
        current_cluster = 1
        while True:
            # Seed: the column with the most intensity left.
            current = np.argmax(total_intensity)
            total_intensity[current] = 0.0
            clustering[current] = current_cluster
            # Unassigned columns similar enough to the seed join its cluster.
            friends = np.where((cosine_sim[current, :] > th_clustering) * (total_intensity > 0.0))[0]
            clustering[friends] = current_cluster
            # Assigned columns are marked by a total intensity of zero.
            total_intensity[friends] = 0.0
            if not np.any(total_intensity > 0.0):
                break
            current_cluster += 1
    else:
        raise ValueError("Unknown clustering method")
    return peak_names, clustering
def clustering(prop, threshold):
    """Relabel the rows of ``prop`` by hierarchical clustering of their x, y, z.

    The cluster id of each row is stored in a ``new_label`` column, which then
    becomes the index (renamed to ``label``). ``prop`` is modified in place;
    the returned frame is a copy sorted by that index.

    :param prop: DataFrame with ``x``, ``y`` and ``z`` columns.
    :param threshold: distance threshold for ``fclusterdata`` (``criterion='distance'``).
    :return: ``prop`` sorted by the new ``label`` index.
    """
    import scipy.cluster.hierarchy as hier

    log("info")("clustering start...")
    positions = prop[['x', 'y', 'z']].copy()
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(positions.values.shape)
    log("info")("akka")
    cluster_idx = hier.fclusterdata(positions.values, threshold, criterion='distance')
    log("info")("ooover")
    prop['new_label'] = cluster_idx
    prop.set_index('new_label', drop=True, append=False, inplace=True)
    prop.index.name = 'label'
    prop = prop.sort_index()
    return prop
def detect(self, image):
    """Find local maxima of ``image`` and suppress peaks that lie close together.

    A pixel is a candidate when it equals the maximum of its 8-connected
    neighbourhood and is not below ``self.min_th``. Candidates are grouped
    with ``fclusterdata`` (distance criterion, threshold ``self.min_dist``)
    and only the highest-valued peak of every group is kept.

    :param image: 2-D array.
    :return: ``(peaks, scores)`` as float32 arrays; ``peaks`` holds the
        ``[row, col]`` of each kept peak and ``scores`` its image value. When
        nothing is found, the empty candidate array and an empty array are returned.
    """
    # 8-connected neighbourhood.
    footprint = generate_binary_structure(2, 2)

    # True wherever a pixel equals the maximum of its neighbourhood. This
    # mask also covers flat background, which is removed next.
    is_local_max = maximum_filter(image, footprint=footprint) == image

    # The background is eroded before it is subtracted; otherwise a line
    # appears along its border (an artifact of the maximum filter).
    below_threshold = (image < self.min_th)
    shrunk_background = binary_erosion(below_threshold, structure=footprint, border_value=1)

    # XOR removes the background from the local-maximum mask.
    candidate_mask = is_local_max ^ shrunk_background
    candidate_mask[image < self.min_th] = False

    peaks = np.array(np.nonzero(candidate_mask)).T
    if len(peaks) == 0:
        return peaks, np.array([])

    # Non-maximum suppression: group nearby peaks, keep the strongest of each group.
    labels = [0] if len(peaks) == 1 else fclusterdata(peaks, self.min_dist, criterion="distance")

    grouped = {}
    for peak, label in zip(peaks, labels):
        grouped.setdefault(label, []).append(peak)

    kept_peaks = []
    kept_scores = []
    for members in grouped.values():
        member_values = [image[y, x] for y, x in members]
        winner = int(np.argmax(member_values))
        kept_peaks.append(members[winner])
        kept_scores.append(member_values[winner])

    return np.float32(np.array(kept_peaks)), np.float32(np.array(kept_scores))
def cluster(res):
    """Group the keys of ``res`` by hierarchical clustering of their values.

    Uses the module-level ``thresh`` and ``metricname`` as threshold and
    distance metric (default ``fclusterdata`` criterion).

    :param res: mapping ``key -> observation vector``.
    :return: list of key lists, one per cluster.
    """
    # list(): a dict view is not a valid observation matrix on Python 3.
    data = list(res.values())
    clusters = hcluster.fclusterdata(data, thresh, metric=metricname)
    clustered = {}
    # keys() and values() iterate in the same order, so labels line up with keys.
    for (m, c) in zip(res.keys(), clusters):
        clustered.setdefault(c, []).append(m)
    return list(clustered.values())
def SciPyClustering(self,col,row,tot,energyGC,energyPbPC):
    """Group hit pixels into ``Cluster`` objects.

    Pixels are clustered with ``fclusterdata`` using a distance cut at
    sqrt(2). Every hit is added to the ``Cluster`` of its label through
    ``addPixel``.

    :param col: column index of each hit.
    :param row: row index of each hit.
    :param tot: per-hit value forwarded to ``addPixel``.
    :param energyGC: per-hit value forwarded to ``addPixel``.
    :param energyPbPC: per-hit value forwarded to ``addPixel``.
    :return: list of ``Cluster`` objects; empty when there are no hits.
    """
    # Defined up front so that an event without hits returns [] instead of
    # failing on an unbound local.
    clusters = []
    pixels = [[col[i],row[i]] for i,x in enumerate(col)]
    if len(pixels) > 1:
        result = fclusterdata(pixels,sqrt(2.),criterion="distance")
        clusters = [Cluster() for i in range(max(result))]
        # Cluster labels are 1-based.
        for j, label in enumerate(result):
            clusters[label-1].addPixel(col[j],row[j],tot[j],energyGC[j],energyPbPC[j])
    elif len(pixels) == 1:
        # A single hit cannot be linked; it forms its own cluster.
        c = Cluster()
        c.addPixel(col[0],row[0],tot[0],energyGC[0],energyPbPC[0])
        clusters = [c]
    return clusters
def run(path_to_frames_directory):
    """Estimate shot boundaries from forward and backward correlation passes.

    The frames returned by ``get_shots_forward`` and ``get_shots_backward``
    are turned into ``[frame_no, correlation]`` points, clustered with a
    distance threshold of 25 and shown as a scatter plot. For every cluster
    the frame number with the lowest correlation value is reported.

    NOTE(review): the original iterated the forward result twice and never
    used the backward result; this version assumes the second pass was meant
    to read the backward frames — confirm.

    :param path_to_frames_directory: directory handed to ``Segment.Segment``.
    :return: sorted list of frame numbers, one per cluster.
    """
    segment1 = Segment.Segment(
        path_to_frames_directory=path_to_frames_directory, sampling_rate=25)
    forward_dict = segment1.get_shots_forward()
    segment2 = Segment.Segment(
        path_to_frames_directory=path_to_frames_directory, sampling_rate=25)
    backward_dict = segment2.get_shots_backward()

    data = []
    for frames in (forward_dict, backward_dict):
        for frame in frames:
            data.append([frame.get_frame_no(), frame.get_correlation_val()])
    data_array = np.array(data)

    thresh = 25
    clusters = hcluster.fclusterdata(data_array, thresh, criterion="distance")
    plt.scatter(*np.transpose(data_array), c=clusters)
    plt.show()

    # cluster id (as string) -> indices into data_array
    cluster_dict = {}
    for i in range(clusters.max()):
        cluster_dict[str(i + 1)] = []
    for i, val in enumerate(clusters):
        print(str(val))
        cluster_dict[str(val)].append(i)

    shot_boundaries = []
    for key in cluster_dict.keys():
        # Frame with the smallest correlation value in this cluster; min()
        # keeps the first one on ties. The original loop never updated its
        # running minimum and so did not find the true minimum.
        lowest = min(cluster_dict[key], key=lambda idx: data_array[idx][1])
        shot_boundaries.append(data_array[lowest][0])
    shot_boundaries.sort()
    return shot_boundaries
def get_clustering_indexes(list_data, max_distance):
    """Return a 0-based cluster index for every item of ``list_data``.

    Items are clustered with ``fclusterdata`` (distance criterion,
    threshold ``max_distance``). The cluster labels are renumbered in order
    of first appearance, so the first item always gets index 0.

    :param list_data: sequence of observations.
    :param max_distance: distance threshold.
    :return: list of ints, one per item.
    """
    labels = fclusterdata(list_data, t=max_distance, criterion='distance')

    # label -> position at which the label was first seen
    first_seen = dict()
    for label in labels:
        if label not in first_seen:
            first_seen[label] = len(first_seen)

    return [first_seen[label] for label in labels]
def MakeClusters(col, row, tot, tot_hd, hist0, hist1, hist2):
    """Cluster hit pixels and sum their ``tot``/``tot_hd`` values per cluster.

    Pixels are clustered with single linkage and a distance cut at sqrt(2).
    Clusters are reported in order of first appearance in the hit list, as
    ``[col, row, tot_sum, tot_hd_sum, size]`` with the position of the
    cluster's first hit. ``hist0`` is filled with the size of every reported
    cluster.

    The sums run over the hits that actually carry the cluster's label. The
    original summed the next ``size`` hits in list order, which is only the
    same thing when the members of a cluster are adjacent in the input.

    NOTE(review): for an event with exactly one hit, that hit is returned in
    ``oneHitClusters`` only and ``hist0`` is not filled (unchanged behaviour).

    :param col: column index of each hit.
    :param row: row index of each hit.
    :param tot: per-hit value summed per cluster.
    :param tot_hd: per-hit value summed per cluster.
    :param hist0: object with a ``Fill(value)`` method receiving cluster sizes.
    :param hist1: unused.
    :param hist2: unused.
    :return: ``(oneHitClusters, allHitClusters)``.
    """
    oneHitClusters = []
    allHitClusters = []
    pixels = [[col[i], row[i]] for i, x in enumerate(col)]

    if len(pixels) > 1:
        results = fclusterdata(pixels, np.sqrt(2.), criterion="distance", method="single")
        y = np.bincount(results)  # y[k] = number of hits in cluster k

        # Hit indices per cluster, remembering the order of first appearance.
        members = {}
        order = []
        for idx, label in enumerate(results):
            if label not in members:
                members[label] = []
                order.append(label)
            members[label].append(idx)

        for label in order:
            hit_ids = members[label]
            first = hit_ids[0]
            hit = pixels[first]
            size = y[label]
            if size > 1:
                # Multi-hit cluster.
                tot_c = sum(tot[k] for k in hit_ids)
                tot_hd_c = sum(tot_hd[k] for k in hit_ids)
                # Clusters whose summed tot is zero are not reported.
                if tot_c != 0:
                    allHitClusters.append([hit[0], hit[1], tot_c, tot_hd_c, size])
                    hist0.Fill(size)
            else:
                # Single-hit cluster.
                hist0.Fill(size)
                oneHitClusters.append([hit[0], hit[1], tot[first], tot_hd[first], size])
                allHitClusters.append([hit[0], hit[1], tot[first], tot_hd[first], size])

    if len(pixels) == 1:
        oneHitClusters = [[pixels[0][0], pixels[0][1], tot[0], tot_hd[0], 1]]
    return oneHitClusters, allHitClusters
def clustering(self, center_pt_list):
    """Group centre points that lie within ``self.clustering_th`` of each other.

    :param center_pt_list: sequence of points.
    :return: ``(number_of_groups, groups)`` where ``groups[k]`` holds the
        input points of cluster ``k + 1``. ``(0, [])`` is returned when there
        are fewer than two points or the points cannot be clustered.
    """
    # fclusterdata needs at least two observations. The original reached the
    # same (0, []) result through a bare ``except``.
    if len(center_pt_list) < 2:
        return 0, []

    pt_arr = np.asarray(center_pt_list)
    try:
        result = list(fclusterdata(pt_arr, self.clustering_th, 'distance'))
    except (ValueError, TypeError):
        # Malformed input (e.g. ragged or non-numeric points): best effort, no groups.
        return 0, []

    number_of_groups = max(result)
    groups = [[] for _ in range(number_of_groups)]
    # Cluster labels are 1-based.
    for label, point in zip(result, center_pt_list):
        groups[label - 1].append(point)
    return number_of_groups, groups
def reduce_internal_clustered_transcripts(internal_grpd_transcripts, gene_id, max_cluster_gap):
    """Reduce grouped transcripts to canonical transcripts with their sources.

    Yields ``(canonical_transcript, transcripts, sources)`` tuples. Two
    transcripts fall into the same cluster when both their first-exon start
    and last-exon end are within ``max_cluster_gap`` of each other (Chebyshev
    distance on the pair of ends).

    :param internal_grpd_transcripts: sequence of ``(transcript, source)`` pairs.
    :param gene_id: gene id assigned to the canonical transcripts.
    :param max_cluster_gap: distance threshold for the clustering.
    """
    # With a single transcript there is nothing to cluster.
    if len(internal_grpd_transcripts) == 1:
        sole_transcript = internal_grpd_transcripts[0][0]
        sole_source = internal_grpd_transcripts[0][1]
        canonical = copy.copy(sole_transcript)
        canonical.gene_id = gene_id
        canonical.id = canonical.gene_id + "_1"
        yield (canonical, [sole_transcript, ], [sole_source, ])
        return

    # One observation per transcript: (start of first exon, end of last exon).
    transcript_ends = numpy.array(
        [(t.exons[0][0], t.exons[-1][1]) for t, s in internal_grpd_transcripts])
    cluster_indices = fclusterdata(transcript_ends, t=max_cluster_gap,
                                   criterion='distance', metric='chebyshev')

    # Convert the labels returned by fclusterdata into per-cluster lists of
    # transcripts and sources.
    transcripts_by_cluster = defaultdict(list)
    sources_by_cluster = defaultdict(list)
    for label, pair in zip(cluster_indices, internal_grpd_transcripts):
        transcripts_by_cluster[label].append(pair[0])
        sources_by_cluster[label].append(pair[1])

    # Decide upon the canonical transcript of each cluster.
    for label in list(transcripts_by_cluster.keys()):
        members = transcripts_by_cluster[label]
        member_sources = sources_by_cluster[label]
        yield (build_merged_transcript(gene_id, members), members, member_sources)
    return
def checkin_clustering(checkin, t, criterion, metric, method):
    '''
    Cluster check-in locations with scipy's fclusterdata.

    Args:
        checkin: pandas frame having at least the two columns ['lat', 'lon']
        t, criterion, metric, method: forwarded to fclusterdata (depth is
            fixed to 1), see
            https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.fclusterdata.html
    Returns:
        checkin_clusters: OrderedDict {clusterno: [(lat0, lon0), (lat1, lon1), ...]}
        cluster_num_center: OrderedDict {clusterno: (number of points in the
            cluster, array([center_lat, center_lon]))}
        Both are ordered by descending cluster size.
    '''
    geopoint = np.array(checkin[['lat', 'lon']])
    labels = fclusterdata(geopoint, t, criterion=criterion, metric=metric, depth=1, method=method)

    # cluster number -> member points, in order of first appearance
    members_by_cluster = dict()
    for point, label in zip(geopoint, labels):
        members_by_cluster.setdefault(label, []).append(tuple(point))

    # cluster number -> (size, mean position)
    summary_by_cluster = dict()
    for label, members in members_by_cluster.items():
        summary_by_cluster[label] = (len(members), np.mean(np.array(members), axis=0))

    members_descending = OrderedDict(
        sorted(members_by_cluster.items(), key=lambda kv: len(kv[1]), reverse=True))
    summary_descending = OrderedDict(
        sorted(summary_by_cluster.items(), key=lambda kv: kv[1][0], reverse=True))
    return members_descending, summary_descending
def main(dataNameList, featurelist_dir):
    """Cluster feature vectors at several thresholds and copy the files per cluster.

    For every name in ``dataNameList`` the file ``<name>_featlist.txt`` in
    ``featurelist_dir`` is read with ``multiprocess_feature_data_reader``.
    The features are clustered (cosine metric, average linkage, distance
    criterion) for thresholds from 0.5 to below 0.9 in steps of 0.02. Each
    file path (with ``.npy`` removed) is copied to
    ``result_<name>/result_<thresh>/<cluster id>``; members of single-element
    clusters go to the ``0`` directory instead.

    :param dataNameList: iterable of data set names.
    :param featurelist_dir: directory containing the feature list files.
    """
    picSrcDir = ''
    for dataName in dataNameList:
        featlist = osp.join(featurelist_dir, '{0}_featlist.txt'.format(dataName))
        # %-formatting inside print(...) prints the same text on Python 2 and 3.
        print('featlist is %s' % featlist)
        data, filePathList = multiprocess_feature_data_reader(featlist)
        saveName = dataName

        # clustering
        for thresh in np.arange(0.5, 0.9, 0.02):
            print(thresh)
            clusters = hcluster.fclusterdata(data, thresh, metric="cosine",
                                             method='average', criterion="distance")
            print('The number of clustered label is: %s' % np.amax(clusters))
            print(clusters)

            # cluster label -> number of members
            label_result = {}
            for label in set(clusters):
                label_result[label] = np.sum(clusters == label)

            # Directory collecting the members of single-element clusters.
            lost_clustered_label = 'result_{0}/result_{1}/0'.format(saveName, thresh)
            for i in range(len(filePathList)):
                picName = filePathList[i].replace('.npy', '')
                srcPicPath = picSrcDir + picName
                if label_result[clusters[i]] != 1:
                    save_path = 'result_{0}/result_{1}/{2}'.format(saveName, thresh, clusters[i])
                else:
                    save_path = lost_clustered_label
                # Create the target only when missing; other makedirs errors
                # are no longer swallowed by a bare except.
                if not osp.exists(save_path):
                    os.makedirs(save_path)
                shutil.copy(srcPicPath, save_path)
def getClusters(segs, db, subject='a1', thresh=20.0, plot=True):
    """Cluster scalar segment positions and optionally draw them.

    :param segs: scalar segment positions; when plotting, each one is also
        used as an index into the first column of the subject's raw data
    :param db: object exposing ``data[subject]`` as a 2-D array (only read
        when ``plot`` is true)
    :param subject: key into ``db.data``
    :param thresh: distance threshold handed to fclusterdata
    :param plot: draw the raw data with one coloured vertical line per segment
    :return: flat cluster labels from fclusterdata, one per segment
    """
    observations = [[seg] for seg in segs]
    labels = fclusterdata(observations, thresh, criterion="distance")
    if not plot:
        return labels

    raw = db.data[subject]
    plt.figure(figsize=(11, 9))
    plt.plot(raw[:, 0], raw[:, 2:])
    # one colour per distinct cluster; labels are used 1-based below
    palette = cm.rainbow(np.linspace(0, 1, len(set(labels))))
    for seg, label in zip(segs, labels):
        plt.axvline(raw[:, 0][seg], color=palette[label - 1], linewidth=2)
    plt.title("Clustered Neural Network Segments")
    plt.xlabel("Time (Seconds)")
    plt.ylabel("Rotation (Degrees per second)")
    plt.show()
    return labels
def cluster_weights_agglo(weight, threshold, average=True, cosine=True, euclidean=False, chebyshev=False, manhattan=False):
    """Agglomeratively cluster the columns of ``weight``.

    The matrix is transposed, its rows are L2-normalised, and a
    complete-linkage hierarchy is cut at distance ``1.0 - threshold``.

    :param weight: 2-D matrix whose columns are the items to cluster
    :param threshold: similarity threshold; converted to a distance as
        ``1.0 - threshold``
    :param average: unused, kept for interface compatibility
    :param cosine, euclidean, chebyshev, manhattan: metric selectors; the
        first true flag in this order wins
    :return: ``(n_clusters, representatives)`` where ``representatives``
        holds, per cluster, the last index of that cluster in argsort order
        of the labels
    :raises ValueError: if no metric flag is set
    """
    if cosine:
        metric = 'cosine'
    elif euclidean:
        metric = 'euclidean'
    elif chebyshev:
        metric = 'chebyshev'
    elif manhattan:
        metric = 'cityblock'
    else:
        # Previously this fell through to an undefined name further down.
        raise ValueError("no distance metric selected")

    weight = normalize(weight.T, norm='l2', axis=1)
    distance_threshold = 1.0 - threshold  # Conversion to distance measure

    # The former extra fclusterdata(..., method='centroid') call was dropped:
    # its result was never used and SciPy rejects centroid linkage for
    # non-Euclidean metrics.
    z = hac.linkage(weight, metric=metric, method='complete')
    labels = hac.fcluster(z, distance_threshold, criterion="distance")
    n_clusters_ = len(np.unique(labels))

    # Split the argsort order into one index group per label.
    a = np.array(labels)
    sort_idx = np.argsort(a)
    a_sorted = a[sort_idx]
    unq_first = np.concatenate(([True], a_sorted[1:] != a_sorted[:-1]))
    unq_count = np.diff(np.nonzero(unq_first)[0])
    unq_idx = np.split(sort_idx, np.cumsum(unq_count))
    first_ele = [group[-1] for group in unq_idx]
    return n_clusters_, first_ele
def cluster_lines(self, positions, threshold=20):
    """
    Group lines by hierarchical clustering of their positions.

    Prints the number of clusters found and sets ``self.good_quality`` to
    False when fewer than two clusters come out.

    :param positions: list of positions [y,x] for points on each line
    :param threshold: max distance allowed for distance between points in a cluster
    :return: list of labels of which cluster each line belongs to
    """
    labels = hcluster.fclusterdata(np.array(positions), threshold,
                                   criterion="distance")
    distinct = set(labels)
    print("Number of clusters: {}".format(len(distinct)))
    if not len(distinct) >= 2:
        self.good_quality = False
    return labels
def find_clusters_1d_hierarchical(vals, t, **kwargs):
    """
    Cluster the 1-D array <vals> hierarchically with threshold <t>.

    <kwargs> are forwarded unchanged to *fclusterdata* from
    *scipy.cluster.hierarchy*. Returns one index array per cluster, in
    ascending order of cluster label.
    """
    from scipy.cluster.hierarchy import fclusterdata

    column = vals.reshape((len(vals), 1))
    labels = fclusterdata(column, t, **kwargs)
    clusters = []
    for label in np.unique(labels):
        clusters.append(np.where(labels == label)[0])
    # every input value must land in exactly one cluster
    assert len(vals) == sum(len(members) for members in clusters)
    return clusters
def buildClusters(self, th=0.05, min_cluster_size=0):
    """Cluster ``self.points`` and build one ``Instance`` per large-enough cluster.

    Points and their voxels (matched by index) are appended to
    ``self.points_map`` / ``self.voxels_map`` under their cluster label.
    Every label whose point list has at least ``min_cluster_size`` entries
    gets an ``Instance(points, voxels, label)`` in ``self.objects_map``.

    :param th: distance threshold handed to fclusterdata
    :param min_cluster_size: minimum number of points for an Instance
    :return: the flat cluster labels, also stored in ``self.labels``
    """
    self.labels = fclusterdata(self.points, th, criterion='distance')
    for i, label in enumerate(self.labels):
        if label not in self.points_map:
            self.points_map[label] = []
            self.voxels_map[label] = []
        self.points_map[label].append(self.points[i])
        self.voxels_map[label].append(self.voxels[i])
    # items() works on both Python 2 and 3; iteritems() is Python-2-only.
    for l, points in self.points_map.items():
        if len(points) >= min_cluster_size:
            self.objects_map[l] = Instance(points, self.voxels_map[l], l)
    return self.labels
def merge_paths(rides):
    """Snap nearby waypoints of all rides onto shared cluster centroids.

    Waypoints are processed per country. Within a country they are first
    split into k-means "miniclusters" (on whitened xyz coordinates) and each
    minicluster is then clustered hierarchically with a distance threshold
    of 0.2 on the raw xyz coordinates. Every waypoint's ``lat`` / ``lng`` is
    overwritten in place with the centroid returned by ``cluster_centroids``
    for its hierarchical cluster. Returns None.
    """
    all_waypoints = sorted(
        itertools.chain.from_iterable(
            ride.route.waypoints for ride in rides),
        key=lambda x: x.country)
    logger.info("Merging {} rides with {} total waypoints".format(
        len(rides), len(all_waypoints)))

    for country, group in itertools.groupby(all_waypoints,
                                            key=lambda x: x.country):
        # Separate name: the sequence being grouped is no longer shadowed.
        country_waypoints = list(group)
        country_lat_lng_points = [(x.lat, x.lng) for x in country_waypoints]
        country_xyz_points = [
            latlng_to_xyz(lat, lng) for lat, lng in country_lat_lng_points
        ]
        logger.debug("Processing {} with {} waypoints".format(
            country, len(country_xyz_points)))

        wh = whiten(country_xyz_points)
        # Floor division keeps k an integer on Python 3 as well.
        k_guess = max(1, len(country_xyz_points) // BEARABLE_CLUSTER_SIZE)
        k_centroids = kmeans(wh, k_guess)[0]
        k_labels = vq(wh, k_centroids)[0]
        k_labeled = sorted(zip(country_xyz_points, country_lat_lng_points,
                               country_waypoints, k_labels),
                           key=lambda x: x[3])
        logger.debug("Got {} miniclusters".format(len(k_centroids)))

        for key, gr in itertools.groupby(k_labeled, key=lambda x: x[3]):
            gr = list(gr)
            k_waypoints = [x[2] for x in gr]
            k_lat_lng_points = [x[1] for x in gr]
            k_xyz_points = [x[0] for x in gr]
            logger.debug("Running {} minicluster with {} waypoints".format(
                key, len(k_waypoints)))
            if len(k_waypoints) < 2:
                # fclusterdata cannot cluster a single observation.
                # NOTE(review): assumes a lone waypoint's centroid is the
                # waypoint itself, so it is left untouched -- confirm
                # against cluster_centroids.
                continue
            cluster_labels = fclusterdata(np.array(k_xyz_points), 0.2,
                                          criterion="distance",
                                          metric="euclidean")
            # Materialise the pairs: zip() is a one-shot iterator on Python 3.
            centroids = cluster_centroids(
                list(zip(k_lat_lng_points, cluster_labels)))
            logger.debug("Got {} hierarhical clusters".format(
                len(set(cluster_labels))))
            for waypoint, label in zip(k_waypoints, cluster_labels):
                # labels are 1-based; centroids is indexed by label - 1
                new_lat, new_lng = centroids[label - 1]
                waypoint.lat = new_lat
                waypoint.lng = new_lng
def identify_center_band(energies, k):
    """
    Pick out the energy band that contains the middle level of the spectrum
    (taken to be around the Fermi energy) via hierarchical clustering.

    Inputs:
        energies = sorted energy levels
        k = assumed number of distinct bands

    Outputs:
        central_band = energies in the identified central band
    """
    column = np.copy(energies)
    column.resize((column.shape[0], 1))  # one observation per row
    band_of = fclusterdata(column, k, criterion='maxclust')
    # the band holding the middle level is the central one
    middle_band = band_of[len(energies) // 2]
    in_middle_band = band_of == middle_band
    return energies[np.where(in_middle_band)]
def hcluster_filter(self, v, t):
    """Keep only the samples that fall into the most populated cluster.

    The pairs ``(v[i], t[i])`` are clustered with ``fclusterdata`` (cut-off
    0.1, SciPy's default criterion); the samples of the largest cluster are
    returned. Ties go to the cluster encountered first.

    :param v: 1-D numpy array of values
    :param t: 1-D numpy array, same length as ``v``
    :return: ``(v, t)`` restricted to the largest cluster; inputs with fewer
        than two samples are returned unchanged
    """
    # column_stack works on Python 2 and 3; np.array(zip(...)) does not
    # build a 2-D array on Python 3.
    X = np.column_stack((v, t))
    if X.shape[0] < 2:
        # nothing to cluster
        return v, t
    T = hcluster.fclusterdata(X, 0.1)

    lens = {}
    for clno in T:
        lens[clno] = lens.get(clno, 0) + 1
    clmax = max(lens, key=lens.get)

    best = [idx for idx, clno in enumerate(T) if clno == clmax]
    return v[best], t[best]
def precluster(people, ngo):
    """Cluster people by position and assign each cluster to its nearest NGO.

    Side effects: every entry of ``people`` gets ``1`` and then its cluster
    label appended, every entry of ``ngo`` gets ``0`` appended, ``people`` is
    sorted in place by cluster label, and for every person two rows are
    written through ``db`` (``insertngocurr`` and ``insertrescuengo``), each
    followed by a commit.

    :param people: list of mutable records; fields 1 and 2 are used as the
        two coordinates (named lat/lon below) and field 0 is passed to the
        db calls -- presumably an id, confirm against callers
    :param ngo: list of mutable records; fields 5 and 4 are passed to
        ``haversine`` and field 0 to the db calls
    :return: None
    """
    print("in precluster!!!!")
    for p in people:
        p.append(1)
    for n in ngo:
        n.append(0)
    data = people
    if not data:
        # nobody to assign; previously this crashed on data[0]
        return

    # Only records that now have exactly 5 fields contribute coordinates;
    # all others stay at (0, 0).
    datanp = np.zeros((len(data), 2))
    for i, d in enumerate(data):
        if len(d) == 5:
            datanp[i, 0] = d[1]
            datanp[i, 1] = d[2]
    if len(data) > 1:
        cluster = fclusterdata(datanp, 1)
    else:
        cluster = [0]
    for i, d in enumerate(data):
        d.append(cluster[i])

    # Sort by cluster label (field 5) and split into runs of equal label.
    data.sort(key=lambda x: x[5])
    groups = []
    for person in data:
        if groups and groups[-1][0][5] == person[5]:
            groups[-1].append(person)
        else:
            groups.append([person])

    for temp in groups:
        cenlat = sum(t[1] / len(temp) for t in temp)
        cenlon = sum(t[2] / len(temp) for t in temp)

        # First NGO with the strictly smallest distance to the centre.
        nearest = ngo[0]
        nearest_dist = haversine(nearest[5], nearest[4], cenlon, cenlat)
        for candidate in ngo:
            dist = haversine(candidate[5], candidate[4], cenlon, cenlat)
            if dist < nearest_dist:
                nearest = candidate
                nearest_dist = dist

        for t in temp:
            print("inserting!!!!!!")
            db.insertngocurr(nearest[0], str(t[1]), str(t[2]), t[0])
            db.commit()
            db.insertrescuengo(t[0], nearest[0])
            db.commit()
def reduce_internal_clustered_transcripts(
        internal_grpd_transcripts, gene_id, max_cluster_gap):
    """Take a set of clustered transcripts and reduce them into a set of
    canonical transcripts, and associated sources.

    :param internal_grpd_transcripts: sequence of (transcript, source) pairs
    :param gene_id: gene id given to the produced transcripts
    :param max_cluster_gap: maximum Chebyshev distance between the
        (first exon start, last exon end) pairs of clustered transcripts
    :return: generator of
        (canonical transcript, [transcripts], [sources]) tuples
    """
    # nothing to reduce
    if len(internal_grpd_transcripts) == 0:
        return

    # if there is only a single transcript, clustering doesn't make sense
    if len(internal_grpd_transcripts) == 1:
        only_transcript, only_source = internal_grpd_transcripts[0]
        new_t = copy.copy(only_transcript)
        new_t.gene_id = gene_id
        new_t.id = new_t.gene_id + "_1"
        yield (new_t, [only_transcript, ], [only_source, ])
        return

    # 2 transcripts are in the same cluster if both their 5' and 3' ends
    # are within max_cluster_gap of each other. Use the scipy cluster
    # machinery to do this for us
    transcript_ends = numpy.array(
        [(t.exons[0][0], t.exons[-1][1])
         for t, s in internal_grpd_transcripts])
    cluster_indices = fclusterdata(
        transcript_ends, t=max_cluster_gap,
        criterion='distance', metric='chebyshev')

    # convert the indices returned by fclusterdata into lists of transcripts
    # and sources per cluster (builtin zip: izip is Python-2-only)
    grouped = defaultdict(lambda: ([], []))
    for cluster_index, (trans, src) in zip(
            cluster_indices, internal_grpd_transcripts):
        transcripts, sources = grouped[cluster_index]
        transcripts.append(trans)
        sources.append(src)

    # finally, decide upon the 'canonical' transcript for each cluster, and
    # add it and its sources
    for transcripts, sources in grouped.values():
        merged_transcript = build_merged_transcript(gene_id, transcripts)
        yield (merged_transcript, transcripts, sources)
def group_measures(self, measure_filter):
    """
    Cluster the measures returned by ``measure_filter.all()`` on the scalar
    key ``self._key_fn(measure)`` and return them grouped by cluster.

    Clustering options come from ``self._clustering_args``. The result maps
    each cluster label to the list of its measures, in input order.
    """
    measures = measure_filter.all()
    keys = np.array([self._key_fn(measure) for measure in measures])
    # one observation per row, single feature column
    observations = np.reshape(np.array(keys), [len(keys), 1])
    labels = hierarchy.fclusterdata(observations, **self._clustering_args)

    grouped = {}
    for position, label in enumerate(labels):
        members = grouped.get(label, [])
        members.append(measures[position])
        grouped[label] = members
    return grouped
def cluster_docs(texts, min_clusters_per_leaf=3, hierarchy=True):
    """
    Function for clustering texts.

    Parameters:
        texts (scipy.sparse.csr_matrix): vector implementation for all text in sparse matrix
        min_clusters_per_leaf (int): minimum number of samples per cluster
            (only used by the DBSCAN branch)
        hierarchy (bool): use hierarchy clustering algorithm

    Returns:
        clusters: flat cluster labels from fclusterdata when ``hierarchy``
            is true; otherwise the fitted DBSCAN estimator
    """
    thresh = 1  # empirical diameter for samples of one cluster
    if hierarchy:
        # toarray() yields a plain ndarray; todense() yields the discouraged
        # np.matrix type.
        return hcluster.fclusterdata(texts.toarray(), thresh,
                                     criterion="distance")
    # NOTE(review): this branch returns the fitted estimator, not labels;
    # callers may rely on that (e.g. via ``.labels_``), so it is unchanged.
    return DBSCAN(eps=thresh, min_samples=min_clusters_per_leaf).fit(texts)
def SciPyClustering(self, col, row, tot):
    """Group pixels into clusters of touching pixels.

    Pixels closer than ``sqrt(2)`` (i.e. including diagonal neighbours on an
    integer grid) end up in the same cluster. The number of clusters is
    printed.

    :param col: pixel column coordinates
    :param row: pixel row coordinates, same length as ``col``
    :param tot: per-pixel value passed to ``Cluster.addPixel``
    :return: list of ``Cluster`` objects; empty when there are no pixels
    """
    pixels = [[c, row[i]] for i, c in enumerate(col)]
    clusters = []  # stays empty for empty input (was undefined before)
    if len(pixels) > 1:
        result = fclusterdata(pixels, sqrt(2.), criterion="distance")
        # labels are 1-based and contiguous, so max(result) is the count
        clusters = [Cluster() for _ in range(max(result))]
        for j, label in enumerate(result):
            clusters[label - 1].addPixel(col[j], row[j], tot[j])
    elif len(pixels) == 1:
        # fclusterdata needs at least two observations
        c = Cluster()
        c.addPixel(col[0], row[0], tot[0])
        clusters = [c]
    print(len(clusters))
    return clusters
def get_clusters(self, X, n_clusters=None):
    """
    Partition a set of points into clusters and report, per cluster, the
    indices of its points.

    :param X: An (N, D) tensor representing N points in D dimensions
    :param n_clusters: The number of clusters to use for KMeans, or None to
        use hierarchical clustering and automatically determine the number
        of clusters.
    :returns: cluster_indices, a list of lists of indices
    """
    if n_clusters is not None:
        km = KMeans(n_clusters=n_clusters)
        km.fit(X)
        cluster_labels = km.labels_
    else:
        cluster_labels = hcluster.fclusterdata(X, 1)
        print("Hierarchical clustering returned {} clusters".format(len(set(cluster_labels))))

    cluster_indices = []
    for label in set(cluster_labels):
        members = np.nonzero(cluster_labels == label)[0]
        cluster_indices.append(members)
    return cluster_indices
def _make_clusters(self, matrix, num_clusters_per_roi, metric): """clusters a given matrix by into specified number of clusters according to given metric""" from scipy.cluster.hierarchy import fclusterdata # maxclust needed to ensure t is interpreted as # clusters in heirarchical clustering group_ids = fclusterdata(matrix, metric=metric, t=num_clusters_per_roi, criterion='maxclust') group_set = np.unique(group_ids) clusters = [ self._summary_func(matrix[group_ids == group, :], axis=0, keepdims=True) for group in group_set ] return np.vstack(clusters).squeeze()
def Clustering(self):
    """Assign every point in ``glob.allPoint`` to a distance-based cluster.

    Points are clustered on their ``x`` / ``y`` attributes with a distance
    threshold of 450. Each node gets its label in ``node.cluster`` and is
    added to the matching entry of ``glob.allCluster``; a missing entry is
    created as a ``Cluster`` built from three random integers in
    [100, 255]. Finally ``renewAttribute()`` is called on every cluster in
    ``glob.allCluster``.
    """
    # list(): dict.values() is a non-indexable view on Python 3
    nodes = list(glob.allPoint.values())
    points = [[node.x, node.y] for node in nodes]
    threshold = 450
    clusters = hcluster.fclusterdata(points, threshold, criterion="distance")
    for node, clusterID in zip(nodes, clusters):
        node.cluster = clusterID
        if clusterID not in glob.allCluster:
            glob.allCluster[clusterID] = Cluster(random.randint(100, 255),
                                                 random.randint(100, 255),
                                                 random.randint(100, 255))
        glob.allCluster[clusterID].points.add(node)
    for cluster in glob.allCluster.values():
        cluster.renewAttribute()
def findLines(img, threshold=10, plot=True):
    """Detect Hough lines in an image and merge near-duplicates.

    The image is blurred, binarised at 70 and passed to ``cv2.HoughLines``.
    Lines with theta <= 0.5 are discarded, the remaining (rho, theta) pairs
    are clustered with distance threshold ``threshold`` and each cluster is
    replaced by its mean.

    :param img: input image for ``cv2.GaussianBlur``
    :param threshold: clustering distance in (rho, theta) space
    :param plot: scatter-plot the clustered lines
    :return: array with one mean (rho, theta) row per cluster, ordered by
        cluster label; fewer than two surviving lines are returned as they
        are (possibly an empty ``(0, 2)`` array)
    """
    imgBlur = cv2.GaussianBlur(img, (5, 5), 0)
    imgT = cv2.threshold(imgBlur, 70, 255, cv2.THRESH_BINARY)[1]
    lines = cv2.HoughLines(imgT, 1, np.pi / 180, 200)
    if lines is None:
        # NOTE(review): relies on HoughLines returning None when nothing is
        # found -- confirm for the OpenCV version in use.
        return np.empty((0, 2), dtype=np.float32)
    # reshape (rather than squeeze) keeps a 2-D array even for one line
    lines = np.reshape(lines, (-1, 2))
    lines = lines[lines[:, 1] > 0.5]
    if len(lines) < 2:
        # fclusterdata needs at least two observations
        return lines

    clusters = hcluster.fclusterdata(lines, threshold, criterion="distance")
    n_clusters = len(set(clusters))
    if plot:
        plt.scatter(*np.transpose(lines), c=clusters)
        title = "threshold: %f, number of clusters: %d" % (threshold,
                                                           n_clusters)
        plt.title(title)
        plt.show()

    # flat cluster labels are 1-based and contiguous
    return np.array([
        lines[clusters == label].mean(axis=0)
        for label in range(1, n_clusters + 1)
    ])
def detect_grid(coordinates):
    """
    Verify that the sample points lie on a regular, axis-parallel
    rectangular grid and return its axes.

    Raises AssertionError ("xs_step" or "xs") when an axis is not evenly
    spaced or does not match the linspace between its extremes.

    :param coordinates: 2-D array, one point per row
    :return: (xs, ys, zs) axes of grid
    """
    atol = 1e-5

    def _close(left, right):
        # absolute tolerance only
        return np.allclose(left, right, rtol=0, atol=atol)

    rounded = coordinates.round(decimals=6)
    axes = []
    for column in rounded.T:
        # pre-clustering (not really unique due to float + rounding ridges)
        candidates = np.unique(column)
        # hierarchical clustering merges values that are still too close
        labels = hcluster.fclusterdata(candidates[:, np.newaxis], 1e-5,
                                       criterion="distance")
        first_of_cluster = np.unique(labels, return_index=True)[1]
        ticks = sorted(candidates[first_of_cluster])
        steps = np.diff(ticks)
        assert _close(steps, np.median(steps)), "xs_step"
        axes.append(ticks)

    # assumption: coords were laid out on a regular, rectangular grid
    # parallel to the axes; compare with the ideal linspace per axis
    lower = np.min(coordinates, axis=0)
    upper = np.max(coordinates, axis=0)
    axes_grid = []
    for lo, hi, ticks in zip(lower.T, upper.T, axes):
        ideal = np.linspace(lo, hi, len(ticks), dtype=coordinates.dtype)
        assert _close(ticks, ideal), "xs"
        axes_grid.append(ideal)
    return axes_grid
def cluster(self, participants: List['BaseParticipant'], server: BaseParticipant) -> Dict[str, List['BaseParticipant']]:
    """Group participants by hierarchical clustering of their predictions.

    ``self.predict`` is evaluated for every participant and the results are
    clustered with the instance's threshold, criterion, linkage method and
    distance metric. Each participant's ``cluster_id`` is set to its label
    as a string. ``server`` is not used.

    :return: mapping from cluster id ("1" .. "<max label>") to the
        participants in that cluster
    """
    predictions = np.array([self.predict(member) for member in participants])
    labels = hac.fclusterdata(predictions, self.max_value_criterion,
                              self.criterion, method=self.linkage_mech,
                              metric=self.dis_metric)

    # Pre-create one bucket per possible label.
    highest = max(labels)
    buckets = {str(label): [] for label in range(1, highest + 1)}

    for member, label in zip(participants, labels):
        member.cluster_id = str(label)
        buckets[member.cluster_id].append(member)
    return buckets
def process_list_update(self):
    """Cluster the entries of ``self.ls`` once there are more than three.

    The first two fields of every entry are clustered with distance
    threshold ``self.threshold``. ``self.cluster_points`` becomes a list
    with one list of entries per cluster (in label order) and
    ``self.check_for_new_batteries`` is called with each cluster's entries.
    """
    if len(self.ls) > 3:
        # Do clustering
        data = [item[0:2] for item in self.ls]
        clusters = hcluster.fclusterdata(data, self.threshold,
                                         criterion="distance")
        # list(): a Python 3 range does not support item assignment
        self.cluster_points = list(range(max(clusters)))
        for c in range(max(clusters)):
            # flat cluster labels are 1-based
            self.cluster_points[c] = [
                item for item, label in zip(self.ls, clusters)
                if label == c + 1
            ]
            self.check_for_new_batteries(self.cluster_points[c])
        print("Finished Clustering")
def get_baseline_segs(self):
    """
    Get baseline segments, given the baseline value.

    The log read-count ratio ``log(tReadNum + 1) - log(nReadNum + 1)`` of
    every segment in ``self.segPool.segments`` is computed; ratios strictly
    between ``self._yDown`` and ``self._yUp`` are clustered (complete
    linkage) into at most ``stripeNum + noiseStripeNum`` stripes, and the
    baseline segments are selected by ``__get_baseline_from_stripe``. A
    warning is printed if any returned segment lies outside the bounds.

    :return: list of baseline segments
    """
    yDown, yUp = self._yDown, self._yUp
    stripeNum, noiseStripeNum = self.stripeNum, self.noiseStripeNum
    assert stripeNum > 0

    ycV = np.array([
        np.log(seg.tReadNum + 1) - np.log(seg.nReadNum + 1)
        for seg in self.segPool.segments
    ])
    # record which segments are NOT outliers (strictly inside the bounds)
    statusYcV = np.logical_and(ycV > yDown, ycV < yUp)
    ycV = ycV[statusYcV]
    yFcd = ycV.reshape(ycV.shape[0], 1)
    clusters = hierarchy.fclusterdata(
        yFcd, stripeNum + noiseStripeNum,
        criterion="maxclust", method="complete")

    _, blSegL = self.__get_baseline_from_stripe(clusters, ycV,
                                                statusBoolL=statusYcV)

    # sanity check: every baseline segment should lie inside the bounds
    ycVBL = np.array([
        np.log(seg.tReadNum + 1) - np.log(seg.nReadNum + 1)
        for seg in blSegL])
    statusYcVBL = np.logical_or(ycVBL <= yDown, ycVBL >= yUp)
    if sum(statusYcVBL) > 0:
        print("baseline segment is not correct")
    return blSegL
def cluster_hierarchical(self, cluster_count, modifier=1):
    """Run hierarchical clustering on the pre-processed similarity data.

    The result column is named "hierarchical clustering <process time>" and
    stored in ``self.clustering_name``. Border points receive the labels
    from fclusterdata (``criterion='maxclust'`` with ``cluster_count``);
    noise and inner points are then assigned by the dedicated helpers.
    ``self.cluster_count`` and ``self.clustering_result`` are refreshed
    from the resulting column.

    :param cluster_count: maximum number of flat clusters
    :param modifier: forwarded to ``get_degree_and_euclidean_distance_metric``
    :return: the name of the result column
    """
    self.clustering_name = "hierarchical clustering " + str(
        time.process_time())

    values = self._similarity_measure_data_pre_processing()
    labels = fclusterdata(
        values,
        t=cluster_count,
        criterion='maxclust',
        metric=self.get_degree_and_euclidean_distance_metric(modifier))

    border_index = self.data_frame.get_border_points_point_only_df(
    ).index.tolist()

    self.data_frame.add_result_name(self.clustering_name, -1,
                                    ColType.CLUSTER_LABEL)
    for point_index, label in zip(border_index, labels):
        self.data_frame.add_result(self.clustering_name, point_index, label)

    self._assign_noise(self.clustering_name)
    self._assign_inner_points(self.clustering_name)

    self.cluster_count = len(
        set(self.data_frame.df[self.clustering_name].tolist()))
    self.clustering_result = self.data_frame.df[
        self.clustering_name].tolist()
    return self.clustering_name
def calc_clusters(self, tol=0.02, report=False):
    """Drop redundant feature columns of ``self.nArray``.

    Columns are clustered on their pairwise correlation distance (average
    linkage, cut at ``tol``). One column per cluster is kept -- chosen at
    random when the cluster has several members -- and ``self.nArray`` is
    replaced by the kept columns in ascending column order.
    ``self.bitAddresses`` maps each new column index to the address of the
    column it came from.

    :param tol: correlation-distance threshold for merging columns
    :param report: when true, return ``(self.nArray.shape, self.bitAddresses)``
    :return: the tuple above if ``report`` is true, otherwise None
    """
    # We use the transpose of the training matrix in order to cluster
    # features on their pairwise correlation throughout the dataset.
    try:
        clusters = hcluster.fclusterdata(self.nArray.T, tol,
                                         criterion='distance',
                                         metric='correlation',
                                         method='average')
    except ValueError:
        # Fallback: build the linkage explicitly, clamp it to [0, 10000]
        # and cut it.
        Z = hcluster.linkage(self.nArray.T, method='average',
                             metric='correlation')
        np.clip(Z, 0, 10000, out=Z)
        clusters = hcluster.fcluster(Z, tol, criterion='distance')

    clusterDict = {x: [] for x in list(set(clusters))}
    for i, x in enumerate(clusters):
        clusterDict[x].append(i)

    # Every cluster has at least one member by construction.
    goodBitList = []
    for members in clusterDict.values():
        if len(members) > 1:
            goodBitList.append(choice(members))
        else:
            goodBitList.append(members[0])
    goodBitList.sort()

    self.nArray = self.nArray[:, goodBitList]
    if len(self.bitAddresses) == 0:
        self.bitAddresses = {
            k: v for k, v in zip(range(self.nArray.shape[1]), goodBitList)
        }
    else:
        updateGoodBits = [self.bitAddresses[x] for x in goodBitList]
        self.bitAddresses = {
            k: v for k, v in zip(range(self.nArray.shape[1]),
                                 sorted(updateGoodBits))
        }
    if report:
        return self.nArray.shape, self.bitAddresses
def fusion_cluster(bboxes, method='cluster', distance_threshold=0.1, nms_threshold=0.3):
    '''
    Fuse overlapping / near-duplicate bounding boxes.

    bboxes: (n, d) array of bboxes to be fused
    method: 'cluster' (hierarchical clustering, each cluster replaced by the
        mean of its boxes) or 'nms' (delegates to ``nms``)
    distance_threshold: clustering distance for method 'cluster'
    nms_threshold: threshold forwarded to ``nms``

    Returns ``(cluster_set, fused_bboxes)`` for 'cluster', where
    ``cluster_set.sets`` lists the input indices of each cluster; with at
    most one input box an empty ``Disjoint`` and the input are returned.
    Raises NotImplementedError for any other method.
    '''
    if bboxes.shape[0] <= 1:
        return Disjoint(), bboxes

    if method == 'cluster':
        clusters = hcluster.fclusterdata(bboxes, distance_threshold,
                                         criterion="distance", depth=2)
        print('clusters', clusters)
        cluster_num = len(set(clusters))

        # flat cluster labels are 1-based; slot label-1 collects its members
        cluster_set_data = [[] for _ in range(cluster_num)]
        cluster_count = np.zeros((cluster_num, 1), dtype=np.int32)
        fused_bboxes = np.zeros((cluster_num, bboxes.shape[1]))
        for k, cluster_id in enumerate(clusters):
            cluster_set_data[cluster_id - 1].append(k)
            fused_bboxes[cluster_id - 1] += bboxes[k]
            cluster_count[cluster_id - 1] += 1
        fused_bboxes /= cluster_count  # sum -> mean per cluster

        cluster_set = Disjoint()
        cluster_set.sets = cluster_set_data
        return cluster_set, fused_bboxes  # (n, 4)

    if method == 'nms':
        return nms(bboxes, nms_threshold)

    # A bare ``raise`` has no active exception to re-raise here; raise an
    # explicit error (NotImplementedError subclasses RuntimeError).
    message = 'fusion method {!r} is not implemented yet'.format(method)
    logging.error(message)
    raise NotImplementedError(message)
def _detect_cluster(self): all_para = [] for i in self.pages: all_para.extend(self.pages[i]["para"]) features = [] for item in all_para: try: r, b, g = item["ncolour"][1:-1].split(",") if item[ "ncolour"] != 'None' and item["ncolour"] != '0' else [ 0, 0, 0 ] except: r, b, g = [0, 0, 0] bold = 1 if "bold" in item["font"].lower() else 0 features.append( [float(item["size"]) / 6, float(r), float(b), float(g), bold]) thres = 0.1 clusters = hcluster.fclusterdata(np.array(features), thres, criterion="distance") cluster_text = defaultdict(list) for cluster_id, para in zip(clusters, all_para): cluster_text[int(cluster_id)].append(para["text"]) self.cluster_text = cluster_text count = 0 for i in self.pages: for para_idx, _ in enumerate(self.pages[i]["para"]): self.pages[i]["para"][para_idx]["cluster_id"] = clusters[count] count += 1
def getCentroids(self):
    """Collect a centroid per cluster-size state from ``self.memory``.

    States run from ``utils.initial_cluster_size`` to
    ``utils.max_cluster_size`` inclusive. For a state with exactly one
    measurement the centroid is that measurement (field 0 as 'throughput',
    field 1 as 'latency'). For states with several measurements the flat
    clusters and the centroid linkage are only computed and logged; no
    centroid is stored for them. States without measurements are skipped.

    :return: dict mapping state (as string) to
        ``{'throughput': ..., 'latency': ...}``
    """
    centroids = {}
    for i in range(int(self.utils.initial_cluster_size),
                   int(self.utils.max_cluster_size) + 1):
        state = str(i)
        measurements = self.memory[state]['arrayMeas']
        # identity test: ``!= None`` is evaluated element-wise on arrays
        if measurements is None:
            continue
        self.my_logger.debug("GETCENTROIDS state "+ state +" measurements : "+ str(measurements))
        if len(measurements) > 1:
            T = fclusterdata(measurements, t=15.0, criterion='distance',
                             metric='euclidean', method='single')
            self.my_logger.debug("GETCENTROIDS state "+ state +" clusters: "+ str(T))
            Z = centroid(measurements)
            self.my_logger.debug("GETCENTROIDS state "+ state +" centroid func: "+ str(Z))
        elif len(measurements) == 1:
            centroids[state] = {
                'throughput': measurements[0][0],
                'latency': measurements[0][1],
            }
    self.my_logger.debug("GETCENTROIDS centroids: "+ str(centroids))
    return centroids
def plot(data, step):
    """Cluster ``data`` with the periodic distance metric, save a scatter
    plot coloured by cluster and return the number of clusters.

    The image is written to the module-level ``out`` path with
    ``"&step=<step>"`` inserted before the ``.png`` suffix.
    """
    fig, ax = plt.subplots()

    # clustering
    threshold = 1.2
    labels = hierarchy.fclusterdata(data, threshold, criterion="distance",
                                    metric=periodic_distance)
    n_clusters = len(set(labels))

    # plotting
    ax.scatter(*np.transpose(data), c=labels)
    ax.axis("equal")
    plt.title("threshold: %f, number of clusters: %d, step: %d" % (
        threshold, n_clusters, step))
    fig.tight_layout()
    # NOTE(review): the figure is never closed; if this is called once per
    # step, pyplot keeps every figure alive -- consider plt.close(fig).
    plt.savefig(out.replace(".png", "&step=%d.png" % (step)))
    return n_clusters