def dbscanV2(display):
    pixarr = get_display_matrix(display)
    arr = numpy.array(pixarr)
    pt_locations = numpy.where(arr == 0)
    points = []
    for i in range(len(pt_locations[0])):
        points.append((pt_locations[0][i], pt_locations[1][i]))
    cdbscan.dbscan(points, 5, 10)
def cluster_stay_points(all_spts):
    print('Clustering preprocessing has been started')
    all_lats = []
    all_longs = []
    all_cords = []
    for point in all_spts:
        all_lats.append(point[0])
        all_longs.append(point[1])
    all_cords.append(all_lats)
    all_cords.append(all_longs)
    dbscan(all_cords, 200, 5)
def use_dbscan(x_pca, y):
    data = x2nodes(x_pca)
    dbscan(data, eps=3.1, min_points=6)
    y_pred = []
    for i in data:
        y_pred.append(i.label)
    score = get_score(y, y_pred)
    print(score)
    title = 'dbscan-mnist-score-' + str(score)
    db_show(data, title)
def dbscan_clustering(filename, epsilon, min_pts):
    sample = read_data(filename)
    dbscan_instance = dbscan(sample, epsilon, min_pts)
    dbscan_instance.process()
    clusters = dbscan_instance.get_clusters()
    outliers = dbscan_instance.get_outliers()
    print("Clusters :\n", clusters)
    print("Outliers :\n", outliers)
def data():
    resp = "{}"
    headers = dict()

    last = 0
    sensors = [fetch.SENSOR_1_NAME, fetch.SENSOR_2_NAME]
    for fn in sensors:
        lu = fetch.lastupdate(fn)
        if not lu:
            continue
        if not last:
            last = lu
        else:
            last = max(last, lu)

    lm = time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime(last))
    headers['Last-Modified'] = lm

    ims = bottle.request.environ.get('HTTP_IF_MODIFIED_SINCE')
    if ims:
        ims = bottle.parse_date(ims.split(";")[0].strip())
        if ims is not None and ims >= last:
            headers['Date'] = time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime())
            return bottle.HTTPResponse(status=304, **headers)

    resp = {}
    # ensure we match readouts
    for fn in sensors:
        data = fetch.rrdfetch(fn)
        if "time" not in resp:
            resp["time"] = data["time"]
        resp[fn] = data

    # does time diverge?
    time1 = set(resp[sensors[0]]["time"])
    time2 = set(resp[sensors[1]]["time"])

    for fn in sensors:
        sensordata = resp[fn]
        sensordata.pop("time")
        # Remove outliers
        for name in sensordata.keys():
            data = sensordata[name]
            if name == "volt":
                eps = 0.01
            else:
                eps = 2
            outliers = dbscan(data, eps, 3)
            for i in outliers:
                data[i] = None

    # todo: time might diverge +/- one tick
    #if time1 != time2:
    #    print(time1-time2, time2-time1)
    #    return bottle.HTTPResponse(status=500)

    resp = json.dumps(resp).encode("utf-8")
    headers['Content-Length'] = len(resp)
    return bottle.HTTPResponse(resp, **headers)
def streamProc(folderName):
    mkdirSafe(join(folderName, "snapshot"))
    mkdirSafe(join(folderName, "clusters"))
    mkdirSafe(join(folderName, "pattern"))
    mkdirSafe(join(folderName, "img"))
    snapshotFolder = join(folderName, "snapshot")
    clusterFolder = join(folderName, "clusters")
    patternFolder = join(folderName, "pattern")
    imgFolder = join(folderName, "img")
    length = 150
    interpolate.takeSnapshot(folderName, snapshotFolder)
    dbscan.dbscan(snapshotFolder, clusterFolder)
    intersect.Intersect(clusterFolder, patternFolder, frameLength=length).intersect()
    showLinePattern.RenderPattern(patternFolder, folderName, imgFolder, "./frameImg/",
                                  frameLength=length).renderPattern()
    shutil.move(imgFolder, join("output", folderName.replace('/', '')[5:]))
    shutil.rmtree(snapshotFolder)
    shutil.rmtree(clusterFolder)
    shutil.rmtree(patternFolder)
def dbscanV2(display, eps, threshold_num):
    pixarr = get_display_matrix(display)
    arr = numpy.array(pixarr)
    pt_locations = numpy.where(arr == 0)
    points = []
    # the weird indexing is not an accident. See
    # the documentation for numpy.where
    for i in range(len(pt_locations[0])):
        points.append((pt_locations[0][i], pt_locations[1][i]))
    clusters = cdbscan.dbscan(points, threshold_num, eps)
    return clusters
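# Not part of the original snippets: a minimal sketch showing that the
# (row, col) pairs built from numpy.where above can also be obtained with
# numpy.argwhere, which returns one index pair per matching element.
# Assumes the same "arr == 0" criterion used in dbscanV2.
import numpy

arr = numpy.array([[0, 1], [1, 0]])
points = [tuple(idx) for idx in numpy.argwhere(arr == 0)]
# points == [(0, 0), (1, 1)], the same pairs as zipping numpy.where(arr == 0)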
def main():
    n = 5000
    (X, y) = generator(n)
    X = X.astype('float32')
    y = y.astype('int32')
    y = y.reshape(1, n)
    show_result(kmeans(X, y, 2))
    show_result(spectralclustering(X, y, 2))
    show_result(agglomerativeclustering(X, y, 2))
    show_result(dbscan(X, y, 0.002, 2))
    show_result(birch(X, y, 2))
    show_result(minibatchkmeans(X, y, 2))
def run(filename):
    data = get_data_set(filename)
    assignments = dbscan.dbscan(data, 0.92, 5)
    fileout = open('result/dbscan-' + filename[7:-4] + '.txt', 'w')
    dataset = data.T
    for i in range(len(assignments)):
        fileout.write(str(dataset[i][0]) + ',' + str(dataset[i][1]) + ',' +
                      str(assignments[i]) + '\n')
    print(' ')
def get_clusters(good_urls, urls, n_urls=500, my_dbs=False, verbose=False):
    """ """
    random.shuffle(good_urls)
    random.shuffle(urls)
    fit_urls = good_urls[:n_urls] + urls[:n_urls]

    mysekitei = sekitei(fit_urls, alpha=0.01)
    mysekitei.fit()
    X = mysekitei.most_freq_features()

    if my_dbs:
        py = dbscan().fit_predict(X)
    else:
        py = DBSCAN().fit_predict(X)

    hist = []
    clusters = list(set(py))
    with open('data/clusters_features.txt', 'w') as file:
        print >>file, mysekitei.n_features
        print >>file, '\n\n\n', '\n'.join(mysekitei.tags_order[:mysekitei.n_features]), '\n\n'
        for c in clusters:
            hist.append(len([p for p in py if p == c]))
            # print >>f, c, ':', hist[-1]

    vizualize_clusters(X, ([1] * n_urls + [0] * n_urls), py, hist)

    regexpes = mysekitei.get_clusters_regexpes(X, py)

    with open('data/clusters_freq_features.txt', 'w') as file:
        print 'n_features=', mysekitei.n_features, '\n\n'
        print >>file, 'n_features=', mysekitei.n_features, '\n\n'
        for c, f, i in regexpes:
            print '---', c, '=', str(len(f))
            print '\n'.join([fi + '\t\t\t ' + str(ii) for fi, ii in zip(f, i)]), '\n'
            print >>file, '---', c, '=', str(len(f))
            print >>file, '\n'.join([fi + '\t\t\t ' + str(ii) for fi, ii in zip(f, i)]), '\n'

    with open('data/united_regexpes.txt', 'w') as file:
        for k, f, i in regexpes:
            rex = '^'
            for r in f[:-1]:
                rex += '(?=%s)' % r.strip('^').rstrip('$')
            rex += '%s' % f[-1].strip('^')
            print >>file, k, '=', rex

    return mysekitei, regexpes
def __division_loc(self):
    # compute the location-cluster label of every object
    category_loc = dbscan(self.__historical_data, 1.5, 3, FingerprintSequence.distance_loc)
    category_loc_num = int(np.max(category_loc + 1))
    if category_loc_num == 1:
        return False
    '''
    print(category_loc)
    print(category_loc_num)
    for i in range(len(self.__historical_data)):
        t = self.__historical_data[i]
        print(t.uploader, ' ', i, ' ', category_loc[i])
        pprint(t.feature_location)
        pprint(t.feature_avg)
        pprint(t.feature_std)
        pprint(t.feature_max)
    '''
    self.__category_loc_data = [0] * category_loc_num
    # count the number of samples in each location cluster
    category_loc_data_num = [0] * category_loc_num
    counts_loc = [0] * category_loc_num
    for i in range(len(category_loc)):
        category_loc_data_num[int(category_loc[i])] += 1
    # initialize the sample list of each location cluster
    for i in range(category_loc_num):
        self.__category_loc_data[i] = [0] * category_loc_data_num[i]
    # fill each location cluster; __category_loc_data[i][j] is the j-th sample
    # of the i-th location cluster
    for i in range(category_loc.shape[0]):
        self.__category_loc_data[int(category_loc[i])][int(counts_loc[int(category_loc[i])])] = \
            self.__historical_data[i]
        counts_loc[int(category_loc[i])] += 1
    '''
    for i in range(len(self.__category_loc_data)):
        # print(len(self.__category_loc_data[i]))
        for j in range(len(self.__category_loc_data[i])):
            t = self.__category_loc_data[i][j]
            print(t.uploader, ' ', i)
            pprint(t.feature_location)
            pprint(t.feature_avg)
            pprint(t.feature_std)
            pprint(t.feature_max)
    '''
    # print(counts_loc)
    return True
def get_vectors(image, eps, threshold):
    pil_image = image
    image_array = numpy.array(pil_image)
    pt_locations = numpy.where(image_array != 0)
    points = []
    for i in range(len(pt_locations[0])):
        points.append((pt_locations[0][i], pt_locations[1][i]))
    clusters = cdbscan.dbscan(points, threshold, eps)
    images = []
    image_vectors = []
    i = 0
    for cluster in clusters:
        images.append(cluster_to_square_image(cluster))
    for image in images:
        resized = image.resize((28, 28), Image.ANTIALIAS)
        vec = numpy.array(resized).ravel()
        vec = replace_negatives(vec)
        vec = scale_to_max_val(vec, max_val=255)
        image_vectors.append(vec)
        i += 1
    return image_vectors
def get_clusters(good_urls, urls, n_urls=500, my_dbs=False, verbose=False):
    """ """
    random.shuffle(good_urls)
    random.shuffle(urls)
    fit_urls = good_urls[:n_urls] + urls[:n_urls]

    mysekitei = sekitei(fit_urls, alpha=0.01)
    mysekitei.fit()
    X = mysekitei.most_freq_features()

    if my_dbs:
        py = dbscan().fit_predict(X)
    else:
        py = DBSCAN().fit_predict(X)

    regexpes = mysekitei.get_clusters_regexpes(X, py)

    print 'n_features=', mysekitei.n_features, '\n\n'
    for c, f, i in regexpes:
        print '---', c, '=', str(len(f))
        print '\n'.join([fi + '\t\t\t ' + str(ii) for fi, ii in zip(f, i)]), '\n'

    return mysekitei, regexpes
def populate_probes(probes):
    logging.debug('Populate Probes - Started.')
    for probe_id in probes:
        probe = probes[probe_id]
        shelves, noise = dbscan.dbscan(probe.products, 1, eps, dist, sort_key)
        probe.set_shelves(shelves)
        # logging.debug(str(map(lambda sh: map(lambda pr: pr.id, sh), shelves)))
        # logging.debug(str(map(lambda sh: map(lambda pr: pr.patch_url, sh), shelves)))
        probe.build_relations()
        # build matrices, not sure if it is necessary
        curr = probe.products
        n = len(curr)
        rights = np.zeros((n, n))
        lefts = np.zeros((n, n))
        for i in range(0, n):
            product = curr[i]
            for j in range(0, n):
                neighbour = curr[j].id
                rights[i][j] = int(neighbour in product.relations[rel_right])
                lefts[i][j] = int(neighbour in product.relations[rel_left])
        probe.set_rights(rights)
        probe.set_lefts(lefts)
    logging.debug('Populate Probes - Ended.')
def fextract(imgin):
    """
    Args:
        imgin - image in
    """
    # Initialize dbscan
    myDbscan = dbscan(_MYPARAMS["SIZE_OF_ROI"] / 2)

    PRINT_LOG_OUT = []
    PRINT_LOG_OUT.append("[Date]")
    PRINT_LOG_OUT.append("Date Analyzed = " + strftime("%Y-%m-%d %H:%M:%S"))
    PRINT_LOG_OUT.append("\n[Analysis Parameters]")

    # print parameters
    PRINT_LOG_OUT += [str(k) + " = " + str(_MYPARAMS[k]) for k in _MYPARAMS.keys()]

    cv2.imwrite(os.path.join("Output", _MYPARAMS["IMAGE"]), imgin)
    height, width, channels = imgin.shape
    PRINT_LOG_OUT.append("Width = " + str(width))
    PRINT_LOG_OUT.append("Height = " + str(height))

    # Check if image imported correctly
    # if (imgin == None):
    #     print "Image does not exist! Aborting...\n"
    #     return;

    hsv_imgin = cv2.cvtColor(imgin, cv2.COLOR_BGR2HSV)

    # Detect image size [rows, columns]
    _IMBND = (hsv_imgin.shape[0], hsv_imgin.shape[1])

    hsv_chans = cv2.split(hsv_imgin)  # split image into HSV channels

    # Get both blurred and not blurred files for image processing
    if _MYPARAMS["HAS_BLUR"]:
        hsv_chans = [cv2.blur(hsvim, (_MYPARAMS["BKS"], _MYPARAMS["BKS"])) for hsvim in hsv_chans]

    # may use other feature detector for testing
    FD_TYPE = "MSER"
    PRINT_LOG_OUT.append("FD Type = " + FD_TYPE)
    print("Running MSER...")
    # delta, maxArea, minArea, maxVariation, minDiversity, maxEvolution, areaThreshold, minMargin, edgeBlurSize
    # Default is 5, 60, 14400, 0.25, 0.2, 200, 1.01, 0.003, 5
    # Decreasing maxVariation increases how sharp edges need to be
    my_fd = cv2.MSER_create(
        5,  # _delta (int)
        int(_MYPARAMS["MIN_AREA"] / 2738 * _IMBND[0]),  # _min_area (int)
        int(_MYPARAMS["MAX_AREA"] / 2738 * _IMBND[0]),  # _max_area (int)
        0.099,  # _max_variation (float)
        0.65,  # _min_diversity (float)
        200,  # _max_evolution (int)
        1.01,  # _area_threshold (double)
        0.003,  # _min_margin (double)
        5,  # _edge_blur_size (int)
    )

    # FD_TYPE = "SimpleBlob"
    # PRINT_LOG_OUT.append("FD Type: " + FD_TYPE)
    # my_fd = cv2.SimpleBlobDetector_create()

    imgClusteredRegions = copy.copy(imgin)

    PRINT_LOG_OUT.append("\n[Channel Keypoints]")
    kpts = []  # (k)ey(p)oin(t) out
    kptsSize = []
    dkpsout = []  # (d)isplay (k)ey(p)oint (out)put
    for i, im in enumerate(hsv_chans):
        local_kpt = my_fd.detect(im, None)  # local keypoints
        if len([x for x in _MYPARAMS["ACTIVE_CHANNEL"] if x == i]) > 0:
            # Outputs image of regions
            vis = im.copy()
            regions = my_fd.detectRegions(im, None)
            hulls = [cv2.convexHull(s.reshape(-1, 1, 2)) for s in regions]
            hullLocations, hullSizes = hulls2Points(hulls)
            cv2.polylines(vis, hulls, 1, (0, 255, 0))
            # cv2.imwrite(os.path.join("Output", 'region visualization' + str(i) + '.jpg'), vis)
            for j, point in enumerate(hullLocations):
                kpts.append(point)
                kptsSize.append(hullSizes[j])
        if local_kpt:
            # don't know how the third param works yet -->
            local_dpksout = cv2.drawKeypoints(im, local_kpt, im)
            dkpsout.append([local_dpksout])  # append to master list
            # cv2.imwrite(os.path.join("Output", 'dkpsout' + str(i) + '.jpg'), local_dpksout)
        # print out num of keypoints and other info
        PRINT_LOG_OUT.append("Channel " + str(i) + " = " + str(len(local_kpt)))

    # Crop out ROIs for active_channel
    clusters, clusterSizes = myDbscan.getClusters(kpts, kptsSize)
    averagedClusters = averageClusters(clusters)
    clusterSizes = largestSize(clusters, clusterSizes)

    # Tree filter
    print("Filtering trees...")
    if _MYPARAMS["USE_TREE_FILTER"]:
        averagedClusters, clusterSizes = filterTrees(imgin, averagedClusters, clusterSizes)

    imageName = _MYPARAMS["IMAGE"].split(".")[0]
    croppedImgNames = []
    print("Cropping...")
    for i, mypoint in enumerate(averagedClusters):
        cropSize = clusterSizes[i][1] / 2 if clusterSizes[i][1] / 2 > clusterSizes[i][0] / 2 else clusterSizes[i][0] / 2
        padding = _MYPARAMS["CROP_PADDING"]
        row_crop = (
            clamp(mypoint[0] - cropSize - padding, 0, _IMBND[0]),
            clamp(mypoint[0] + cropSize + padding, 0, _IMBND[0]),
        )
        col_crop = (
            clamp(mypoint[1] - cropSize - padding, 0, _IMBND[1]),
            clamp(mypoint[1] + cropSize + padding, 0, _IMBND[1]),
        )
        new_crop = imgin[row_crop[0]:row_crop[1], col_crop[0]:col_crop[1]]
        croppedImgNames.append(imageName + "roi" + str(i) + ".jpg")
        cv2.imwrite(os.path.join("Output", croppedImgNames[i]), new_crop)

    # Log clustering info
    PRINT_LOG_OUT.append("\n[Crop Info]")
    PRINT_LOG_OUT.append("Number of Crops = " + str(len(averagedClusters)))

    # Write log for the clusters
    for i, cluster in enumerate(averagedClusters):
        PRINT_LOG_OUT.append("\n[Crop " + str(i + 1) + "]")
        PRINT_LOG_OUT.append("Image Name = " + croppedImgNames[i])
        PRINT_LOG_OUT.append("X = " + str(averagedClusters[i][1]))
        PRINT_LOG_OUT.append("Y = " + str(averagedClusters[i][0]))
        cropSize = clusterSizes[i][1] / 2 if clusterSizes[i][1] / 2 > clusterSizes[i][0] / 2 else clusterSizes[i][0] / 2
        padding = _MYPARAMS["CROP_PADDING"]
        PRINT_LOG_OUT.append("Size = " + str(2 * (cropSize + padding)))

    # Output cluster locations
    # imgClusteredRegions = drawClusters(imgClusteredRegions, clusters, averagedClusters)
    imgClusteredRegions = drawCroppedRegions(imgClusteredRegions, averagedClusters, clusterSizes)
    cv2.imwrite(os.path.join("Output", "croppedRegions.jpg"), imgClusteredRegions)

    # print result info to log file
    with open(os.path.join("Output", imageName + " .ini"), "a") as f:
        for line in PRINT_LOG_OUT:
            f.write(line + "\n")
def __division_rssi(self):
    # total number of location clusters
    category_loc_num = len(self.__category_loc_data)
    category_loc_rssi = [None] * category_loc_num
    # print(category_loc_num)
    # print(category_loc_rssi)
    # compute the RSSI-cluster index of every sample within each location cluster;
    # RSSI-cluster indices start from 0 inside each location cluster
    for i in range(1, len(self.__category_loc_data)):
        # category_loc_rssi[i][j] is the RSSI-cluster index of the j-th sample
        # in the i-th location cluster
        category_loc_rssi[i] = dbscan(self.__category_loc_data[i], 5, 2,
                                      FingerprintSequence.distance_rssi)
        # print(category_loc_rssi[i])
    self.__category_loc_rssi_data = [None] * category_loc_num
    # count the number of samples in each RSSI cluster of every location cluster
    # category_loc_rssi_data_num[i][j] is the number of samples in the j-th RSSI
    # cluster of the i-th location cluster
    category_loc_rssi_data_num = [[] for i in range(category_loc_num)]
    for i in range(1, category_loc_num):
        # initialize all RSSI-cluster counts of the i-th location cluster to 0
        category_loc_rssi_data_num[i] = [0] * int(np.max(category_loc_rssi[i]) + 1)
        for j in range(len(category_loc_rssi[i])):
            # increment the count of the RSSI cluster that the j-th sample belongs to
            category_loc_rssi_data_num[i][int(category_loc_rssi[i][j])] += 1
    # print(category_loc_rssi_data_num)
    # allocate the RSSI-cluster lists for every location cluster
    for i in range(1, len(self.__category_loc_data)):
        # initialize the RSSI-cluster list of the i-th location cluster
        self.__category_loc_rssi_data[i] = [
            [] for i in range(len(category_loc_rssi_data_num[i]))
        ]
        # initialize the j-th RSSI-cluster list of the i-th location cluster
        for j in range(1, len(self.__category_loc_rssi_data[i])):
            self.__category_loc_rssi_data[i][j] = [
                [] for i in range(category_loc_rssi_data_num[i][j])
            ]
    # print(self.__category_loc_rssi_data)
    # start writing data into category_loc_rssi_data
    # __category_loc_rssi_data[i][j][k] is the k-th sample of the j-th RSSI cluster
    # in the i-th location cluster
    # count_loc_rssi[i][j] is the number of samples already written for location
    # cluster i, RSSI cluster j
    count_loc_rssi = [[0] * len(category_loc_rssi_data_num[i])
                      for i in range(category_loc_num)]
    # print(count_loc_rssi)
    # print()
    for i in range(1, category_loc_num):
        for j in range(len(category_loc_rssi[i])):
            ca_loc_rssi = int(category_loc_rssi[i][j])
            # print(i, ' ', ca_loc_rssi)
            # print(len(self.__category_loc_rssi_data[i]), ' ', len(self.__category_loc_rssi_data[i][ca_loc_rssi]))
            if ca_loc_rssi > 0:
                self.__category_loc_rssi_data[i][ca_loc_rssi][count_loc_rssi[i][ca_loc_rssi]] = \
                    self.__category_loc_data[i][j]
                count_loc_rssi[i][ca_loc_rssi] += 1
    print(count_loc_rssi)
    # build a Category object for every location/RSSI cluster
    for i in range(1, category_loc_num):
        for j in range(1, len(category_loc_rssi_data_num[i])):
            # print(i, ' ', j)
            self.__categories.append(Category(self.__category_loc_rssi_data[i][j]))
    print(len(self.__categories))
    for i in range(len(self.__categories)):
        for j in range(len(self.__categories[i].data)):
            print(self.__categories[i].data[j].uploader, ' ',
                  self.__categories[i].data[j].feature_location, ' ', i)
    return True
#####
'''
text_vectors = load('./text_vectors.p')
top = {}
for i in range(len(text_vectors)):
    score = []
    for j in range(len(text_vectors)):
        if i != j:
            score.append((1 - cosine_simailarity(text_vectors[i], text_vectors[j])))
    score.sort()
    top[i] = score[:10]
'''
####
text_vectors = load('./text_vectors.p')
text_vectors = text_vectors[:1000]

labels = dbscan(text_vectors, eps=.1, minPts=10)
save('labels.p', labels)

# get unique labels
n_labels = set(labels)

# create empty clusters
clusters = {}
for n in n_labels:
    if n == -1:
        continue
    else:
        clusters[n] = []

# assign each abstract to a vector based on the labels
for i in range(len(labels)):
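# Not part of the original snippet: the loop above is cut off, so its body is
# unknown. A minimal sketch of one common way to finish this kind of grouping,
# assuming clusters should map each non-noise label to a list of member indices:
for i in range(len(labels)):
    if labels[i] != -1:
        clusters[labels[i]].append(i)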
def c_cluster_points(points, num_points, points_dist):
    return cdbscan.dbscan(points, num_points, points_dist)
def getCWTPeaks(scaledCWT, Y, noiseEst, minSNR=3, minRow=3, minClust=4, EPS=None):
    '''
    returns: N.array(peakLoc), cwtPeakLoc, cClass, boolean

    scaledCWT is the continuous wavelet transform provided by cwtMS function
    minRow is the first row of the cwt to pick peaks from--if this is too small
    you'll get too many peaks and the algorithm will choke.  Keep in mind that
    the first few rows of the CWT are highly correlated with high frequency
    noise--you don't want them anyway.
    '''
    cwtPeakLoc = []

    print "Shape: ", scaledCWT.shape
    revRowArray = N.arange((scaledCWT.shape[0] - 1), 1, -1)  # steps backwards
    for i in revRowArray:
        row = scaledCWT[i]
        if i > minRow:
            normRow = normalize(row)
            rowDeriv = derivative(normRow)
            t3 = time.clock()
            'criterion 1 -- above the threshold and a zero crossing in the derivative'
            criterion = (rowDeriv < 0.5) & (rowDeriv > -0.5) & (normRow >= minNoise)
            tempLocEst = N.where(criterion)[0]
            for m in tempLocEst[:-1]:  # need to exclude last element so we don't get an IndexError for the rowDeriv array
                if N.sign(rowDeriv[m]) > N.sign(rowDeriv[m + 1]):
                    if normRow[m] >= noiseEst[m]:
                        cwtPeakLoc.append([m, i])

    print "Zero Crossing", time.clock() - t3
    cwtPeakLoc = N.array(cwtPeakLoc)
    # ax2.plot(cwtPeakLoc[:,0], cwtPeakLoc[:,1], 'oy', alpha = 0.4)
    # print 'Peak Finding: ', time.clock() - t2

    peakLoc = []

    t3 = time.clock()
    try:
        cClass, tType, Eps, boolAns = dbscan(cwtPeakLoc, minClust, Eps=EPS)
    except:
        errorMsg = 'dbscan error...is your CWT huge?\n'
        errorMsg += "Sorry: %s\n\n%s\n" % (sys.exc_type, sys.exc_value)
        print errorMsg
        return None, None, None, False
    print 'Peak Cluster: ', time.clock() - t3

    if boolAns:
        print cClass.max(), len(tType), Eps
        i = cClass.max()
        for m in xrange(int(i) + 1):
            ind = N.where(m == cClass)
            temp = cwtPeakLoc[ind]
            # ax2.plot(temp[:,0],temp[:,1],'-s', alpha = 0.7, ms = 3)
            if len(temp) > 0:
                sortInd = temp[:, 0].argsort()
                temp = temp[sortInd]
                tempDiffX = N.diff(temp[:, 0])
                tempDiffY = N.diff(temp[:, 1])
                diffSumX = tempDiffX.sum()
                diffSumY = tempDiffY.sum()
                print tempDiffX, diffSumX
                print tempDiffY, diffSumY
                # if diffSumX <= len(tempDiffX)*2:
                i = 0
                rowThresh = 3
                pntPad = 50
                staticCut = 0.2
                for j in tempDiffY:
                    if j <= rowThresh:
                        i += 1
                    else:
                        i += -1
                # if i >= rowThresh:
                if tempDiffY.mean() <= rowThresh:
                    maxInd = temp[:, 1].argmin()
                    xVal = temp[maxInd][0]
                    # this screening assumes there is a low value to the first
                    # scale value e.g. 1 or 2
                    tempVals = Y[(xVal - pntPad):xVal]
                    if len(tempVals) > 0:
                        localMaxInd = tempVals.argmax()
                        # print localMaxInd
                        yMaxInd = xVal - pntPad + localMaxInd
                        # if Y[xVal] >= noiseEst[xVal]:
                        if Y[yMaxInd] >= noiseEst[yMaxInd] * minSNR / 2 and Y[yMaxInd] >= staticCut:
                            # if Y[xVal]>=scaledCWT[0][xVal]*minSNR/2 and Y[xVal] >= noiseEst[xVal]*minSNR/2:
                            peakLoc.append(x[yMaxInd])
                            print "Appended, %s\n" % x[yMaxInd]
                        else:
                            print "too low @ %s\n" % x[xVal]
                else:
                    print "\n"
                    # print x[xVal]

    return N.array(peakLoc), cwtPeakLoc, cClass, True
from sklearn.preprocessing import StandardScaler

##############################################################################
# Generate sample data
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4,
                            random_state=0)

X = StandardScaler().fit_transform(X)
X = X.astype(np.float32)

##############################################################################
# Compute DBSCAN
import dbscan

labels = np.array(dbscan.dbscan(X, "sparse").run(0.3, 10))
core_samples_mask = np.zeros_like(labels, dtype=bool)
# core_samples_mask[db.core_sample_indices_] = True

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

print('Estimated number of clusters: %d' % n_clusters_)

##############################################################################
# Plot result
import matplotlib.pyplot as plt

# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
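# Not part of the original snippet: a minimal cross-check of the custom dbscan
# module above against scikit-learn's DBSCAN with the same eps and min_samples,
# assuming scikit-learn is installed. Only the scikit-learn calls are standard
# API here; the custom module's interface is taken from the snippet itself.
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

X_ref, _ = make_blobs(n_samples=750, centers=[[1, 1], [-1, -1], [1, -1]],
                      cluster_std=0.4, random_state=0)
X_ref = StandardScaler().fit_transform(X_ref)
ref_labels = DBSCAN(eps=0.3, min_samples=10).fit_predict(X_ref)
print('sklearn found %d clusters'
      % (len(set(ref_labels)) - (1 if -1 in ref_labels else 0)))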
evaluateClassification(yClass_test, yClass_knn, 'K-nearest neighbors', displayDetailedView=1)
plotDecisionBoundry(X_test, yClass_test, yClass_knn, 'K-nearest neighbors')

evaluateRegression(yReg_test, yReg_lr, 'Linear Regression')
evaluateRegression(yReg_test, yReg_nr, 'KNN Regression')

X, yClass, yReg = processData(trainTestSplit=2)
classes = ['German Shepherd', 'Daschhund', 'Samoyed', 'Siberian Husky']

y_kmeans = kmeans(X, init='random')
evaluateClustering(X, yClass.values.ravel(), y_kmeans, 'K-means Clustering', classes)

y_dbscan = dbscan(X)
evaluateClustering(X, yClass.values.ravel(), y_dbscan, 'DBSCAN Clustering', classes)

y_agglomerative = agglomerative(X)
evaluateClustering(X, yClass.values.ravel(), y_agglomerative, 'Agglomerative Clustering', classes)

vizualizeData(X_lr, yClass_lr.ravel(), X_rf, yClass_rf.ravel(), X_knn, yClass_knn.ravel(),
              X_linReg, yReg_lr.ravel(), X_knnReg, yReg_nr.ravel(),
              y_kmeans, y_dbscan, y_agglomerative)

print('\nFINAL VERDICT:')
print('\nAccuracy(lr, rf, knn): ', lrAc, rfAc, knnAc)
print('\nScore(lr, nr): ', lrSc, nrSc)
            usagearrays[i, j] = usagearrays[i, j] - avg
    matrice = cosine_similarity(usagearrays)
    matrice = (1. - matrice) / 2.
    return matrice


##################################################################################
ratings = pd.read_csv('ua.base', sep='\t', names=['user', 'movie', 'rating', 'time'])
usagematrix = ratings.pivot_table(index='user', columns='movie', values='rating')
usagematrix = usagematrix.apply(lambda usagematrix: usagematrix.fillna(usagematrix.mean()), axis=1)

matrice = creatMatrice(usagematrix.values)

dbscan_instance = dbscan(matrice, 0.4, 20, ccore=False, data_type='distance_matrix')
dbscan_instance.process()
clusters = dbscan_instance.get_clusters()
noise = dbscan_instance.get_noise()

print(len(clusters))
print(len(noise))

MAE_RMSE(ratings, clusters, 'ua.test')
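# Not part of the original snippet: a small, self-contained sketch of the same
# object-style API on a tiny precomputed distance matrix, assuming the dbscan
# class above is pyclustering.cluster.dbscan.dbscan (the ccore/data_type
# keywords in the snippet match that library's interface).
import numpy as np
from pyclustering.cluster.dbscan import dbscan

points = np.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0], [5.1, 5.0]])
# pairwise Euclidean distance matrix
dist = np.linalg.norm(points[:, None, :] - points[None, :, :], axis=-1)

instance = dbscan(dist.tolist(), 0.5, 1, ccore=False, data_type='distance_matrix')
instance.process()
print(instance.get_clusters())  # expected: two clusters of two points each
print(instance.get_noise())     # expected: no noise points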
def main():
    heading = 0
    latitude = ""
    longitude = ""
    altitude = ""
    with open(_MYPARAMS['GPS_LOG'], 'rb') as gpsfile:
        spamreader = csv.reader(gpsfile, delimiter=',', quotechar='|')
        for row in spamreader:
            found = False
            count = 0
            for name in row:
                count = count + 1
                if name == os.path.basename(_MYPARAMS['IMAGE']):
                    found = True
                if found and count == 5:
                    heading = float(name)
                if found and count == 2:
                    latitude = name
                if found and count == 3:
                    longitude = name
                if found and count == 4:
                    altitude = name
    headingString = headingToString(heading)

    # Initialize dbscan
    myDbscan = dbscan(_MYPARAMS['SIZE_OF_ROI'] / 2)

    PRINT_LOG_OUT = []
    PRINT_LOG_OUT.append("[Date]")
    PRINT_LOG_OUT.append("Date Analyzed = " + strftime("%Y-%m-%d %H:%M:%S"))
    PRINT_LOG_OUT.append("\n[Position]")  # For some reason this code went missing in a commit
    PRINT_LOG_OUT += ["heading = " + headingString]
    PRINT_LOG_OUT += ["headingDegrees = " + str(heading)]
    PRINT_LOG_OUT += ["latitude = " + latitude]
    PRINT_LOG_OUT += ["longitude = " + longitude]
    PRINT_LOG_OUT += ["altitude = " + altitude]
    PRINT_LOG_OUT.append("\n[Analysis Parameters]")

    # print parameters
    PRINT_LOG_OUT += [str(k) + " = " + str(_MYPARAMS[k]) for k in _MYPARAMS.keys()]

    # output folder
    #fileList = os.listdir(_MYPARAMS['OUTPUT_FOLDER'])
    #for fileName in fileList:
    #    os.remove(_MYPARAMS['OUTPUT_FOLDER'] + "/" + fileName)

    # import image from file
    print("Loading Image...")
    imgin = cv2.imread(_MYPARAMS['IMAGE'], cv2.IMREAD_COLOR)
    cv2.imwrite(
        os.path.join(_MYPARAMS['OUTPUT_FOLDER'], os.path.basename(_MYPARAMS['IMAGE'])),
        compressImage(imgin))
    height, width, channels = imgin.shape
    PRINT_LOG_OUT.append("Width = " + str(width))
    PRINT_LOG_OUT.append("Height = " + str(height))

    # Check if image imported correctly
    #if (imgin == None):
    #    print "Image does not exist! Aborting...\n"
    #    return;

    hsv_imgin = cv2.cvtColor(imgin, cv2.COLOR_BGR2HSV)

    # Detect image size [rows, columns]
    _IMBND = (hsv_imgin.shape[0], hsv_imgin.shape[1])

    hsv_chans = cv2.split(hsv_imgin)  # split image into HSV channels

    # Get both blurred and not blurred files for image processing
    if (_MYPARAMS['HAS_BLUR']):
        hsv_chans = [cv2.blur(hsvim, (_MYPARAMS['BKS'], _MYPARAMS['BKS'])) for hsvim in hsv_chans]

    # may use other feature detector for testing
    FD_TYPE = "MSER"
    PRINT_LOG_OUT.append("FD Type = " + FD_TYPE)
    print("Running MSER...")
    # delta, maxArea, minArea, maxVariation, minDiversity, maxEvolution, areaThreshold, minMargin, edgeBlurSize
    # Decreasing maxVariation increases how sharp edges need to be
    my_fd = cv2.MSER_create(
        5,
        _MYPARAMS['MIN_AREA'] / 2738 * _IMBND[0],
        _MYPARAMS['MAX_AREA'] / 2738 * _IMBND[0],
        0.099, 0.65, 200, 1.01, 0.003, 5)
    # Default is 5, 60, 14400, 0.25, 0.2, 200, 1.01, 0.003, 5

    # FD_TYPE = "SimpleBlob"
    # PRINT_LOG_OUT.append("FD Type: " + FD_TYPE)
    # my_fd = cv2.SimpleBlobDetector_create()

    imgClusteredRegions = copy.copy(imgin)

    PRINT_LOG_OUT.append("\n[Channel Keypoints]")
    kpts = []  # (k)ey(p)oin(t) out
    kptsSize = []
    dkpsout = []  # (d)isplay (k)ey(p)oint (out)put
    for i, im in enumerate(hsv_chans):
        local_kpt = my_fd.detect(im, None)  # local keypoints
        if len([x for x in _MYPARAMS['ACTIVE_CHANNEL'] if x == i]) > 0:
            # Outputs image of regions
            vis = im.copy()
            regions = my_fd.detectRegions(im, None)
            hulls = [cv2.convexHull(s.reshape(-1, 1, 2)) for s in regions]
            hullLocations, hullSizes = hulls2Points(hulls)
            cv2.polylines(vis, hulls, 1, (0, 255, 0))
            #cv2.imwrite(os.path.join(_MYPARAMS['OUTPUT_FOLDER'], 'region visualization' + str(i) + '.jpg'), vis)
            for j, point in enumerate(hullLocations):
                kpts.append(point)
                kptsSize.append(hullSizes[j])
        if (local_kpt):
            # don't know how the third param works yet -->
            local_dpksout = cv2.drawKeypoints(im, local_kpt, im)
            dkpsout.append([local_dpksout])  # append to master list
            #cv2.imwrite(os.path.join(_MYPARAMS['OUTPUT_FOLDER'], 'dkpsout' + str(i) + '.jpg'), local_dpksout)
        # print out num of keypoints and other info
        PRINT_LOG_OUT.append('Channel ' + str(i) + ' = ' + str(len(local_kpt)))

    # Crop out ROIs for active_channel
    clusters, clusterSizes = myDbscan.getClusters(kpts, kptsSize)
    averagedClusters = averageClusters(clusters)
    clusterSizes = largestSize(clusters, clusterSizes)

    # Tree filter
    print("Filtering trees...")
    if _MYPARAMS['USE_TREE_FILTER']:
        averagedClusters, clusterSizes = filterTrees(imgin, averagedClusters, clusterSizes)

    imageName = os.path.basename(_MYPARAMS['IMAGE']).split('.')[0]
    croppedImgNames = []
    print("Cropping...")
    for i, mypoint in enumerate(averagedClusters):
        cropSize = clusterSizes[i][1] / 2 if clusterSizes[i][1] / 2 > clusterSizes[i][0] / 2 else clusterSizes[i][0] / 2
        padding = _MYPARAMS['CROP_PADDING']
        row_crop = (clamp(mypoint[0] - cropSize - padding, 0, _IMBND[0]),
                    clamp(mypoint[0] + cropSize + padding, 0, _IMBND[0]))
        col_crop = (clamp(mypoint[1] - cropSize - padding, 0, _IMBND[1]),
                    clamp(mypoint[1] + cropSize + padding, 0, _IMBND[1]))
        new_crop = imgin[row_crop[0]:row_crop[1], col_crop[0]:col_crop[1]]
        croppedImgNames.append(imageName + 'roi' + str(i) + '.jpg')
        cv2.imwrite(os.path.join(_MYPARAMS['OUTPUT_FOLDER'], croppedImgNames[i]), new_crop)

    # Log clustering info
    PRINT_LOG_OUT.append("\n[Crop Info]")
    PRINT_LOG_OUT.append("Number of Crops = " + str(len(averagedClusters)))

    # Write log for the clusters
    for i, cluster in enumerate(averagedClusters):
        PRINT_LOG_OUT.append("\n[Crop " + str(i + 1) + "]")
        PRINT_LOG_OUT.append("Image Name = " + croppedImgNames[i])
        PRINT_LOG_OUT.append("X = " + str(averagedClusters[i][1]))
        PRINT_LOG_OUT.append("Y = " + str(averagedClusters[i][0]))
        cropSize = clusterSizes[i][1] / 2 if clusterSizes[i][1] / 2 > clusterSizes[i][0] / 2 else clusterSizes[i][0] / 2
        padding = _MYPARAMS['CROP_PADDING']
        PRINT_LOG_OUT.append("Size = " + str(2 * (cropSize + padding)))

    # Output cluster locations
    #imgClusteredRegions = drawClusters(imgClusteredRegions, clusters, averagedClusters)
    imgClusteredRegions = drawCroppedRegions(imgClusteredRegions, averagedClusters, clusterSizes)
    cv2.imwrite(os.path.join(_MYPARAMS['OUTPUT_FOLDER'], 'croppedRegions.jpg'), imgClusteredRegions)

    # print result info to log file
    with open(os.path.join(_MYPARAMS['OUTPUT_FOLDER'], imageName + ' Results.ini'), 'a') as f:
        for line in PRINT_LOG_OUT:
            f.write(line + '\n')
def main():
    # process command line arguments and return arguments as args
    args = run_parser()

    # load or build datasets according to arguments
    datasets = ready_datasets(args)

    # build experiment object, including datasets
    exp = experiment(datasets, settings.algorithms)

    # calculate distances for each dataset
    calculate_distances(exp)

    # run an experiment with all algorithms and datasets
    if args.experiment:
        run_experiment(exp)

    # run k-means algorithm on specified datasets
    if args.kmeans:
        clusters = run_kbrain(settings.k[0], "k-means", exp.datasets[0])
        exp.results["k-means"].append(
            (exp.datasets[0].name, settings.maxSamples, 1, settings.k[0], clusters))

    if args.kmedoids:
        clusters = run_kbrain(settings.k[0], "k-medoids", exp.datasets[0])
        exp.results["k-medoids"].append(
            (exp.datasets[0].name, settings.maxSamples, 1, settings.k[0], clusters))

    if args.dbscan:
        # call dbscan wrapper function
        results = dbscan(
            exp.datasets[0],
            settings.maxSamples,
            settings.epsilons[0],
            settings.minPts[0],
        )
        # save results of each experiment
        exp.results["DBSCAN"].append((
            exp.datasets[0].name,
            settings.maxSamples,
            1,
            settings.epsilons[0],
            settings.minPts[0],
            results,
        ))

    # compile results into a dataframe
    resultsDF = compile_results(exp)
    # print(resultsDF.drop(columns=["cluster_list"]))

    # calculate accuracy of our clustering algorithms' results
    # compared to sklearn.dataset dataset labels
    calculate_groundtruth_accuracy(resultsDF, exp)

    # calculate accuracy of our clustering algorithms' results
    # compared to sklearn clustering algorithm labels
    # calculate_sklearn_accuracy(resultsDF, exp)

    print(resultsDF.drop(columns=["cluster_list", "dataset"]))
    save_results(resultsDF)
def run_experiment(exp):
    # loop through each clustering algorithm
    for algo in exp.algorithms:
        # loop through each dataset
        for ds in exp.datasets:
            # loop through the number of datapoints to be used
            for num in settings.numSamples:
                # loop for each trial run
                for i in range(1, settings.numRuns + 1):
                    print("algo: {0}, ds: {1}, size: {2}".format(algo, ds.name, num), end="")
                    startTime = time.perf_counter()

                    if algo == "DBSCAN":
                        # loop parameters unique to dbscan
                        for eps in settings.epsilons:
                            for mp in settings.minPts:
                                # call dbscan with parameters
                                results = dbscan(ds, num, eps, mp)
                                # save results of each experiment
                                exp.results[algo].append((ds.name, num, i, eps, mp, results))

                    if algo == "k-means":
                        for k in range(3, 5):
                            clusters = run_kbrain(k, algo, ds)
                            exp.results[algo].append((ds.name, num, i, k, clusters))

                    if algo == "k-medoids":
                        for k in range(3, 5):
                            clusters = run_kbrain(k, algo, ds)
                            exp.results[algo].append((ds.name, num, i, k, clusters))

                    if algo == "sklearn_kmeans":
                        for numClusters in range(3, 5):
                            results = sklearn_kmeans(ds, numClusters, num)
                            exp.results[algo].append((ds.name, num, i, numClusters, results))

                    if algo == "sklearn_kmedoids":
                        for numClusters in range(3, 5):
                            results = sklearn_kmedoids(ds, numClusters, num)
                            exp.results[algo].append((ds.name, num, i, numClusters, results))

                    if algo == "sklearn_dbscan":
                        # loop parameters unique to dbscan
                        for eps in settings.epsilons:
                            for mp in settings.minPts:
                                # call dbscan with parameters
                                results = sklearn_dbscan(ds, num, eps, mp)
                                # save results of each experiment
                                exp.results[algo].append((ds.name, num, i, eps, mp, results))

                    stopTime = time.perf_counter()
                    print(" {0:3.2} minutes".format((stopTime - startTime) / 60))
        maxVal = max(maxVal, z)

    inputs = rescaleSet(data, minVal, maxVal, 0, 1)  # Scale input data
    (coh, sep) = evalCluster(inputs)
    return (coh, sep)


if __name__ == '__main__':
    variables = (sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5])
    # Type DataSet numParam iterations other
    data = getInputData('DataSets/' + str(variables[1]))
    print("DataSet:", str(variables[1]))
    print("\nInputs:\n" + str(data) + '\n')

    if variables[0] == 'K':
        clusters = kmeans.kmeans(data, int(variables[2]), int(variables[3]))
    elif variables[0] == 'D':
        clusters = dbscan.dbscan(data, 15)
    elif variables[0] == 'C':
        clusters = compLearn.compLearn(data, int(variables[2]), int(variables[3]), float(variables[4]))
    elif variables[0] == 'A':
        clusters = ACO.ACO(data, int(variables[2]), int(variables[3]))
    elif variables[0] == 'P':
        clusters = PSO.PSO(data, int(variables[2]), int(variables[3]))
    else:
        print('Unknown function specified...')
        sys.exit(1)

    print("\nClusters:\n" + str(clusters))
    (coh, sep) = main(clusters)
    print("\nNumClusters:", len(clusters))
    print("\nNumPerCluster:", [len(x) for x in clusters])
    print("\nCohesion:", coh)
    print("\nSeparation: ", sep)
def _create_dbscan(self, sample_data, *args):
    return dbscan.dbscan(sample_data, *args)
                         figsize=(8, 12), sharey=True)
for i, dataset in enumerate(list_dataset):
    print("Dataset: {}".format(dataset))
    for j, model in enumerate(models):
        np.random.seed(31)
        if j == 0:
            axes[i, j].set_ylabel(dataset)
        if dataset == "blobs":
            if model == "K Means":
                mod = kmeans.kmeans(ncluster=3, initialization='kmeans++')
            elif model == "GaussianMM":
                mod = gaussianmm.gaussianmm(ncluster=3, initialization='kmeans++')
            elif model == "DBSCAN":
                mod = dbscan.dbscan(minpts=5, epsilon=0.18)
        elif dataset == "varied_blobs1":
            if model == "K Means":
                mod = kmeans.kmeans(ncluster=3, initialization='kmeans++')
            elif model == "GaussianMM":
                mod = gaussianmm.gaussianmm(ncluster=3, initialization='kmeans++')
            elif model == "DBSCAN":
                mod = dbscan.dbscan(minpts=5, epsilon=0.18)
        elif dataset == "varied_blobs2":
            if model == "K Means":
                mod = kmeans.kmeans(ncluster=3, initialization='kmeans++')
            elif model == "GaussianMM":
                mod = gaussianmm.gaussianmm(ncluster=3, initialization='kmeans++')
            elif model == "DBSCAN":
def _dbscan_metrics_comparison(metrics):
    dbscan.dbscan(metrics, 1000, 2)
    copy.deepcopy(max(dataset, key=lambda d: d.tuple[i]))
    for i in xrange(0, cls_idx)
]

f_norm = open("normData.txt", 'w')
for d in dataset:
    for i in xrange(0, cls_idx):
        d.tuple[i] = float(d.tuple[i] - mins[i].tuple[i]) / (maxs[i].tuple[i] - mins[i].tuple[i])
    f_norm.write(str(d.tuple))
    f_norm.write('\n')

k = int(sys.argv[1])
eps = float(sys.argv[2])

cluster = dbscan.dbscan(dataset, eps, k)
if len(cluster) == 0:
    print 'k:', k, 'no. of cluster:', len(cluster)
    print

pure = evaluation.purity(cluster, len(dataset))
NMI = evaluation.NMI(cluster, dataset)
RI = evaluation.RI(cluster, dataset)
cp = [len(c) for c in cluster]

f_out = open("output.txt", 'w')
for i in xrange(0, len(cluster)):
    print 'cluster:', i, 'no. of pt. in cluster:', cp[i]
    for c in cluster[i]:
data = f.read()
data = data.split("\n")
data = data[1:]
data = [i.split(" ") for i in data]

# x data
u_raw = [i[2] for i in filter(lambda x: len(x) > 4, data)]
# y data
v_raw = [i[3] for i in filter(lambda x: len(x) > 4, data)]

# set data
datax = np.array(v_raw, dtype=np.float64)
datay = np.array(u_raw, dtype=np.float64)

xmin = np.min(datax)
xmax = np.max(datax)
ymin = np.min(datay)
ymax = np.max(datay)

minPoints = 4
epsilon = 40.0

dbs = dbscan(datax, datay, minPoints, epsilon)
dbs.plot(xmin, xmax, ymin, ymax)
# (1) generate data
nsample = 32000
case = "varied_blobs1"
X = create_dataset_sklearn.create_dataset(nsample, case)
array_ndim = np.array([500, 1000, 2000, 4000, 8000, 16000, 32000])
array_time = np.zeros((np.size(array_ndim)))
# (2) generate time data
nrun = 2
for idx in range(np.size(array_ndim)):
    for _ in range(nrun):
        # (2) create model
        minpt = 5
        epsilon = 0.18
        model = dbscan.dbscan(minpt, epsilon, animation=False)
        # (3) fit model
        ndim = array_ndim[idx]
        model.fit(X[:, 0:ndim])
        array_time[idx] += model.time_fit
    print("Dimension: {} Time Fit: {}".format(ndim, array_time[idx]))
# determine power
log_ndim = np.log(array_ndim)
log_time = np.log(array_time)
coeff = np.polyfit(log_ndim, log_time, 1)
p = np.poly1d(coeff)
plogndim = p(log_ndim)
print("Power: {}".format(coeff[0]))
plt.figure()
plt.plot(log_ndim, log_time, "ro", label="Data")
plt.plot(log_ndim, plogndim, "b-", label="Fit")
__author__ = 'arno'

import dbscan
from numpy import ndarray
from numpy import array

# some dummy data
dummydaten = array([[1, 1.5], [1, 1.2], [0.9, 1.2], [8.2, 1.0], [8.3, 0.7],
                    [9.2, 0.7], [-3.3, 5]])

# put in the data we want to use
minNeighbors = 1
epsilon = 0.85
data = dummydaten

# use dbscan
dbscan.dbscan(data, epsilon, minNeighbors)
import plot_data

# (1) load data
iris = data_iris.iris()
X, class_label = iris.load()
# perform pca and reduce dimension to 2
model_pca = pca.pca()
model_pca.fit(X)
R = model_pca.data_reduced_dimension(reduced_dim=2)
plot_data.plot_scatter_class(R, class_label, "Iris Data Projected to 2 Dimensions using PCA", "u0", "u1")
# (2) create model
minpts = 4
epsilon = 0.4
model = dbscan.dbscan(minpts, epsilon)
# (3) fit model
model.fit(R)
print("Time fit: {}".format(model.time_fit))
# (4) results
level = -1
print("Purity: {}".format(metrics.purity(model.clustersave[level], class_label)))
print("Davies-Bouldin: {}".format(metrics.davies_bouldin(R, model.clustersave[level])))
print("Silhouette: {}".format(metrics.silhouette(R, model.clustersave[level])))
model.plot_cluster(nlevel=level, title="DBSCAN Clustering for Iris Dataset reduced to 2d",
                   xlabel="u0", ylabel="u1")
metrics.plot_cluster_distribution(model.clustersave[level], class_label)
from dbscan import dbscan

# Create three gaussian blobs to use as our clustering data.
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4,
                            random_state=0)

X = StandardScaler().fit_transform(X)

###############################################################################
# My implementation of DBSCAN
#

# Run my DBSCAN implementation.
print('Running my implementation...')
my_labels = dbscan(X, eps=0.3, MinPts=10)

###############################################################################
# Scikit-learn implementation of DBSCAN
#

print('Running scikit-learn implementation...')
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
skl_labels = db.labels_

# Scikit-learn uses -1 for NOISE, and starts cluster labeling at 0. I start
# numbering at 1, so increment the skl cluster numbers by 1.
for i in range(0, len(skl_labels)):
    if not skl_labels[i] == -1:
        skl_labels[i] += 1
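# Not part of the original snippet: a small follow-up check, assuming the two
# label arrays above (my_labels, skl_labels) are available. adjusted_rand_score
# is invariant to label permutation, so it compares the clusterings directly;
# numpy.array_equal additionally checks that the shifted labels match element-wise.
import numpy as np
from sklearn.metrics import adjusted_rand_score

print('ARI between implementations: %.3f'
      % adjusted_rand_score(my_labels, skl_labels))
print('Labels identical after shift: %s'
      % np.array_equal(np.asarray(my_labels), np.asarray(skl_labels)))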
import dbscan
import sys

input_file = sys.argv[1]
cluster_num = int(sys.argv[2])
epsilon = int(sys.argv[3])
minpts = int(sys.argv[4])

point_list = dbscan.parser(input_file)
dbscan.get_near_point(point_list, epsilon, minpts)
mark_list = dbscan.dbscan(point_list, cluster_num, epsilon, minpts)
dbscan.post_clustering(cluster_num, mark_list, point_list)

for i in range(0, cluster_num):
    with open(input_file.split('.')[0] + '_cluster_' + str(i) + '.txt', 'w') as writefile:
        writefile.seek(0)
        for index, num in enumerate(mark_list):
            if num is not None and num == i:
                writefile.write(str(index) + '\n')
import dbscan

dbscan.dbscan([(1, 7), (3, 4), (7, 8), (4, 5), (1, 1), (2, 2)], 5, 7.5)
for d in dataset:
    for i in xrange(0, cls_idx - 1):
        d.tuple[i] = float(d.tuple[i] - mins[i].tuple[i]) / (maxs[i].tuple[i] - mins[i].tuple[i])

# print all combination of k and sd
for k in xrange(2, 5):
    for sd_away in xrange(0, 5):
        try:
            kdist = dbscan.k_distance(dataset, k, sd_away)
        except:
            print 'kdist anchor out of range, skipped'
            print
            continue
        cluster = dbscan.dbscan(dataset, kdist, k)
        if len(cluster) == 0:
            print 'k:', k, 'sd:', sd_away, 'kdist:', kdist, 'no. of cluster:', len(cluster)
            print
            continue
        cc = [measure.cls_err(c) for c in cluster]
        errs = []
        clss = []
        for err, cls in cc:
            errs.append(err)
            clss.append(cls)
        cp = [len(c) for c in cluster]
        print 'k:', k, 'sd:', sd_away, 'kdist:', kdist, 'no. of cluster:', len(cluster)
        for i in xrange(0, len(cluster)):
reload(hp)
test_data = np.mat(hp.loadDataSet(file_name))
if plot_origin == 1:
    test_data_show = hp.degree2radian(test_data, 0)
    plt.createOrigin(test_data_show.A)
else:
    # get and transfer Data
    test_data_radian = hp.degree2radian(test_data, -1)
    test_data_vector = hp.orientation2vector(test_data_radian)

    # run dbscan Algorithm
    print("******************* start dbscan ********************")
    print("")
    cluster_result, noise_result, k = dbs.dbscan(test_data_vector, eps, min_pts)

    # transfer result
    result_data = np.mat(np.zeros((cluster_result.shape[0], 3)))
    result_data[:, 0:2] = hp.vector2orientation(cluster_result[:, 0:3])
    result_data[:, -1] = cluster_result[:, -1]
    result_data = hp.degree2radian(result_data, 0)
    noise_data = hp.vector2orientation(noise_result[:, 0:3])
    noise_data = hp.degree2radian(noise_data, 0)

    # print and plot result
    print("****************** cluster result ******************")
    print("")
    print(result_data)
    print("")
    print("****************** cluster result ******************")