def dbscanV2(display):
    pixarr = get_display_matrix(display)
    arr = numpy.array(pixarr)
    pt_locations = numpy.where(arr == 0)
    points = []
    for i in range(len(pt_locations[0])):
        points.append((pt_locations[0][i], pt_locations[1][i]))
    cdbscan.dbscan(points, 5, 10)
Example No. 2
def dbscanV2(display):
    pixarr = get_display_matrix(display)
    arr = numpy.array(pixarr)
    pt_locations = numpy.where(arr == 0)
    points = []
    for i in range(len(pt_locations[0])):
        points.append((pt_locations[0][i], pt_locations[1][i]))
    cdbscan.dbscan(points, 5, 10)
def cluster_stay_points(all_spts):
    print('Clustering preprocessing has been started')
    all_lats = []
    all_longs = []
    all_cords = []
    for point in all_spts:
        all_lats.append(point[0])
        all_longs.append(point[1])
    all_cords.append(all_lats)
    all_cords.append(all_longs)
    dbscan(all_cords, 200, 5)
Example No. 4
def use_dbscan(x_pca, y):
    data = x2nodes(x_pca)
    dbscan(data, eps=3.1, min_points=6)

    y_pred = []
    for i in data:
        y_pred.append(i.label)

    score = get_score(y, y_pred)
    print(score)
    title = 'dbscan-mnist-score-' + str(score)
    db_show(data, title)
Example No. 5
def dbscan_clustering(filename, epsilon, min_pts):
    sample = read_data(filename)
    dbscan_instance = dbscan(sample, epsilon, min_pts)
    dbscan_instance.process()
    clusters = dbscan_instance.get_clusters()
    outliers = dbscan_instance.get_outliers()
    print("Clusters :\n", clusters)
    print("Outliers :\n", outliers)
Example No. 6
def data():
    resp = "{}"
    headers = dict()
    last = 0
    sensors = [fetch.SENSOR_1_NAME, fetch.SENSOR_2_NAME]
    for fn in sensors:
        lu = fetch.lastupdate(fn)
        if not lu:
            continue
        if not last:
            last = lu
        else:
            last = max(last, lu)
    lm = time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime(last))
    headers['Last-Modified'] = lm
    ims = bottle.request.environ.get('HTTP_IF_MODIFIED_SINCE')
    if ims:
        ims = bottle.parse_date(ims.split(";")[0].strip())
    if ims is not None and ims >= last:
        headers['Date'] = time.strftime("%a, %d %b %Y %H:%M:%S GMT",
                                        time.gmtime())
        return bottle.HTTPResponse(status=304, **headers)
    resp = {}
    # ensure we match readouts
    for fn in sensors:
        data = fetch.rrdfetch(fn)
        if "time" not in resp:
            resp["time"] = data["time"]
        resp[fn] = data
    # does time diverge?
    time1 = set(resp[sensors[0]]["time"])
    time2 = set(resp[sensors[1]]["time"])
    for fn in sensors:
        sensordata = resp[fn]
        sensordata.pop("time")
        # Remove outliers
        for name in sensordata.keys():
            data = sensordata[name]
            if name == "volt":
                eps = 0.01
            else:
                eps = 2
            outliers = dbscan(data, eps, 3)
            for i in outliers:
                data[i] = None

    # todo: time might diverge +/- one tick
    #if time1 != time2:
    #    print(time1-time2, time2-time1)
    #    return bottle.HTTPResponse(status=500)

    resp = json.dumps(resp).encode("utf-8")
    headers['Content-Length'] = len(resp)
    return bottle.HTTPResponse(resp, **headers)
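The handler above implements HTTP conditional GET: it advertises Last-Modified and answers 304 when the client's If-Modified-Since is at least as recent as the newest sensor update. A rough client-side sketch of that path, assuming the requests library; the URL is a placeholder:

import requests

first = requests.get("http://localhost:8080/data")            # placeholder URL
stamp = first.headers.get("Last-Modified")
second = requests.get("http://localhost:8080/data",
                      headers={"If-Modified-Since": stamp})
print(second.status_code)   # expected 304 while the sensors have no newer data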
Example No. 7
def dbscanV2(display, eps, threshold_num):
    pixarr = get_display_matrix(display)
    arr = numpy.array(pixarr)
    pt_locations = numpy.where(arr == 0)
    points = []
    for i in range(len(pt_locations[0])):
        points.append((pt_locations[0][i], pt_locations[1][i]))
    clusters = cdbscan.dbscan(points, threshold_num, eps)
    #    for cluster in clusters:
    #        print cluster
    return clusters
Example No. 8
def streamProc(folderName):
    mkdirSafe(join(folderName, "snapshot"))
    mkdirSafe(join(folderName, "clusters"))
    mkdirSafe(join(folderName, "pattern"))
    mkdirSafe(join(folderName, "img"))

    snapshotFolder = join(folderName, "snapshot")
    clusterFolder = join(folderName, "clusters")
    patternFolder = join(folderName, "pattern")
    imgFolder = join(folderName, "img")

    length = 150

    interpolate.takeSnapshot(folderName, snapshotFolder)
    dbscan.dbscan(snapshotFolder, clusterFolder)
    intersect.Intersect(clusterFolder, patternFolder, frameLength=length).intersect()
    showLinePattern.RenderPattern(patternFolder, folderName, imgFolder, "./frameImg/", frameLength=length).renderPattern()
    shutil.move(imgFolder, join("output", folderName.replace('/','')[5:]))
    shutil.rmtree(snapshotFolder)
    shutil.rmtree(clusterFolder)
    shutil.rmtree(patternFolder)
def dbscanV2(display, eps, threshold_num):
    pixarr = get_display_matrix(display)
    arr = numpy.array(pixarr)
    pt_locations = numpy.where(arr == 0)
    points = []

    # the weird indexing is not an accident. See
    # the documentation for numpy.where
    for i in range(len(pt_locations[0])):
        points.append((pt_locations[0][i], pt_locations[1][i]))

    clusters = cdbscan.dbscan(points, threshold_num, eps)
    return clusters
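As the comment notes, numpy.where on a 2-D array returns one index array per axis; pairing element i of each array yields one (row, col) coordinate, which is exactly what the loop builds. A minimal illustration:

import numpy

arr = numpy.array([[0, 255],
                   [255, 0]])
rows, cols = numpy.where(arr == 0)
# pairing the per-axis index arrays gives the same (row, col) points as the loop above
print([(int(r), int(c)) for r, c in zip(rows, cols)])   # [(0, 0), (1, 1)]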
Example No. 10
def main():
    n = 5000
    (X, y) = generator(n)
    X = X.astype('float32')
    y = y.astype('int32')
    y = y.reshape(1, n)

    show_result(kmeans(X, y, 2))
    show_result(spectralclustering(X, y, 2))
    show_result(agglomerativeclustering(X, y, 2))
    show_result(dbscan(X, y, 0.002, 2))
    show_result(birch(X, y, 2))
    show_result(minibatchkmeans(X, y, 2))
def dbscanV2(display, eps, threshold_num):
    pixarr = get_display_matrix(display)
    arr = numpy.array(pixarr)
    pt_locations = numpy.where(arr == 0)
    points = []

    # the weird indexing is not an accident. See
    # the documentation for numpy.where
    for i in range(len(pt_locations[0])):
        points.append((pt_locations[0][i], pt_locations[1][i]))

    clusters = cdbscan.dbscan(points, threshold_num, eps)
    return clusters
Example No. 12
def run(filename):
    data = get_data_set(filename)

    assignments = dbscan.dbscan(data, 0.92, 5)

    fileout = open('result/dbscan-' + filename[7:-4] + '.txt', 'w')

    dataset = data.T

    for i in range(len(assignments)):
        fileout.write(
            str(dataset[i][0]) + ',' + str(dataset[i][1]) + ',' +
            str(assignments[i]) + '\n')

    print(' ')
Example No. 13
def get_clusters(good_urls, urls, n_urls=500, my_dbs=False, verbose=False):
    """ """
    random.shuffle(good_urls)
    random.shuffle(urls)

    fit_urls = good_urls[:n_urls] + urls[:n_urls]

    mysekitei = sekitei(fit_urls, alpha=0.01)
    mysekitei.fit()

    X = mysekitei.most_freq_features()
    
    if my_dbs: py = dbscan().fit_predict(X)
    else:      py = DBSCAN().fit_predict(X)

    hist = []
    clusters = list( set(py) )
    with open('data/clusters_features.txt', 'w') as file:
        
        print(mysekitei.n_features, file=file)
        print('\n\n\n', '\n'.join(mysekitei.tags_order[:mysekitei.n_features]), '\n\n', file=file)

        for c in clusters:
            hist.append(len([p for p in py if p == c]))
            # print >>f, c, ':', hist[-1]

    vizualize_clusters(X, ([1] * n_urls + [0] * n_urls), py, hist)
    
    regexpes = mysekitei.get_clusters_regexpes(X, py)

    with open('data/clusters_freq_features.txt', 'w') as file:
        print('n_features=', mysekitei.n_features, '\n\n')
        print('n_features=', mysekitei.n_features, '\n\n', file=file)
        for c, f, i in regexpes:
            print('---', c, '=', str(len(f)))
            print('\n'.join([fi + '\t\t\t ' + str(ii) for fi, ii in zip(f, i)]), '\n')
            print('---', c, '=', str(len(f)), file=file)
            print('\n'.join([fi + '\t\t\t ' + str(ii) for fi, ii in zip(f, i)]), '\n', file=file)

    with open('data/united_regexpes.txt', 'w') as file:
        for k,f,i in regexpes:
            rex = '^'
            for r in f[:-1]:
                rex += '(?=%s)' % r.strip('^').rstrip('$')
            rex += '%s' % f[-1].strip('^')
            print(k, '=', rex, file=file)

    return mysekitei, regexpes
Example No. 14
    def __division_loc(self):
        # compute the cluster label of every object
        category_loc = dbscan(self.__historical_data, 1.5, 3,
                              FingerprintSequence.distance_loc)
        category_loc_num = int(np.max(category_loc + 1))
        if category_loc_num == 1:
            return False
        '''
        print(category_loc)
        print(category_loc_num)

        for i in range(len(self.__historical_data)):
            t = self.__historical_data[i]
            print(t.uploader, ' ', i, ' ', category_loc[i])
            pprint(t.feature_location)
            pprint(t.feature_avg)
            pprint(t.feature_std)
            pprint(t.feature_max)
        '''
        self.__category_loc_data = [0] * category_loc_num
        # count the number of samples in each cluster
        category_loc_data_num = [0] * category_loc_num
        counts_loc = [0] * category_loc_num
        for i in range(len(category_loc)):
            category_loc_data_num[int(category_loc[i])] += 1
        # initialize the data list for each cluster
        for i in range(category_loc_num):
            self.__category_loc_data[i] = [0] * category_loc_data_num[i]
        # fill each location cluster; __category_loc_data[i][j] is the j-th sample of the i-th location cluster
        for i in range(category_loc.shape[0]):
            self.__category_loc_data[int(category_loc[i])][int(counts_loc[int(category_loc[i])])] = \
                self.__historical_data[i]
            counts_loc[int(category_loc[i])] += 1
        '''
        for i in range(len(self.__category_loc_data)):
            # print(len(self.__category_loc_data[i]))
            for j in range(len(self.__category_loc_data[i])):
                t = self.__category_loc_data[i][j]
                print(t.uploader, ' ', i)
                pprint(t.feature_location)
                pprint(t.feature_avg)
                pprint(t.feature_std)
                pprint(t.feature_max)
        '''

        # print(counts_loc)
        return True
def get_vectors(image, eps, threshold):
    pil_image = image
    image_array = numpy.array(pil_image)
    pt_locations = numpy.where(image_array != 0)
    points = []
    for i in range(len(pt_locations[0])):
        points.append((pt_locations[0][i], pt_locations[1][i]))
    clusters = cdbscan.dbscan(points, threshold, eps)
    images = []
    image_vectors = []
    i = 0
    for cluster in clusters:
        images.append(cluster_to_square_image(cluster))
    for image in images:
        resized = image.resize((28, 28), Image.ANTIALIAS)
        vec = numpy.array(resized).ravel()
        vec = replace_negatives(vec)
        vec = scale_to_max_val(vec, max_val=255)
        image_vectors.append(vec)
        i += 1
    return image_vectors
def get_vectors(image, eps, threshold):
    pil_image = image
    image_array = numpy.array(pil_image)
    pt_locations = numpy.where(image_array != 0)
    points = []
    for i in range(len(pt_locations[0])):
        points.append((pt_locations[0][i], pt_locations[1][i]))
    clusters = cdbscan.dbscan(points, threshold, eps)
    images = []
    image_vectors = []
    i = 0
    for cluster in clusters:
        images.append(cluster_to_square_image(cluster))
    for image in images:
        resized = image.resize((28, 28), Image.ANTIALIAS)
        vec = numpy.array(resized).ravel()
        vec = replace_negatives(vec)
        vec = scale_to_max_val(vec, max_val=255)
        image_vectors.append(vec)
        i += 1
    return image_vectors
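A hypothetical way to call get_vectors; the image path and the eps/threshold values are placeholders, and each returned vector has 28*28 = 784 entries because of the resize-and-ravel step above:

from PIL import Image

img = Image.open('digits.png').convert('L')        # placeholder grayscale image
vectors = get_vectors(img, eps=5, threshold=10)    # placeholder clustering parameters
print(len(vectors), len(vectors[0]))               # number of clusters, 784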
Example No. 17
def get_clusters(good_urls, urls, n_urls=500, my_dbs=False, verbose=False):
    """ """
    random.shuffle(good_urls)
    random.shuffle(urls)

    fit_urls = good_urls[:n_urls] + urls[:n_urls]

    mysekitei = sekitei(fit_urls, alpha=0.01)
    mysekitei.fit()

    X = mysekitei.most_freq_features()
    
    if my_dbs: py = dbscan().fit_predict(X)
    else:      py = DBSCAN().fit_predict(X)
        
    regexpes = mysekitei.get_clusters_regexpes(X, py)

    print('n_features=', mysekitei.n_features, '\n\n')
    for c, f, i in regexpes:
        print('---', c, '=', str(len(f)))
        print('\n'.join([fi + '\t\t\t ' + str(ii) for fi, ii in zip(f, i)]), '\n')

    return mysekitei, regexpes
Example No. 18
def populate_probes(probes):
    logging.debug('Populate Probes - Started.')
    for probe_id in probes:
        probe = probes[probe_id]
        shelves, noise = dbscan.dbscan(probe.products, 1, eps, dist, sort_key)
        probe.set_shelves(shelves)
        # logging.debug(str(map(lambda sh: map(lambda pr: pr.id, sh), shelves)))
        # logging.debug(str(map(lambda sh: map(lambda pr: pr.patch_url, sh), shelves)))
        probe.build_relations()

        # build matrices, not sure if it is necessary
        curr = probe.products
        n = len(curr)
        rights = np.zeros((n, n))
        lefts = np.zeros((n, n))
        for i in range(0, n):
            product = curr[i]
            for j in range(0, n):
                neighbour = curr[j].id
                rights[i][j] = int(neighbour in product.relations[rel_right])
                lefts[i][j] = int(neighbour in product.relations[rel_left])
        probe.set_rights(rights)
        probe.set_lefts(lefts)
    logging.debug('Populate Probes - Ended.')
Example No. 19
def fextract(imgin):
    """ 
        Args:
            imgin - image in
    """

    # Initialize dbscan
    myDbscan = dbscan(_MYPARAMS["SIZE_OF_ROI"] / 2)

    PRINT_LOG_OUT = []
    PRINT_LOG_OUT.append("[Date]")
    PRINT_LOG_OUT.append("Date Analyzed = " + strftime("%Y-%m-%d %H:%M:%S"))
    PRINT_LOG_OUT.append("\n[Analysis Parameters]")
    # print parameters
    PRINT_LOG_OUT += [str(k) + " = " + str(_MYPARAMS[k]) for k in _MYPARAMS.keys()]

    cv2.imwrite(os.path.join("Output", _MYPARAMS["IMAGE"]), imgin)

    height, width, channels = imgin.shape
    PRINT_LOG_OUT.append("Width = " + str(width))
    PRINT_LOG_OUT.append("Height = " + str(height))

    # Check if image imported correctly
    # if (imgin == None):
    #    print "Image does not exist! Aborting...\n"
    #    return;

    hsv_imgin = cv2.cvtColor(imgin, cv2.COLOR_BGR2HSV)

    # Detect image size [rows, columns]
    _IMBND = (hsv_imgin.shape[0], hsv_imgin.shape[1])

    hsv_chans = cv2.split(hsv_imgin)
    # split image into HSV channels

    # Get both blurred and not blurred files for image processing
    if _MYPARAMS["HAS_BLUR"]:
        hsv_chans = [cv2.blur(hsvim, (_MYPARAMS["BKS"], _MYPARAMS["BKS"])) for hsvim in hsv_chans]

    # may use other feature detector for testing
    FD_TYPE = "MSER"
    PRINT_LOG_OUT.append("FD Type = " + FD_TYPE)
    print("Running MSER...")
    # delta, maxArea, minArea, maxVariation, minDiversity, maxEvolution, areaThreshold, minMargin, edgeBlurSize
    # Default is 5, 60, 14400, 0.25, 0.2, 200, 1.01, 0.003, 5
    # Decreasing maxVariation increases how sharp edges need to be
    my_fd = cv2.MSER_create(
        5,  # _delta (int)
        int(_MYPARAMS["MIN_AREA"] / 2738 * _IMBND[0]),  # _min_area (int)
        int(_MYPARAMS["MAX_AREA"] / 2738 * _IMBND[0]),  # _max_area (int)
        0.099,  # _max_variation (float)
        0.65,  # _min_diversity (float)
        200,  # _max_evolution (int)
        1.01,  # _area_threshold (double)
        0.003,  # _min_margin (double)
        5,  # _edge_blur_size (int)
    )

    # FD_TYPE = "SimpleBlob"
    # PRINT_LOG_OUT.append("FD Type: " + FD_TYPE)
    # my_fd = cv2.SimpleBlobDetector_create()

    imgClusteredRegions = copy.copy(imgin)

    PRINT_LOG_OUT.append("\n[Channel Keypoints]")

    kpts = []  # (k)ey(p)oin(t) out
    kptsSize = []
    dkpsout = []  # (d)isplay (k)ey(p)oint (out)put
    for i, im in enumerate(hsv_chans):
        local_kpt = my_fd.detect(im, None)  # local keypoints

        if len([x for x in _MYPARAMS["ACTIVE_CHANNEL"] if x == i]) > 0:
            # Outputs image of regions
            vis = im.copy()
            regions = my_fd.detectRegions(im, None)
            hulls = [cv2.convexHull(s.reshape(-1, 1, 2)) for s in regions]
            hullLocations, hullSizes = hulls2Points(hulls)
            cv2.polylines(vis, hulls, 1, (0, 255, 0))
            # cv2.imwrite(os.path.join("Output", 'region visualization' + str(i) + '.jpg'), vis)

            for j, point in enumerate(hullLocations):
                kpts.append(point)
                kptsSize.append(hullSizes[j])

        if local_kpt:
            # don't know how the third param works yet  -->
            local_dpksout = cv2.drawKeypoints(im, local_kpt, im)
            dkpsout.append([local_dpksout])  # append to master list
            # cv2.imwrite(os.path.join("Output", 'dkpsout' + str(i) + '.jpg'), local_dpksout)

            # print out num of keypoints and other info
            PRINT_LOG_OUT.append("Channel " + str(i) + " = " + str(len(local_kpt)))

    # Crop out ROIs for active_channel
    clusters, clusterSizes = myDbscan.getClusters(kpts, kptsSize)
    averagedClusters = averageClusters(clusters)
    clusterSizes = largestSize(clusters, clusterSizes)
    # Tree filter
    print("Filtering trees...")
    if _MYPARAMS["USE_TREE_FILTER"]:
        averagedClusters, clusterSizes = filterTrees(imgin, averagedClusters, clusterSizes)

    imageName = _MYPARAMS["IMAGE"].split(".")[0]

    croppedImgNames = []
    print("Cropping...")
    for i, mypoint in enumerate(averagedClusters):
        cropSize = clusterSizes[i][1] / 2 if clusterSizes[i][1] / 2 > clusterSizes[i][0] / 2 else clusterSizes[i][0] / 2
        padding = _MYPARAMS["CROP_PADDING"]
        row_crop = (
            clamp(mypoint[0] - cropSize - padding, 0, _IMBND[0]),
            clamp(mypoint[0] + cropSize + padding, 0, _IMBND[0]),
        )
        col_crop = (
            clamp(mypoint[1] - cropSize - padding, 0, _IMBND[1]),
            clamp(mypoint[1] + cropSize + padding, 0, _IMBND[1]),
        )
        new_crop = imgin[row_crop[0] : row_crop[1], col_crop[0] : col_crop[1]]
        croppedImgNames.append(imageName + "roi" + str(i) + ".jpg")
        cv2.imwrite(os.path.join("Output", croppedImgNames[i]), new_crop)

    # Log clustering info
    PRINT_LOG_OUT.append("\n[Crop Info]")
    PRINT_LOG_OUT.append("Number of Crops = " + str(len(averagedClusters)))

    # Write log for the clusters
    for i, cluster in enumerate(averagedClusters):
        PRINT_LOG_OUT.append("\n[Crop " + str(i + 1) + "]")
        PRINT_LOG_OUT.append("Image Name = " + croppedImgNames[i])
        PRINT_LOG_OUT.append("X = " + str(averagedClusters[i][1]))
        PRINT_LOG_OUT.append("Y = " + str(averagedClusters[i][0]))
        cropSize = clusterSizes[i][1] / 2 if clusterSizes[i][1] / 2 > clusterSizes[i][0] / 2 else clusterSizes[i][0] / 2
        padding = _MYPARAMS["CROP_PADDING"]
        PRINT_LOG_OUT.append("Size = " + str(2 * (cropSize + padding)))

    # Output cluster locations
    # imgClusteredRegions = drawClusters(imgClusteredRegions, clusters, averagedClusters)
    imgClusteredRegions = drawCroppedRegions(imgClusteredRegions, averagedClusters, clusterSizes)
    cv2.imwrite(os.path.join("Output", "croppedRegions.jpg"), imgClusteredRegions)

    # print result info to log file
    with open(os.path.join("Output", imageName + " .ini"), "a") as f:
        for line in PRINT_LOG_OUT:
            f.write(line + "\n")
Example No. 20
def fextract(imgin):
    """ 
        Args:
            imgin - image in
    """

    # Initialize dbscan
    myDbscan = dbscan(_MYPARAMS['SIZE_OF_ROI'] / 2)

    PRINT_LOG_OUT = []
    PRINT_LOG_OUT.append("[Date]")
    PRINT_LOG_OUT.append("Date Analyzed = " + strftime("%Y-%m-%d %H:%M:%S"))
    PRINT_LOG_OUT.append("\n[Analysis Parameters]")
    # print parameters
    PRINT_LOG_OUT += [
        str(k) + " = " + str(_MYPARAMS[k]) for k in _MYPARAMS.keys()
    ]

    cv2.imwrite(os.path.join("Output", _MYPARAMS['IMAGE']), imgin)

    height, width, channels = imgin.shape
    PRINT_LOG_OUT.append("Width = " + str(width))
    PRINT_LOG_OUT.append("Height = " + str(height))

    # Check if image imported correctly
    #if (imgin == None):
    #    print "Image does not exist! Aborting...\n"
    #    return;

    hsv_imgin = cv2.cvtColor(imgin, cv2.COLOR_BGR2HSV)

    # Detect image size [rows, columns]
    _IMBND = (hsv_imgin.shape[0], hsv_imgin.shape[1])

    hsv_chans = cv2.split(hsv_imgin)
    # split image into HSV channels

    # Get both blurred and not blurred files for image processing
    if (_MYPARAMS['HAS_BLUR']):
        hsv_chans = [
            cv2.blur(hsvim, (_MYPARAMS['BKS'], _MYPARAMS['BKS']))
            for hsvim in hsv_chans
        ]

    # may use other feature detector for testing
    FD_TYPE = "MSER"
    PRINT_LOG_OUT.append("FD Type = " + FD_TYPE)
    print("Running MSER...")
    # delta, maxArea, minArea, maxVariation, minDiversity, maxEvolution, areaThreshold, minMargin, edgeBlurSize
    # Default is 5, 60, 14400, 0.25, 0.2, 200, 1.01, 0.003, 5
    # Decreasing maxVariation increases how sharp edges need to be
    my_fd = cv2.MSER_create(
        5,  # _delta (int)
        int(_MYPARAMS['MIN_AREA'] / 2738 * _IMBND[0]),  # _min_area (int)
        int(_MYPARAMS['MAX_AREA'] / 2738 * _IMBND[0]),  # _max_area (int)
        0.099,  # _max_variation (float)
        0.65,  # _min_diversity (float)
        200,  # _max_evolution (int)
        1.01,  # _area_threshold (double)
        0.003,  # _min_margin (double)
        5  # _edge_blur_size (int)
    )

    # FD_TYPE = "SimpleBlob"
    # PRINT_LOG_OUT.append("FD Type: " + FD_TYPE)
    # my_fd = cv2.SimpleBlobDetector_create()

    imgClusteredRegions = copy.copy(imgin)

    PRINT_LOG_OUT.append("\n[Channel Keypoints]")

    kpts = []  # (k)ey(p)oin(t) out
    kptsSize = []
    dkpsout = []  # (d)isplay (k)ey(p)oint (out)put
    for i, im in enumerate(hsv_chans):
        local_kpt = my_fd.detect(im, None)  # local keypoints

        if len([x for x in _MYPARAMS['ACTIVE_CHANNEL'] if x == i]) > 0:
            # Outputs image of regions
            vis = im.copy()
            regions = my_fd.detectRegions(im, None)
            hulls = [cv2.convexHull(s.reshape(-1, 1, 2)) for s in regions]
            hullLocations, hullSizes = hulls2Points(hulls)
            cv2.polylines(vis, hulls, 1, (0, 255, 0))
            #cv2.imwrite(os.path.join("Output", 'region visualization' + str(i) + '.jpg'), vis)

            for j, point in enumerate(hullLocations):
                kpts.append(point)
                kptsSize.append(hullSizes[j])

        if (local_kpt):
            # don't know how the third param works yet  -->
            local_dpksout = cv2.drawKeypoints(im, local_kpt, im)
            dkpsout.append([local_dpksout])  # append to master list
            #cv2.imwrite(os.path.join("Output", 'dkpsout' + str(i) + '.jpg'), local_dpksout)

            # print out num of keypoints and other info
            PRINT_LOG_OUT.append('Channel ' + str(i) + ' = ' +
                                 str(len(local_kpt)))

    # Crop out ROIs for active_channel
    clusters, clusterSizes = myDbscan.getClusters(kpts, kptsSize)
    averagedClusters = averageClusters(clusters)
    clusterSizes = largestSize(clusters, clusterSizes)
    # Tree filter
    print("Filtering trees...")
    if _MYPARAMS['USE_TREE_FILTER']:
        averagedClusters, clusterSizes = filterTrees(imgin, averagedClusters,
                                                     clusterSizes)

    imageName = _MYPARAMS['IMAGE'].split('.')[0]

    croppedImgNames = []
    print("Cropping...")
    for i, mypoint in enumerate(averagedClusters):
        cropSize = clusterSizes[i][1] / 2 if clusterSizes[i][
            1] / 2 > clusterSizes[i][0] / 2 else clusterSizes[i][0] / 2
        padding = _MYPARAMS['CROP_PADDING']
        row_crop = (clamp(mypoint[0] - cropSize - padding, 0, _IMBND[0]),
                    clamp(mypoint[0] + cropSize + padding, 0, _IMBND[0]))
        col_crop = (clamp(mypoint[1] - cropSize - padding, 0, _IMBND[1]),
                    clamp(mypoint[1] + cropSize + padding, 0, _IMBND[1]))
        new_crop = imgin[row_crop[0]:row_crop[1], col_crop[0]:col_crop[1]]
        croppedImgNames.append(imageName + 'roi' + str(i) + '.jpg')
        cv2.imwrite(os.path.join("Output", croppedImgNames[i]), new_crop)

    # Log clustering info
    PRINT_LOG_OUT.append("\n[Crop Info]")
    PRINT_LOG_OUT.append("Number of Crops = " + str(len(averagedClusters)))

    # Write log for the clusters
    for i, cluster in enumerate(averagedClusters):
        PRINT_LOG_OUT.append("\n[Crop " + str(i + 1) + "]")
        PRINT_LOG_OUT.append("Image Name = " + croppedImgNames[i])
        PRINT_LOG_OUT.append("X = " + str(averagedClusters[i][1]))
        PRINT_LOG_OUT.append("Y = " + str(averagedClusters[i][0]))
        cropSize = clusterSizes[i][1] / 2 if clusterSizes[i][
            1] / 2 > clusterSizes[i][0] / 2 else clusterSizes[i][0] / 2
        padding = _MYPARAMS['CROP_PADDING']
        PRINT_LOG_OUT.append("Size = " + str(2 * (cropSize + padding)))

    # Output cluster locations
    #imgClusteredRegions = drawClusters(imgClusteredRegions, clusters, averagedClusters)
    imgClusteredRegions = drawCroppedRegions(imgClusteredRegions,
                                             averagedClusters, clusterSizes)
    cv2.imwrite(os.path.join("Output", 'croppedRegions.jpg'),
                imgClusteredRegions)

    # print result info to log file
    with open(os.path.join("Output", imageName + ' .ini'), 'a') as f:
        for line in PRINT_LOG_OUT:
            f.write(line + '\n')
Example No. 21
    def __division_rssi(self):
        # total number of location clusters
        category_loc_num = len(self.__category_loc_data)
        category_loc_rssi = [None] * category_loc_num
        # print(category_loc_num)
        # print(category_loc_rssi)
        # compute the RSSI cluster index of every sample within each location cluster; RSSI indices restart from 0 in every location cluster
        for i in range(1, len(self.__category_loc_data)):
            # category_loc_rssi[i][j]: RSSI cluster index of the j-th sample in the i-th location cluster
            category_loc_rssi[i] = dbscan(self.__category_loc_data[i], 5, 2,
                                          FingerprintSequence.distance_rssi)
            # print(category_loc_rssi[i])
        self.__category_loc_rssi_data = [None] * category_loc_num

        # count how many samples fall into each RSSI cluster of every location cluster
        # category_loc_rssi_data_num[i][j]: number of samples in the j-th RSSI cluster of the i-th location cluster
        category_loc_rssi_data_num = [[] for i in range(category_loc_num)]
        for i in range(1, category_loc_num):
            # initialize all RSSI cluster counts of the i-th location cluster to 0
            category_loc_rssi_data_num[i] = [
                0
            ] * int(np.max(category_loc_rssi[i]) + 1)
            for j in range(len(category_loc_rssi[i])):
                # increment the count of the RSSI cluster that the j-th sample of the i-th location cluster belongs to
                category_loc_rssi_data_num[i][int(
                    category_loc_rssi[i][j])] += 1
        # print(category_loc_rssi_data_num)

        # allocate the RSSI cluster lists inside each location cluster
        for i in range(1, len(self.__category_loc_data)):
            # initialize the RSSI cluster list of the i-th location cluster
            self.__category_loc_rssi_data[i] = [
                [] for i in range(len(category_loc_rssi_data_num[i]))
            ]
            # initialize the j-th RSSI cluster list of the i-th location cluster
            for j in range(1, len(self.__category_loc_rssi_data[i])):
                self.__category_loc_rssi_data[i][j] = [
                    [] for i in range(category_loc_rssi_data_num[i][j])
                ]
        # print(self.__category_loc_rssi_data)
        # start writing data into category_loc_rssi_data
        # __category_loc_rssi_data[i][j][k]: the k-th sample of the j-th RSSI cluster in the i-th location cluster
        # count_loc_rssi[i][j]: number of samples already written for location cluster i, RSSI cluster j
        count_loc_rssi = [[0] * len(category_loc_rssi_data_num[i])
                          for i in range(category_loc_num)]
        # print(count_loc_rssi)
        # print()

        for i in range(1, category_loc_num):
            for j in range(len(category_loc_rssi[i])):
                ca_loc_rssi = int(category_loc_rssi[i][j])
                # print(i, ' ', ca_loc_rssi)
                # print(len(self.__category_loc_rssi_data[i]), ' ', len(self.__category_loc_rssi_data[i][ca_loc_rssi]))
                if ca_loc_rssi > 0:
                    self.__category_loc_rssi_data[i][ca_loc_rssi][count_loc_rssi[i][ca_loc_rssi]] = \
                        self.__category_loc_data[i][j]
                    count_loc_rssi[i][ca_loc_rssi] += 1

        print(count_loc_rssi)
        # build a Category object for each location/RSSI cluster
        for i in range(1, category_loc_num):
            for j in range(1, len(category_loc_rssi_data_num[i])):
                # print(i, ' ', j)
                self.__categories.append(
                    Category(self.__category_loc_rssi_data[i][j]))
        print(len(self.__categories))
        for i in range(len(self.__categories)):
            for j in range(len(self.__categories[i].data)):
                print(self.__categories[i].data[j].uploader, ' ',
                      self.__categories[i].data[j].feature_location, ' ', i)

        return True
Example No. 22
#####
'''
text_vectors = load('./text_vectors.p')
top = {}
for i in range(len(text_vectors)):
    score = []
    for j in range(len(text_vectors)):
        if i != j:
            score.append((1-cosine_simailarity(text_vectors[i],text_vectors[j])))
    score.sort()
    top[i] = score[:10]
'''
####
text_vectors = load('./text_vectors.p')
text_vectors = text_vectors[:1000]
labels = dbscan(text_vectors, eps=.1, minPts=10)
save('labels.p',labels)

#get unique labels
n_labels = set(labels)

#create empty clusters
clusters = {}
for n in n_labels:
    if n == -1:
        continue
    else:
        clusters[n] = []

#assign each abstract to a vector based on the labels       
for i in range(len(labels)):
def c_cluster_points(points, num_points, points_dist):
    return cdbscan.dbscan(points, num_points, points_dist)
Example No. 24
def getCWTPeaks(scaledCWT,
                Y,
                noiseEst,
                minSNR=3,
                minRow=3,
                minClust=4,
                EPS=None):
    '''
    returns: N.array(peakLoc), cwtPeakLoc, cClass, boolean

    scaledCWT is the continuous wavelet transform provided by cwtMS function
    minRow is the first row of the cwt to pick peaks from--if this is too small you'll get
    too many peaks and the algorithm will choke.  Keep in mind that the first few rows of
    the CWT are highly correlated with high frequency noise--you don't want them anyway.
    '''
    cwtPeakLoc = []
    print "Shape: ", scaledCWT.shape

    revRowArray = N.arange((scaledCWT.shape[0] - 1), 1, -1)  #steps backwards
    for i in revRowArray:
        row = scaledCWT[i]
        if i > minRow:
            normRow = normalize(row)
            rowDeriv = derivative(normRow)
            t3 = time.perf_counter()
            # criterion 1 -- above the threshold and a zero crossing in the derivative
            # (minNoise is presumably a module-level threshold defined elsewhere)
            criterion = (rowDeriv < 0.5) & (rowDeriv > -0.5) & (normRow >= minNoise)
            tempLocEst = N.where(criterion)[0]

            # exclude the last element so we don't get an IndexError on rowDeriv
            for m in tempLocEst[:-1]:
                if N.sign(rowDeriv[m]) > N.sign(rowDeriv[m + 1]):
                    if normRow[m] >= noiseEst[m]:
                        cwtPeakLoc.append([m, i])

            print "Zero Crossing", time.clock() - t3

    cwtPeakLoc = N.array(cwtPeakLoc)
    #    ax2.plot(cwtPeakLoc[:,0], cwtPeakLoc[:,1], 'oy', alpha = 0.4)
    #    print 'Peak Finding: ', time.clock() - t2

    peakLoc = []

    t3 = time.perf_counter()
    try:
        cClass, tType, Eps, boolAns = dbscan(cwtPeakLoc, minClust, Eps=EPS)
    except Exception as exc:
        errorMsg = 'dbscan error...is your CWT huge?\n'
        errorMsg += "Sorry: %s\n" % exc
        print(errorMsg)
        return None, None, None, False

    print('Peak Cluster: ', time.perf_counter() - t3)
    if boolAns:
        print(cClass.max(), len(tType), Eps)
        i = cClass.max()
        for m in range(int(i) + 1):
            ind = N.where(m == cClass)
            temp = cwtPeakLoc[ind]

            #            ax2.plot(temp[:,0],temp[:,1],'-s', alpha = 0.7, ms = 3)
            if len(temp) > 0:
                sortInd = temp[:, 0].argsort()
                temp = temp[sortInd]
                tempDiffX = N.diff(temp[:, 0])
                tempDiffY = N.diff(temp[:, 1])
                diffSumX = tempDiffX.sum()
                diffSumY = tempDiffY.sum()
                print(tempDiffX, diffSumX)
                print(tempDiffY, diffSumY)

                #        if diffSumX <= len(tempDiffX)*2:
                i = 0
                rowThresh = 3
                pntPad = 50
                staticCut = 0.2
                for j in tempDiffY:
                    if j <= rowThresh:
                        i += 1
                    else:
                        i += -1
#                if i >= rowThresh:
                if tempDiffY.mean() <= rowThresh:
                    maxInd = temp[:, 1].argmin()
                    xVal = temp[maxInd][0]

                    #this screening assumes there is a low value to the first
                    #scale value e.g. 1 or 2

                    tempVals = Y[(xVal - pntPad):xVal]
                    if len(tempVals) > 0:
                        localMaxInd = tempVals.argmax()
                        #                    print localMaxInd
                        yMaxInd = xVal - pntPad + localMaxInd
                        #                    if Y[xVal] >= noiseEst[xVal]:
                        if Y[yMaxInd] >= noiseEst[yMaxInd] * minSNR / 2 and Y[
                                yMaxInd] >= staticCut:
                            #                    if Y[xVal]>=scaledCWT[0][xVal]*minSNR/2 and Y[xVal] >= noiseEst[xVal]*minSNR/2:
                            peakLoc.append(x[yMaxInd])  # x is presumably a module-level x-axis array
                            print("Appended, %s\n" % x[yMaxInd])
                        else:
                            print "too low @ %s\n" % x[xVal]
                else:
                    print "\n"


#                    print x[xVal]

    return N.array(peakLoc), cwtPeakLoc, cClass, True
Example No. 25
from sklearn.preprocessing import StandardScaler


##############################################################################
# Generate sample data
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4,
                            random_state=0)

X = StandardScaler().fit_transform(X)
X = X.astype(np.float32)

##############################################################################
# Compute DBSCAN
import dbscan
labels = np.array(dbscan.dbscan(X, "sparse").run(0.3, 10))
core_samples_mask = np.zeros_like(labels, dtype=bool)
# core_samples_mask[db.core_sample_indices_] = True

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

print('Estimated number of clusters: %d' % n_clusters_)

##############################################################################
# Plot result
import matplotlib.pyplot as plt

# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
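For reference, a scikit-learn sketch of the same clustering step (the commented-out core_samples_mask line above suggests this snippet was adapted from the scikit-learn DBSCAN demo); it uses the same eps/min_samples and also marks noise as -1:

from sklearn.cluster import DBSCAN

db = DBSCAN(eps=0.3, min_samples=10).fit(X)
labels_sklearn = db.labels_                          # -1 marks noise, matching the counting logic above
core_samples_mask = np.zeros_like(labels_sklearn, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True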
evaluateClassification(yClass_test,
                       yClass_knn,
                       'K-nearest neighbors',
                       displayDetailedView=1)
plotDecisionBoundry(X_test, yClass_test, yClass_knn, 'K-nearest neighbors')

evaluateRegression(yReg_test, yReg_lr, 'Linear Regression')
evaluateRegression(yReg_test, yReg_nr, 'KNN Regression')

X, yClass, yReg = processData(trainTestSplit=2)
classes = ['German Shepherd', 'Daschhund', 'Samoyed', 'Siberian Husky']

y_kmeans = kmeans(X, init='random')
evaluateClustering(X, yClass.values.ravel(), y_kmeans, 'K-means Clustering',
                   classes)

y_dbscan = dbscan(X)
evaluateClustering(X, yClass.values.ravel(), y_dbscan, 'DBSCAN Clustering',
                   classes)

y_agglomerative = agglomerative(X)
evaluateClustering(X, yClass.values.ravel(), y_agglomerative,
                   'Agglomerative Clustering', classes)

vizualizeData(X_lr, yClass_lr.ravel(), X_rf, yClass_rf.ravel(), X_knn,
              yClass_knn.ravel(), X_linReg, yReg_lr.ravel(), X_knnReg,
              yReg_nr.ravel(), y_kmeans, y_dbscan, y_agglomerative)

print('\nFINAL VERDICT:')
print('\nAccuracy(lr, rf, knn): ', lrAc, rfAc, knnAc)
print('\nScore(lr, nr): ', lrSc, nrSc)
                usagearrays[i, j] = usagearrays[i, j] - avg
    matrice = cosine_similarity(usagearrays)
    matrice = (1. - matrice) / 2.
    return matrice


##################################################################################
ratings = pd.read_csv('ua.base',
                      sep='\t',
                      names=['user', 'movie', 'rating', 'time'])
usagematrix = ratings.pivot_table(index='user',
                                  columns='movie',
                                  values='rating')
usagematrix = usagematrix.apply(
    lambda usagematrix: usagematrix.fillna(usagematrix.mean()), axis=1)
matrice = creatMatrice(usagematrix.values)

dbscan_instance = dbscan(matrice,
                         0.4,
                         20,
                         ccore=False,
                         data_type='distance_matrix')
dbscan_instance.process()
clusters = dbscan_instance.get_clusters()
noise = dbscan_instance.get_noise()

print(len(clusters))
print(len(noise))

MAE_RMSE(ratings, clusters, 'ua.test')
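A self-contained toy version of the same pattern, assuming pyclustering's dbscan as it is used above (a precomputed distance matrix passed via data_type='distance_matrix'); the points and parameters here are placeholders:

import numpy as np
from pyclustering.cluster.dbscan import dbscan
from sklearn.metrics.pairwise import cosine_similarity

points = np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0], [0.1, 0.9]])
dist = (1.0 - cosine_similarity(points)) / 2.0       # same transform as creatMatrice above
model = dbscan(dist, 0.2, 1, ccore=False, data_type='distance_matrix')
model.process()
print(model.get_clusters(), model.get_noise())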
Example No. 28
def main():
    heading = 0
    latitude = ""
    longitude = ""
    altitude = ""
    with open(_MYPARAMS['GPS_LOG'], 'r', newline='') as gpsfile:
        spamreader = csv.reader(gpsfile, delimiter=',', quotechar='|')
        for row in spamreader:
            found = False
            count = 0
            for name in row:
                count = count + 1
                if name == os.path.basename(_MYPARAMS['IMAGE']):
                    found = True
                if found and count == 5:
                    heading = float(name)
                if found and count == 2:
                    latitude = name
                if found and count == 3:
                    longitude = name
                if found and count == 4:
                    altitude = name
    headingString = headingToString(heading)

    # Initialize dbscan
    myDbscan = dbscan(_MYPARAMS['SIZE_OF_ROI'] / 2)

    PRINT_LOG_OUT = []
    PRINT_LOG_OUT.append("[Date]")
    PRINT_LOG_OUT.append("Date Analyzed = " + strftime("%Y-%m-%d %H:%M:%S"))
    PRINT_LOG_OUT.append(
        "\n[Position]")  # For some reason this code went missing in a commit
    PRINT_LOG_OUT += ["heading = " + headingString]
    PRINT_LOG_OUT += ["headingDegrees = " + str(heading)]
    PRINT_LOG_OUT += ["latitude = " + latitude]
    PRINT_LOG_OUT += ["longitude = " + longitude]
    PRINT_LOG_OUT += ["altitude = " + altitude]
    PRINT_LOG_OUT.append("\n[Analysis Parameters]")
    # print parameters
    PRINT_LOG_OUT += [
        str(k) + " = " + str(_MYPARAMS[k]) for k in _MYPARAMS.keys()
    ]

    # output folder
    #fileList = os.listdir(_MYPARAMS['OUTPUT_FOLDER'])
    #for fileName in fileList:
    #    os.remove(_MYPARAMS['OUTPUT_FOLDER'] + "/"+fileName)

    # import image from file
    print("Loading Image...")
    imgin = cv2.imread(_MYPARAMS['IMAGE'], cv2.IMREAD_COLOR)

    cv2.imwrite(
        os.path.join(_MYPARAMS['OUTPUT_FOLDER'],
                     os.path.basename(_MYPARAMS['IMAGE'])),
        compressImage(imgin))

    height, width, channels = imgin.shape
    PRINT_LOG_OUT.append("Width = " + str(width))
    PRINT_LOG_OUT.append("Height = " + str(height))

    # Check if image imported correctly
    #if (imgin == None):
    #    print "Image does not exist! Aborting...\n"
    #    return;

    hsv_imgin = cv2.cvtColor(imgin, cv2.COLOR_BGR2HSV)

    # Detect image size [rows, columns]
    _IMBND = (hsv_imgin.shape[0], hsv_imgin.shape[1])

    hsv_chans = cv2.split(hsv_imgin)
    # split image into HSV channels

    # Get both blurred and not blurred files for image processing
    if (_MYPARAMS['HAS_BLUR']):
        hsv_chans = [
            cv2.blur(hsvim, (_MYPARAMS['BKS'], _MYPARAMS['BKS']))
            for hsvim in hsv_chans
        ]

    # may use other feature detector for testing
    FD_TYPE = "MSER"
    PRINT_LOG_OUT.append("FD Type = " + FD_TYPE)
    print("Running MSER...")
    # delta, maxArea, minArea, maxVariation, minDiversity, maxEvolution, areaThreshold, minMargin, edgeBlurSize
    # Decreasing maxVariation increases how sharp edges need to be
    my_fd = cv2.MSER_create(
        5, int(_MYPARAMS['MIN_AREA'] / 2738 * _IMBND[0]),
        int(_MYPARAMS['MAX_AREA'] / 2738 * _IMBND[0]), 0.099, 0.65, 200, 1.01,
        0.003, 5)  # Default is 5, 60, 14400, 0.25, 0.2, 200, 1.01, 0.003, 5

    # FD_TYPE = "SimpleBlob"
    # PRINT_LOG_OUT.append("FD Type: " + FD_TYPE)
    # my_fd = cv2.SimpleBlobDetector_create()

    imgClusteredRegions = copy.copy(imgin)

    PRINT_LOG_OUT.append("\n[Channel Keypoints]")

    kpts = []  # (k)ey(p)oin(t) out
    kptsSize = []
    dkpsout = []  # (d)isplay (k)ey(p)oint (out)put
    for i, im in enumerate(hsv_chans):
        local_kpt = my_fd.detect(im, None)  # local keypoints

        if len([x for x in _MYPARAMS['ACTIVE_CHANNEL'] if x == i]) > 0:
            # Outputs image of regions
            vis = im.copy()
            regions = my_fd.detectRegions(im, None)
            hulls = [cv2.convexHull(s.reshape(-1, 1, 2)) for s in regions]
            hullLocations, hullSizes = hulls2Points(hulls)
            cv2.polylines(vis, hulls, 1, (0, 255, 0))
            #cv2.imwrite(os.path.join(_MYPARAMS['OUTPUT_FOLDER'], 'region visualization' + str(i) + '.jpg'), vis)

            for j, point in enumerate(hullLocations):
                kpts.append(point)
                kptsSize.append(hullSizes[j])

        if (local_kpt):
            # don't know how the third param works yet  -->
            local_dpksout = cv2.drawKeypoints(im, local_kpt, im)
            dkpsout.append([local_dpksout])  # append to master list
            #cv2.imwrite(os.path.join(_MYPARAMS['OUTPUT_FOLDER'], 'dkpsout' + str(i) + '.jpg'), local_dpksout)

            # print out num of keypoints and other info
            PRINT_LOG_OUT.append('Channel ' + str(i) + ' = ' +
                                 str(len(local_kpt)))

    # Crop out ROIs for active_channel
    clusters, clusterSizes = myDbscan.getClusters(kpts, kptsSize)
    averagedClusters = averageClusters(clusters)
    clusterSizes = largestSize(clusters, clusterSizes)
    # Tree filter
    print("Filtering trees...")
    if _MYPARAMS['USE_TREE_FILTER']:
        averagedClusters, clusterSizes = filterTrees(imgin, averagedClusters,
                                                     clusterSizes)

    imageName = os.path.basename(_MYPARAMS['IMAGE']).split('.')[0]

    croppedImgNames = []
    print("Cropping...")
    for i, mypoint in enumerate(averagedClusters):
        cropSize = clusterSizes[i][1] / 2 if clusterSizes[i][
            1] / 2 > clusterSizes[i][0] / 2 else clusterSizes[i][0] / 2
        padding = _MYPARAMS['CROP_PADDING']
        row_crop = (clamp(mypoint[0] - cropSize - padding, 0, _IMBND[0]),
                    clamp(mypoint[0] + cropSize + padding, 0, _IMBND[0]))
        col_crop = (clamp(mypoint[1] - cropSize - padding, 0, _IMBND[1]),
                    clamp(mypoint[1] + cropSize + padding, 0, _IMBND[1]))
        new_crop = imgin[row_crop[0]:row_crop[1], col_crop[0]:col_crop[1]]
        croppedImgNames.append(imageName + 'roi' + str(i) + '.jpg')
        cv2.imwrite(
            os.path.join(_MYPARAMS['OUTPUT_FOLDER'], croppedImgNames[i]),
            new_crop)

    # Log clustering info
    PRINT_LOG_OUT.append("\n[Crop Info]")
    PRINT_LOG_OUT.append("Number of Crops = " + str(len(averagedClusters)))

    # Write log for the clusters
    for i, cluster in enumerate(averagedClusters):
        PRINT_LOG_OUT.append("\n[Crop " + str(i + 1) + "]")
        PRINT_LOG_OUT.append("Image Name = " + croppedImgNames[i])
        PRINT_LOG_OUT.append("X = " + str(averagedClusters[i][1]))
        PRINT_LOG_OUT.append("Y = " + str(averagedClusters[i][0]))
        cropSize = clusterSizes[i][1] / 2 if clusterSizes[i][
            1] / 2 > clusterSizes[i][0] / 2 else clusterSizes[i][0] / 2
        padding = _MYPARAMS['CROP_PADDING']
        PRINT_LOG_OUT.append("Size = " + str(2 * (cropSize + padding)))

    # Output cluster locations
    #imgClusteredRegions = drawClusters(imgClusteredRegions, clusters, averagedClusters)
    imgClusteredRegions = drawCroppedRegions(imgClusteredRegions,
                                             averagedClusters, clusterSizes)
    cv2.imwrite(os.path.join(_MYPARAMS['OUTPUT_FOLDER'], 'croppedRegions.jpg'),
                imgClusteredRegions)

    # print result info to log file
    with open(
            os.path.join(_MYPARAMS['OUTPUT_FOLDER'],
                         imageName + ' Results.ini'), 'a') as f:
        for line in PRINT_LOG_OUT:
            f.write(line + '\n')
Example No. 29
def main():
    # process command line arguments and return arguments as args
    args = run_parser()

    # load or build datasets according to arguments
    datasets = ready_datasets(args)

    # build experiment object, including datasets
    exp = experiment(datasets, settings.algorithms)

    # calculate distances for each dataset
    calculate_distances(exp)

    # run an experiment with all algorithms and datasets
    if args.experiment:
        run_experiment(exp)

    # run k-means algorithm on specified datasets
    if args.kmeans:
        clusters = run_kbrain(settings.k[0], "k-means", exp.datasets[0])
        exp.results["k-means"].append(
            (exp.datasets[0].name, settings.maxSamples, 1, settings.k[0],
             clusters))

    if args.kmedoids:
        clusters = run_kbrain(settings.k[0], "k-medoids", exp.datasets[0])
        exp.results["k-medoids"].append(
            (exp.datasets[0].name, settings.maxSamples, 1, settings.k[0],
             clusters))

    if args.dbscan:
        # call dbscan wrapper function
        results = dbscan(
            exp.datasets[0],
            settings.maxSamples,
            settings.epsilons[0],
            settings.minPts[0],
        )
        # save results of each experiment
        exp.results["DBSCAN"].append((
            exp.datasets[0].name,
            settings.maxSamples,
            1,
            settings.epsilons[0],
            settings.minPts[0],
            results,
        ))

    # compile results into a dataframe
    resultsDF = compile_results(exp)

    # print(resultsDF.drop(columns=["cluster_list"]))

    # calculate accuracy of our clustering algorithms' results
    # compared to sklearn.dataset dataset labels
    calculate_groundtruth_accuracy(resultsDF, exp)

    # calculate accuracy of our clustering algorithms' results
    # compared to sklearn clustering algorithm labels
    # calculate_sklearn_accuracy(resultsDF, exp)

    print(resultsDF.drop(columns=["cluster_list", "dataset"]))

    save_results(resultsDF)
Example No. 30
def run_experiment(exp):
    # loop through each clustering algorithm
    for algo in exp.algorithms:

        # loop through each dataset
        for ds in exp.datasets:

            # loop through the number of datapoints
            # to be used
            for num in settings.numSamples:

                # loop for each trial run
                for i in range(1, settings.numRuns + 1):
                    print(
                        "algo: {0}, ds: {1}, size: {2}".format(
                            algo, ds.name, num),
                        end="",
                    )
                    startTime = time.perf_counter()

                    # call dbscan with parameters
                    if algo == "DBSCAN":

                        # loop parameters unique to dbscan
                        for eps in settings.epsilons:
                            for mp in settings.minPts:

                                # call dbscan with parameters
                                results = dbscan(ds, num, eps, mp)
                                # save results of each experiment
                                exp.results[algo].append(
                                    (ds.name, num, i, eps, mp, results))

                    if algo == "k-means":

                        for k in range(3, 5):
                            clusters = run_kbrain(k, algo, ds)
                            exp.results[algo].append(
                                (ds.name, num, i, k, clusters))

                    if algo == "k-medoids":

                        for k in range(3, 5):
                            clusters = run_kbrain(k, algo, ds)
                            exp.results[algo].append(
                                (ds.name, num, i, k, clusters))

                    if algo == "sklearn_kmeans":
                        for numClusters in range(3, 5):
                            results = sklearn_kmeans(ds, numClusters, num)

                            exp.results[algo].append(
                                (ds.name, num, i, numClusters, results))

                    if algo == "sklearn_kmedoids":
                        for numClusters in range(3, 5):
                            results = sklearn_kmedoids(ds, numClusters, num)

                            exp.results[algo].append(
                                (ds.name, num, i, numClusters, results))

                    if algo == "sklearn_dbscan":
                        # loop parameters unique to dbscan
                        for eps in settings.epsilons:
                            for mp in settings.minPts:

                                # call dbscan with parameters
                                results = sklearn_dbscan(ds, num, eps, mp)

                                # save results of each experiment
                                exp.results[algo].append(
                                    (ds.name, num, i, eps, mp, results))

                    stopTime = time.perf_counter()

                    print(" {0:3.2} minutes".format(
                        (stopTime - startTime) / 60))
Example No. 31
def getCWTPeaks(scaledCWT, Y, noiseEst, minSNR = 3, minRow = 3, minClust = 4, EPS = None):
    '''
    returns: N.array(peakLoc), cwtPeakLoc, cClass, boolean

    scaledCWT is the continuous wavelet transform provided by cwtMS function
    minRow is the first row of the cwt to pick peaks from--if this is too small you'll get
    too many peaks and the algorithm will choke.  Keep in mind that the first few rows of
    the CWT are highly correlated with high frequency noise--you don't want them anyway.
    '''
    cwtPeakLoc = []
    print "Shape: ",scaledCWT.shape

    revRowArray = N.arange((scaledCWT.shape[0]-1),1,-1)#steps backwards
    for i in revRowArray:
        row = scaledCWT[i]
        if i> minRow:
            normRow = normalize(row)
            rowDeriv = derivative(normRow)
            t3 = time.perf_counter()
            'criterion 1 -- above the threshold and a zero crossing in the derivative'

            criterion = (rowDeriv < 0.5) & (rowDeriv > -0.5) & (normRow >= minNoise)
            tempLocEst = N.where(criterion)[0]

            for m in tempLocEst[:-1]:#need to exclude last element so we don't get an IndexError for the rowDeriv array
                if N.sign(rowDeriv[m]) > N.sign(rowDeriv[m+1]):
                    if normRow[m] >= noiseEst[m]:
                        cwtPeakLoc.append([m,i])

            print "Zero Crossing", time.clock()-t3


    cwtPeakLoc = N.array(cwtPeakLoc)
#    ax2.plot(cwtPeakLoc[:,0], cwtPeakLoc[:,1], 'oy', alpha = 0.4)
#    print 'Peak Finding: ', time.clock() - t2

    peakLoc = []

    t3 = time.perf_counter()
    try:
        cClass, tType, Eps, boolAns = dbscan(cwtPeakLoc, minClust, Eps = EPS)
    except Exception as exc:
        errorMsg = 'dbscan error...is your CWT huge?\n'
        errorMsg += "Sorry: %s\n" % exc
        print(errorMsg)
        return None, None, None, False


    print('Peak Cluster: ', time.perf_counter() - t3)
    if boolAns:
        print(cClass.max(), len(tType), Eps)
        i = cClass.max()
        for m in range(int(i) + 1):
            ind = N.where(m == cClass)
            temp = cwtPeakLoc[ind]

#            ax2.plot(temp[:,0],temp[:,1],'-s', alpha = 0.7, ms = 3)
            if len(temp) > 0:
                sortInd = temp[:,0].argsort()
                temp = temp[sortInd]
                tempDiffX = N.diff(temp[:,0])
                tempDiffY = N.diff(temp[:,1])
                diffSumX = tempDiffX.sum()
                diffSumY = tempDiffY.sum()
                print(tempDiffX, diffSumX)
                print(tempDiffY, diffSumY)


        #        if diffSumX <= len(tempDiffX)*2:
                i = 0
                rowThresh = 3
                pntPad = 50
                staticCut = 0.2
                for j in tempDiffY:
                    if j <= rowThresh:
                        i+=1
                    else:
                        i+=-1
#                if i >= rowThresh:
                if tempDiffY.mean() <= rowThresh:
                    maxInd = temp[:,1].argmin()
                    xVal = temp[maxInd][0]

                    #this screening assumes there is a low value to the first
                    #scale value e.g. 1 or 2

                    tempVals = Y[(xVal-pntPad):xVal]
                    if len(tempVals)>0:
                        localMaxInd = tempVals.argmax()
    #                    print localMaxInd
                        yMaxInd = xVal-pntPad+localMaxInd
    #                    if Y[xVal] >= noiseEst[xVal]:
                        if Y[yMaxInd] >= noiseEst[yMaxInd]*minSNR/2 and Y[yMaxInd] >= staticCut:
    #                    if Y[xVal]>=scaledCWT[0][xVal]*minSNR/2 and Y[xVal] >= noiseEst[xVal]*minSNR/2:
                            peakLoc.append(x[yMaxInd])
                            print "Appended, %s\n"%x[yMaxInd]
                        else:
                            print "too low @ %s\n"%x[xVal]
                else:
                    print "\n"
#                    print x[xVal]

    return N.array(peakLoc), cwtPeakLoc, cClass, True
Example No. 32
                maxVal = max(maxVal, z)
    inputs = rescaleSet(data, minVal, maxVal, 0, 1)  # Scale input data
    (coh, sep) = evalCluster(inputs)
    return (coh, sep)

if __name__ == '__main__':
    variables = (sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5])
    #            Type         DataSet      numParam     iterations   other       
    
    data = getInputData('DataSets/' + str(variables[1]))
    print("DataSet:", str(variables[1]))
    print("\nInputs:\n" + str(data) + '\n')
    if variables[0] == 'K':
        clusters = kmeans.kmeans(data, int(variables[2]), int(variables[3]))
    elif variables[0] == 'D':
        clusters = dbscan.dbscan(data, 15)
    elif variables[0] == 'C':
        clusters = compLearn.compLearn(data, int(variables[2]), int(variables[3]), float(variables[4]))
    elif variables[0] == 'A':
        clusters = ACO.ACO(data, int(variables[2]), int(variables[3]))
    elif variables[0] == 'P':
        clusters = PSO.PSO(data, int(variables[2]), int(variables[3]))
    else:
        print('Unknown function specified...')
        sys.exit(1)
    print("\nClusters:\n" + str(clusters))
    (coh, sep) = main(clusters)
    print("\nNumClusters:", len(clusters))
    print("\nNumPerCluster:", [len(x) for x in clusters])
    print("\nCohesion:", coh)
    print("\nSeperation: ", sep)
Example No. 33
 def _create_dbscan(self, sample_data, *args):
     return dbscan.dbscan(sample_data, *args)
                         figsize=(8, 12),
                         sharey=True)
for i, dataset in enumerate(list_dataset):
    print("Dataset: {}".format(dataset))
    for j, model in enumerate(models):
        np.random.seed(31)
        if j == 0:
            axes[i, j].set_ylabel(dataset)
        if dataset == "blobs":
            if model == "K Means":
                mod = kmeans.kmeans(ncluster=3, initialization='kmeans++')
            elif model == "GaussianMM":
                mod = gaussianmm.gaussianmm(ncluster=3,
                                            initialization='kmeans++')
            elif model == "DBSCAN":
                mod = dbscan.dbscan(minpts=5, epsilon=0.18)
        elif dataset == "varied_blobs1":
            if model == "K Means":
                mod = kmeans.kmeans(ncluster=3, initialization='kmeans++')
            elif model == "GaussianMM":
                mod = gaussianmm.gaussianmm(ncluster=3,
                                            initialization='kmeans++')
            elif model == "DBSCAN":
                mod = dbscan.dbscan(minpts=5, epsilon=0.18)
        elif dataset == "varied_blobs2":
            if model == "K Means":
                mod = kmeans.kmeans(ncluster=3, initialization='kmeans++')
            elif model == "GaussianMM":
                mod = gaussianmm.gaussianmm(ncluster=3,
                                            initialization='kmeans++')
            elif model == "DBSCAN":
Ejemplo n.º 35
0
 def _create_dbscan(self, sample_data, *args):
     return dbscan.dbscan(sample_data, *args)
Ejemplo n.º 36
0
def _dbscan_metrics_comparison(metrics):
    dbscan.dbscan(metrics, 1000, 2)
Ejemplo n.º 37
0
    copy.deepcopy(max(dataset, key=lambda d: d.tuple[i]))
    for i in xrange(0, cls_idx)
]

f_norm = open("normData.txt", 'w')
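# min-max normalize each attribute to [0, 1] using the per-column mins/maxs,
# then dump the scaled tuples to normData.txt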
for d in dataset:
    for i in xrange(0, cls_idx):
        d.tuple[i] = float(d.tuple[i] - mins[i].tuple[i]) / (maxs[i].tuple[i] -
                                                             mins[i].tuple[i])
    f_norm.write(str(d.tuple))
    f_norm.write('\n')

k = int(sys.argv[1])
eps = float(sys.argv[2])

cluster = dbscan.dbscan(dataset, eps, k)
if len(cluster) == 0:
    print 'k:', k, 'no. of cluster:', len(cluster)
    print

pure = evaluation.purity(cluster, len(dataset))
NMI = evaluation.NMI(cluster, dataset)
RI = evaluation.RI(cluster, dataset)

cp = [len(c) for c in cluster]

f_out = open("output.txt", 'w')

for i in xrange(0, len(cluster)):
    print 'cluster:', i, 'no. of pt. in cluster:', cp[i]
    for c in cluster[i]:
Ejemplo n.º 38
0
    data = f.read()

data = data.split("\n")
data = data[1:]

data = [ i.split(" ") for i in data ]


# x data
u_raw = [ i[2] for i in filter(lambda x: len(x) > 4, data) ]

# y data
v_raw = [ i[3] for i in filter(lambda x: len(x) > 4, data) ]


# set data
datax = np.array(v_raw, dtype=np.float64)
datay = np.array(u_raw, dtype=np.float64)

xmin = np.min(datax)
xmax = np.max(datax)
ymin = np.min(datay)
ymax = np.max(datay)

minPoints = 4
epsilon = 40.0

dbs = dbscan(datax, datay, minPoints, epsilon)

dbs.plot(xmin, xmax, ymin, ymax)
Ejemplo n.º 39
0
# (1) generate data
nsample = 32000
case = "varied_blobs1"
X = create_dataset_sklearn.create_dataset(nsample,case)
array_ndim = np.array([500, 1000, 2000, 4000, 8000, 16000, 32000])
array_time = np.zeros((np.size(array_ndim)))

# (2) generate time data
nrun = 2
for idx in range(np.size(array_ndim)):
    for _ in range(nrun):
        # (2a) create model
        minpt = 5
        epsilon = 0.18
        model = dbscan.dbscan(minpt,epsilon,animation=False)
        # (2b) fit model
        ndim = array_ndim[idx]
        model.fit(X[:,0:ndim])
        array_time[idx] += model.time_fit
    print("Dimension: {}  Time Fit: {}".format(ndim,array_time[idx]))
# determine power
log_ndim = np.log(array_ndim)
log_time = np.log(array_time)
coeff = np.polyfit(log_ndim,log_time,1)
p = np.poly1d(coeff)
plogndim = p(log_ndim)
print("Power: {}".format(coeff[0]))
plt.figure()
plt.plot(log_ndim,log_time,"ro",label="Data")
plt.plot(log_ndim,plogndim,"b-",label="Fit")
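
As a standalone sanity check on the fitting step (a minimal sketch, not part of the original script): for synthetic timings that follow a known power law, the slope returned by np.polyfit in log-log space recovers the exponent, which is exactly what the code above estimates from the measured DBSCAN fit times.

import numpy as np

n = np.array([500, 1000, 2000, 4000, 8000], dtype=float)
t = 1e-6 * n**1.5                                # synthetic timings with a known power of 1.5
slope, intercept = np.polyfit(np.log(n), np.log(t), 1)
print("Recovered power: {:.3f}".format(slope))   # prints ~1.500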
Ejemplo n.º 40
0
__author__ = 'arno'
import dbscan
from numpy import ndarray
from numpy import array

#some dummy data
dummydaten = array([[1, 1.5], [1, 1.2], [0.9, 1.2], [8.2, 1.0], [8.3, 0.7], [9.2, 0.7], [-3.3, 5]])

#put in the data we want to use
minNeighbors = 1
epsilon = 0.85

data = dummydaten

#use dbscan
dbscan.dbscan(data, epsilon, minNeighbors)
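
As an optional cross-check (not part of the original example, and assuming scikit-learn is installed): running scikit-learn's DBSCAN on the same dummy points gives a reference labeling. Note that sklearn's min_samples counts the query point itself, so minNeighbors=1 corresponds roughly to min_samples=2; whether the local dbscan module uses the same convention is an assumption.

from sklearn.cluster import DBSCAN

# reference run on the same data with comparable parameters (see caveat above)
reference = DBSCAN(eps=epsilon, min_samples=minNeighbors + 1).fit(data)
print(reference.labels_)   # -1 marks noise in scikit-learn's labeling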
Ejemplo n.º 41
0
import plot_data

# (1) load data
iris = data_iris.iris()
X, class_label = iris.load()
# perform pca and reduce dimension to 2
model_pca = pca.pca()
model_pca.fit(X)
R = model_pca.data_reduced_dimension(reduced_dim=2)
plot_data.plot_scatter_class(R, class_label,
                             "Iris Data Projected to 2 Dimensions using PCA",
                             "u0", "u1")
# (2) create model
minpts = 4
epsilon = 0.4
model = dbscan.dbscan(minpts, epsilon)
# (3) fit model
model.fit(R)
print("Time fit: {}".format(model.time_fit))
# (4) results
level = -1
print("Purity: {}".format(metrics.purity(model.clustersave[level],
                                         class_label)))
print("Davies-Bouldin: {}".format(
    metrics.davies_bouldin(R, model.clustersave[level])))
print("Silhouette: {}".format(metrics.silhouette(R, model.clustersave[level])))
model.plot_cluster(nlevel=level,
                   title="DBSCAN Clustering for Iris Dataset reduced to 2d",
                   xlabel="u0",
                   ylabel="u1")
metrics.plot_cluster_distribution(model.clustersave[level], class_label)
Ejemplo n.º 42
0
from dbscan import dbscan 

# Create three gaussian blobs to use as our clustering data.
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4,
                            random_state=0)

X = StandardScaler().fit_transform(X)

###############################################################################
# My implementation of DBSCAN
#

# Run my DBSCAN implementation.
print('Running my implementation...')
my_labels = dbscan(X, eps=0.3, MinPts=10)

###############################################################################
# Scikit-learn implementation of DBSCAN
#

print('Running scikit-learn implementation...')
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
skl_labels = db.labels_

# Scikit-learn uses -1 for NOISE and starts cluster labeling at 0. I start
# numbering at 1, so increment the skl cluster numbers by 1.
for i in range(0, len(skl_labels)):
    if not skl_labels[i] == -1:
        skl_labels[i] += 1
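
A possible follow-up (not in the original snippet, and assuming scikit-learn is available as above): because cluster IDs are arbitrary, a permutation-invariant score such as the adjusted Rand index is a safer comparison between the two labelings than element-wise equality.

import numpy as np
from sklearn.metrics import adjusted_rand_score

# my_labels and skl_labels come from the snippet above
my_labels = np.asarray(my_labels)
print('Adjusted Rand index vs scikit-learn: {:.3f}'.format(
    adjusted_rand_score(skl_labels, my_labels)))
print('Labelings identical after the shift above: {}'.format(
    np.array_equal(my_labels, skl_labels)))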
Ejemplo n.º 43
0
for d in dataset:
    for i in xrange(0, cls_idx - 1):
        d.tuple[i] = float(d.tuple[i] - mins[i].tuple[i]) / (maxs[i].tuple[i] -
                                                             mins[i].tuple[i])

# print all combination of k and sd
for k in xrange(2, 5):
    for sd_away in xrange(0, 5):
        try:
            kdist = dbscan.k_distance(dataset, k, sd_away)
        except:
            print 'kdist anchor out of range, skipped'
            print
            continue

        cluster = dbscan.dbscan(dataset, kdist, k)
        if len(cluster) == 0:
            print 'k:', k, 'sd:', sd_away, 'kdist:', kdist, 'no. of cluster:', len(
                cluster)
            print
            continue

        cc = [measure.cls_err(c) for c in cluster]
        errs = []
        clss = []
        for err, cls in cc:
            errs.append(err)
            clss.append(cls)
        cp = [len(c) for c in cluster]

        print 'k:', k, 'sd:', sd_away, 'kdist:', kdist, 'no. of cluster:', len(
Ejemplo n.º 44
0
import dbscan
import sys

input_file = sys.argv[1]
cluster_num = int(sys.argv[2])
epsilon = int(sys.argv[3])
minpts = int(sys.argv[4])

point_list = dbscan.parser(input_file)
dbscan.get_near_point(point_list, epsilon, minpts)

mark_list = dbscan.dbscan(point_list, cluster_num, epsilon, minpts)
dbscan.post_clustering(cluster_num, mark_list, point_list)

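# write the indices of the points assigned to cluster i to
# <input basename>_cluster_<i>.txt, one index per line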
for i in range(0, cluster_num):
    with open(input_file.split('.')[0] + '_cluster_' + str(i) + '.txt',
              'w') as writefile:
        writefile.seek(0)
        for index, num in enumerate(mark_list):
            if num is not None and num == i:
                writefile.write(str(index) + '\n')
Ejemplo n.º 45
0
def c_cluster_points(points, num_points, points_dist):
    return cdbscan.dbscan(points, num_points, points_dist) 
Ejemplo n.º 46
0
import dbscan

dbscan.dbscan( [(1,7), (3,4), (7, 8), (4, 5), (1, 1), (2, 2)], 5, 7.5 )





Ejemplo n.º 47
0
for d in dataset:
    for i in xrange(0, cls_idx-1):
        d.tuple[i] = float(d.tuple[i] - mins[i].tuple[i]) / (maxs[i].tuple[i] - mins[i].tuple[i])


# print all combination of k and sd
for k in xrange(2, 5):
    for sd_away in xrange(0, 5):
        try:
            kdist = dbscan.k_distance(dataset, k, sd_away)
        except:
            print 'kdist anchor out of range, skipped'
            print
            continue

        cluster = dbscan.dbscan(dataset, kdist, k)
        if len(cluster) == 0:
            print 'k:', k, 'sd:', sd_away, 'kdist:', kdist, 'no. of cluster:', len(cluster)
            print
            continue

        cc = [measure.cls_err(c) for c in cluster]
        errs = []
        clss = []
        for err, cls in cc:
            errs.append(err)
            clss.append(cls)
        cp = [len(c) for c in cluster]

        print 'k:', k, 'sd:', sd_away, 'kdist:', kdist, 'no. of cluster:', len(cluster)
        for i in xrange(0, len(cluster)):
Ejemplo n.º 48
0
reload(hp)

test_data = np.mat(hp.loadDataSet(file_name))

if plot_origin == 1:
    test_data_show = hp.degree2radian(test_data, 0)
    plt.createOrigin(test_data_show.A)
else:
    # get and transfer Data
    test_data_radian = hp.degree2radian(test_data, -1)
    test_data_vector = hp.orientation2vector(test_data_radian)

    # run dbscan Algorithm
    print("******************* start dbscan ********************")
    print("")
    cluster_result, noise_result, k = dbs.dbscan(test_data_vector, eps,
                                                 min_pts)

    # transfer result
    result_data = np.mat(np.zeros((cluster_result.shape[0], 3)))
    result_data[:, 0:2] = hp.vector2orientation(cluster_result[:, 0:3])
    result_data[:, -1] = cluster_result[:, -1]
    result_data = hp.degree2radian(result_data, 0)
    noise_data = hp.vector2orientation(noise_result[:, 0:3])
    noise_data = hp.degree2radian(noise_data, 0)

    # print and plot result
    print("****************** cluster result ******************")
    print("")
    print(result_data)
    print("")
    print("****************** cluster result ******************")