コード例 #1
0
ファイル: tests.py プロジェクト: kg7155/DataMining
    def test_dbscan(self):
        """Check cluster and outlier counts from DBSCAN at two eps settings.

        Negative labels are excluded from the cluster counts (presumably
        they mark noise points -- confirm against the dbscan module).
        """
        # eps=0.4 with min_samples=5 must yield exactly two clusters.
        model = dbscan.DBSCAN(eps=0.4, min_samples=5)
        labels = model.fit_predict(self.X)
        cluster_ids = np.unique(labels[labels >= 0])
        self.assertEqual(len(cluster_ids), 2)

        # With eps=0.2 (min_samples left unspecified) the data must split
        # into more than three clusters ...
        model = dbscan.DBSCAN(eps=0.2)
        labels = model.fit_predict(self.X)
        cluster_ids = np.unique(labels[labels >= 0])
        self.assertGreater(len(cluster_ids), 3)

        # ... and more than three points must carry a negative label.
        negative_count = np.sum(labels < 0)
        self.assertGreater(negative_count, 3)
コード例 #2
0
def process_aggregation(subj, filename, clas, boxes, tips, notches):
    """Cluster one subject's boxes, tips and notches and build an output row.

    Args:
        subj: Subject identifier, stored under ``'subject_ids'``.
        filename: File name stored in the row.
        clas: Number of classifications; scales the DBSCAN min-points value.
        boxes: Drawn boxes; index 2 of each box is used as its width.
        tips: Tip points to cluster.
        notches: Notch points to cluster.

    Returns:
        A dict holding the JSON-encoded raw inputs, the clustered points and
        clusters for each category, and the resolved fluke positions.
    """
    # The clustering radius is scaled from the median box width; with no
    # boxes at all a typical width of 60 is assumed.
    widths = [box[2] for box in boxes]
    typ_width = median(widths) if widths else 60

    # Required neighbours grow with the classification count, never below 2.
    min_point = max(int(clas * 0.405), 2)

    # Boxes: radius is 20% of the typical width, floored at 20.
    box_eps = max(typ_width * 0.20, 20)
    box_scan = dbscan.DBSCAN(box_eps, min_point)
    box_scan.cluster(boxes)
    sorted_boxes = sorted(box_scan.points,
                          key=operator.itemgetter(0),
                          reverse=True)
    box_points_json = json.dumps(sorted_boxes)
    box_clusters_json = json.dumps(box_scan.clusters)

    # Tips and notches share a tighter radius: 15% of the width, floored at 20.
    point_eps = max(typ_width * 0.15, 20)

    tip_scan = dbscan.DBSCAN(point_eps, min_point)
    tip_scan.cluster(tips)
    sorted_tips = sorted(tip_scan.points, key=operator.itemgetter(1))
    tip_points_json = json.dumps(sorted_tips)
    tip_clusters_json = json.dumps(tip_scan.clusters)

    notch_scan = dbscan.DBSCAN(point_eps, min_point)
    notch_scan.cluster(notches)
    notch_points_json = json.dumps(notch_scan.points)
    notch_clusters_json = json.dumps(notch_scan.clusters)

    # Resolve the clusters into fluke boxes (per the original note: boxes
    # with exactly two enclosed tip points). The clustered points are
    # serialised above, before fluke_pos gets a chance to touch them.
    fluke_positions = fluke_pos(subj, sorted_tips, sorted_boxes)

    # Row with the resolved fluke boxes and points, ready to be written out.
    return {
        'subject_ids': subj,
        'filename': filename,
        'classifications': clas,
        'boxes': json.dumps(boxes),
        'box_clusters': box_points_json,
        'bclusters': box_clusters_json,
        'tips': json.dumps(tips),
        'tip_clusters': tip_points_json,
        'tclusters': tip_clusters_json,
        'notches': json.dumps(notches),
        'notch_clusters': notch_points_json,
        'nclusters': notch_clusters_json,
        'flukes': json.dumps(fluke_positions)
    }
コード例 #3
0
def _cluster_summary(points, ep, min_point):
    """Cluster *points* with DBSCAN and summarise the result for one category.

    Returns:
        A tuple ``(points_json, point_count, clusters_json, noise_json)``
        built from the scanner's ``points``, ``clusters`` and ``noise``.
    """
    scan = dbscan.DBSCAN(ep, min_point)
    scan.cluster(points)
    return (
        json.dumps(scan.points),
        len(scan.points),
        json.dumps(scan.clusters),
        json.dumps(scan.noise),
    )


def process_aggregation(subj, image, clas, ep, min_point, h_palms, flowring,
                        leafles):
    """Cluster one subject's markings and write the aggregated row.

    The row is written through the module-level ``writer`` object, which is
    defined elsewhere in the file and must provide ``writerow``.

    Args:
        subj: Subject identifier; printed and stored under ``'subject_ids'``.
        image: Image identifier stored under ``'image'``.
        clas: Number of classifications collected for the subject.
        ep: DBSCAN eps value shared by all three categories.
        min_point: DBSCAN minimum-points value shared by all three categories.
        h_palms: Points marked in the first category (``H_palm`` columns).
        flowring: Points marked in the second category (``flowering`` columns).
        leafles: Points marked in the third category (``leafless`` columns).

    Returns:
        True when a row was written, False when the subject was skipped
        because it has 10 or fewer classifications.
    """
    # Only subjects with MORE than 10 classifications are aggregated.
    if not clas > 10:
        return False

    hc_p, count_h, hclusters, hnoise = _cluster_summary(h_palms, ep, min_point)
    fc_p, count_f, fclusters, fnoise = _cluster_summary(flowring, ep, min_point)
    lc_p, count_l, lclusters, lnoise = _cluster_summary(leafles, ep, min_point)

    # Report and record the values passed in as arguments; the previous code
    # read same-purpose globals (subject, image_number, i) that this function
    # never defines.
    print(subj)
    writer.writerow({
        'subject_ids': subj,
        'image': image,
        'classifications': clas,
        'Count_h_palms': count_h,
        'H_palm_clusters': hc_p,
        'Hclusters': hclusters,
        'Hnoise': hnoise,
        'Count_flowering': count_f,
        'flowering_clusters': fc_p,
        'fclusters': fclusters,
        'fnoise': fnoise,
        'Count_leafless': count_l,
        'leafless_clusters': lc_p,
        'lclusters': lclusters,
        'lnoise': lnoise
    })
    return True
コード例 #4
0
ファイル: main.py プロジェクト: k-rajmani2k/DBSCAN
def main():
    """Load the CSV dataset, run DBSCAN on it, then print and plot the result."""
    csv_path = 'DBSCAN_data.csv'
    radius, min_pts = 5, 2

    # Load the input data.
    print("\nImporting csv file", csv_path, "...")
    dataset = dbscan.dataRead(csv_path)
    print("IMPORTING COMPLETE\n")

    # Build the clusterer and show what it was given.
    clusterer = dbscan.DBSCAN(dataset, radius, min_pts)
    print("The input dataset for our clustering is:")
    clusterer.displayDataset()

    # Run the clustering and report clusters and noise.
    print("Running DBSCAN clustering...")
    clusterer.runDBSCAN()
    print("CLUSTERING COMPLETE\n")
    print("The clusters are:")
    clusterer.displayClusters()
    print("The noise obtained after clustering are:")
    clusterer.displayNoise()

    # Hand both result lists to the plotting module (clusters are built
    # first, then noise, as arguments are evaluated left to right).
    graphplot.plot(clusterer.createClusterList(), clusterer.createNoiseList())
コード例 #5
0
# Number of samples in `a`, followed by the labels reported by `ag` for them.
print(len(a))

print(ag.GetLabels(len(a)))

# Compare the reference model `skk` with the local AGNES implementation on a
# reproducible synthetic blob dataset (300 samples, fixed random_state).
from sklearn import datasets
blobs, _ = datasets.make_blobs(n_samples=300, random_state=10)

ag = agnes.Agnes(3)
skk.fit(blobs)
print("sklearn clusters")
print(skk.labels_[:30])
ag.Fit(blobs)
print('agnes clusters')
# `np.int` was removed in NumPy 1.24; it was only an alias of the builtin
# `int`, so casting with `int` is identical and works on every NumPy version.
print(ag.GetLabels(len(blobs)).astype(int)[:30])

# DBSCAN TESTING
# Progress messages go to stderr so they stay separate from the printed
# results on stdout.
sys.stderr.write("Importing DBSCAN\n")
import dbscan
sys.stderr.write("Importing Finished\n")
sys.stderr.write("Initting\n")
db = dbscan.DBSCAN(0.5, 5)
sys.stderr.write("Finished Init\n")
sys.stderr.write("Fitting\n")
db.Fit(a)
sys.stderr.write("Finished Fitting\n")
sys.stderr.write("Cluster Representation\n")
# Print the number of samples and the labels db.GetLabels returns for them.
print(len(a))
print(db.GetLabels(len(a)))
コード例 #6
0
    for i in range(n):
        c = agnes.Agnes(3, 'wards')
        t0 = time.time()
        c.Fit(blobs)
        t1 = time.time()
        t_total += (t1-t0)
    aimpl = t_total/n
    

    t_total = 0.0
    for i in range(n):
        c = cluster.DBSCAN(1.5, 5)
        t0 = time.time()
        c.fit(blobs)
        t1 = time.time()
        t_total += (t1-t0)
    dref = t_total/n

    
    t_total = 0.0
    for i in range(n):
        c = dbscan.DBSCAN(1.5, 5)
        t0 = time.time()
        c.Fit(blobs)
        t1 = time.time()
        t_total += (t1-t0)
    dimpl = t_total/n

    print('{:<5d}\t{:<10.4f}\t{:<10.4f}\t{:<10.4f}\t{:<10.4f}\t{:<10.4f}\t{:<10.4f}'.format(num, kref, kimpl, aref, aimpl, dref, dimpl))

コード例 #7
0
    for p in range(0, len(D)):
        for q in range(0, len(D)):
            distance[p, q] = numpy.linalg.norm(D[p] - D[q])
    return numpy.sort(distance, axis=0)


# Load the tab-separated dataset; column 1 holds the ground-truth labels and
# columns 2 onward hold the features.
data = pd.read_csv('cho.txt', header=None, sep='\t')
#data = pd.read_csv('iyer.txt', header=None, sep='\t')
#data = pd.read_csv('new_dataset_1.txt', header=None, sep='\t')
data = data.values
data_ground_truth, data_features = data[:, 1], data[:, 2:]

# Step 1 - choose eps.
# For each candidate MinPts value i (range(3, 20), i.e. 3 through 19) run
# epsOpt with i - 1 to obtain the sorted distance plot. The best eps for a
# given MinPts is the gap point of its plot; the final eps is the average of
# those per-MinPts values, read off the plots by hand.
for i in range(3, 20):
    epsOpt(i - 1, data_features)

# Step 2 - choose MinPts by iteration.
# After the first run has fixed eps, substitute it into the DBSCAN call below
# and run again; keep the MinPts-eps pair with the largest Rand index.
for j in range(3, 20):
    data_id = dbscan.DBSCAN(data_features, 1.3, j)
    ARI = dbscan.adjusted_rand_score(data_ground_truth, data_id)
    print('The Rand Index of eps {} MinPts {} is {}'.format(1.3, j, ARI))
コード例 #8
0
# as usual the module must be in the current directory or in the sys path.
import dbscan

# Input points: a plain list of [x, y] entries. Lists and tuples are mixed on
# purpose, and some entries carry an extra 'label' element.
data = [
    (629.1, 187.4),
    [636.5, 73.7],
    [474.4, 300.0],
    [541.7, 476.9],
    [544.9, 471.6],
    [529.1, 494.8, 'label'],
    (533.8, 473.2),
    [508.0, 362.1, 'label'],
    [485.9, 246.3],
    [484.9, 251.6],
    [370.1, 253.7],
    [604.7, 271.6],
    (607.0, 288.4),
    [603.8, 297.9],
    [719.6, 333.7],
]

# Pick the clustering parameters and report them.
eps = 30
min_points = 3
print('epsilon =', eps, '  min_points =', min_points)

# Create a DBSCAN instance with those parameters and cluster the data.
scan = dbscan.DBSCAN(eps, min_points)
scan.cluster(data)

# The results are available as attributes of the scanner.
print('clusters found:', scan.clusters)
print('number_of_clusters =', len(scan.clusters))
print('noise ie points in no cluster:', scan.noise)

# Serialise to JSON strings for storage in a known format. Brackets and
# quotes in the JSON text will differ slightly from the Python repr.
clusters = json.dumps(scan.clusters)
noise = json.dumps(scan.noise)
print('ready to write clusters =', clusters)
print('ready to write noise =', noise)