Example #1
def test_dbscan_core_samples_toy():
    X = [[0], [2], [3], [4], [6], [8], [10]]
    n_samples = len(X)

    for algorithm in ['brute', 'kd_tree', 'ball_tree']:
        # Degenerate case: every sample is a core sample, either with its own
        # cluster or including other close core samples.
        core_samples, labels = dbscan(X, algorithm=algorithm, eps=1,
                                      min_samples=1)
        assert_array_equal(core_samples, np.arange(n_samples))
        assert_array_equal(labels, [0, 1, 1, 1, 2, 3, 4])

        # With eps=1 and min_samples=2 only the 3 samples from the denser area
        # are core samples. All other points are isolated and considered noise.
        core_samples, labels = dbscan(X, algorithm=algorithm, eps=1,
                                      min_samples=2)
        assert_array_equal(core_samples, [1, 2, 3])
        assert_array_equal(labels, [-1, 0, 0, 0, -1, -1, -1])

        # Only the sample in the middle of the dense area is core. Its two
        # neighbors are edge samples. Remaining samples are noise.
        core_samples, labels = dbscan(X, algorithm=algorithm, eps=1,
                                      min_samples=3)
        assert_array_equal(core_samples, [2])
        assert_array_equal(labels, [-1, 0, 0, 0, -1, -1, -1])

        # It's no longer possible to extract core samples with eps=1:
        # everything is noise.
        core_samples, labels = dbscan(X, algorithm=algorithm, eps=1,
                                      min_samples=4)
        assert_array_equal(core_samples, [])
        assert_array_equal(labels, -np.ones(n_samples))
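For reference, the function form used above and the DBSCAN estimator class are interchangeable; a minimal standalone sketch (not part of the original test) checking this for the min_samples=2 case:

import numpy as np
from sklearn.cluster import DBSCAN, dbscan

X_toy = [[0], [2], [3], [4], [6], [8], [10]]
core, labels = dbscan(X_toy, eps=1, min_samples=2)
est = DBSCAN(eps=1, min_samples=2).fit(X_toy)
assert np.array_equal(est.core_sample_indices_, core)   # [1, 2, 3]
assert np.array_equal(est.labels_, labels)              # [-1, 0, 0, 0, -1, -1, -1]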
Example #2
def test_dbscan_sparse():
    core_sparse, labels_sparse = dbscan(sparse.lil_matrix(X),
                                        eps=.8,
                                        min_samples=10)
    core_dense, labels_dense = dbscan(X, eps=.8, min_samples=10)
    assert_array_equal(core_dense, core_sparse)
    assert_array_equal(labels_dense, labels_sparse)
Example #3
def test_dbscan_sparse():
    core_sparse, labels_sparse = dbscan(sparse.lil_matrix(X), eps=.8,
                                        min_samples=10, random_state=0)
    core_dense, labels_dense = dbscan(X, eps=.8, min_samples=10,
                                      random_state=0)
    assert_array_equal(core_dense, core_sparse)
    assert_array_equal(labels_dense, labels_sparse)
Example #4
def test_boundaries():
    # ensure min_samples is inclusive of core point
    core, _ = dbscan([[0], [1]], eps=2, min_samples=2)
    assert_in(0, core)
    # ensure eps is inclusive of circumference
    core, _ = dbscan([[0], [1], [1]], eps=1, min_samples=2)
    assert_in(0, core)
    core, _ = dbscan([[0], [1], [1]], eps=.99, min_samples=2)
    assert_not_in(0, core)
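The same boundary behaviour, restated as a standalone sketch with plain asserts for readers without the nose-style assert_in / assert_not_in helpers:

from sklearn.cluster import dbscan

core, _ = dbscan([[0], [1]], eps=2, min_samples=2)
assert 0 in core           # min_samples counts the point itself
core, _ = dbscan([[0], [1], [1]], eps=1, min_samples=2)
assert 0 in core           # points exactly at distance eps are neighbours
core, _ = dbscan([[0], [1], [1]], eps=.99, min_samples=2)
assert 0 not in core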
Example #5
def test_dbscan_sparse_precomputed():
    D = pairwise_distances(X)
    core_sparse, labels_sparse = dbscan(sparse.lil_matrix(D),
                                        eps=.8,
                                        min_samples=10,
                                        metric='precomputed')
    core_dense, labels_dense = dbscan(X, eps=.8, min_samples=10)
    assert_array_equal(core_dense, core_sparse)
    assert_array_equal(labels_dense, labels_sparse)
Example #6
def test_dbscan_sparse_precomputed():
    D = pairwise_distances(X)
    nn = NearestNeighbors(radius=0.9).fit(X)
    D_sparse = nn.radius_neighbors_graph(mode="distance")
    # Ensure it is sparse not merely on diagonals:
    assert D_sparse.nnz < D.shape[0] * (D.shape[0] - 1)
    core_sparse, labels_sparse = dbscan(D_sparse, eps=0.8, min_samples=10, metric="precomputed")
    core_dense, labels_dense = dbscan(D, eps=0.8, min_samples=10, metric="precomputed")
    assert_array_equal(core_dense, core_sparse)
    assert_array_equal(labels_dense, labels_sparse)
Example #7
def test_dbscan_input_not_modified(use_sparse, metric):
    # test that the input is not modified by dbscan
    X = np.random.RandomState(0).rand(10, 10)
    X = sparse.csr_matrix(X) if use_sparse else X
    X_copy = X.copy()
    dbscan(X, metric=metric)

    if use_sparse:
        assert_array_equal(X.toarray(), X_copy.toarray())
    else:
        assert_array_equal(X, X_copy)
Example #8
def test_dbscan_sparse_precomputed():
    D = pairwise_distances(X)
    nn = NearestNeighbors(radius=.9).fit(X)
    D_sparse = nn.radius_neighbors_graph(mode='distance')
    # Ensure it is sparse not merely on diagonals:
    assert D_sparse.nnz < D.shape[0] * (D.shape[0] - 1)
    core_sparse, labels_sparse = dbscan(D_sparse,
                                        eps=.8,
                                        min_samples=10,
                                        metric='precomputed')
    core_dense, labels_dense = dbscan(D, eps=.8, min_samples=10,
                                      metric='precomputed')
    assert_array_equal(core_dense, core_sparse)
    assert_array_equal(labels_dense, labels_sparse)
Example #9
def test_dbscan_sparse_precomputed_different_eps():
    # test that precomputed neighbors graph is filtered if computed with
    # a radius larger than DBSCAN's eps.
    lower_eps = 0.2
    nn = NearestNeighbors(radius=lower_eps).fit(X)
    D_sparse = nn.radius_neighbors_graph(X, mode='distance')
    dbscan_lower = dbscan(D_sparse, eps=lower_eps, metric='precomputed')

    higher_eps = lower_eps + 0.7
    nn = NearestNeighbors(radius=higher_eps).fit(X)
    D_sparse = nn.radius_neighbors_graph(X, mode='distance')
    dbscan_higher = dbscan(D_sparse, eps=lower_eps, metric='precomputed')

    assert_array_equal(dbscan_lower[0], dbscan_higher[0])
    assert_array_equal(dbscan_lower[1], dbscan_higher[1])
Example #10
def test_dbscan_balltree():
    # Tests the DBSCAN algorithm with balltree for neighbor calculation.
    eps = 0.8
    min_samples = 10

    D = pairwise_distances(X)
    core_samples, labels = dbscan(D, metric="precomputed", eps=eps, min_samples=min_samples)

    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, n_clusters)

    db = DBSCAN(p=2.0, eps=eps, min_samples=min_samples, algorithm="ball_tree")
    labels = db.fit(X).labels_

    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)

    db = DBSCAN(p=2.0, eps=eps, min_samples=min_samples, algorithm="kd_tree")
    labels = db.fit(X).labels_

    n_clusters_3 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_3, n_clusters)

    db = DBSCAN(p=1.0, eps=eps, min_samples=min_samples, algorithm="ball_tree")
    labels = db.fit(X).labels_

    n_clusters_4 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_4, n_clusters)

    db = DBSCAN(leaf_size=20, eps=eps, min_samples=min_samples, algorithm="ball_tree")
    labels = db.fit(X).labels_

    n_clusters_5 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_5, n_clusters)
Example #11
def angle_predict(hits, m, i, rz_shift, eps, weights):
    # Unroll the helix: shift the azimuthal angle by an amount that grows with
    # the radius (the /180*3.141 factor is a pi/180 degree-to-radian conversion).
    aa = hits.a + m * (hits.r + 0.000005 * (hits.r ** 2)) / 1000 * (i / 2) / 180 * 3.141

    hits['f0'] = np.sin(aa)
    hits['f1'] = np.cos(aa)

    hits_b = hits[hits.type == 'b']['hit_id']
    hits_c = hits[hits.type == 'c']['hit_id']

    # Standardize the features, then weight them differently for type 'b' and
    # type 'c' hits (rows are looked up by hit_id - 1).
    ss = StandardScaler()
    X = ss.fit_transform(np.column_stack([hits.f0.values, hits.f1.values,
                                          hits.z1.values, hits.z2.values,
                                          hits.xr.values, hits.yr.values]))

    X_b = np.multiply(np.vstack([X[ex - 1] for ex in hits_b.values]), weights[0])
    X_c = np.multiply(np.vstack([X[ex - 1] for ex in hits_c.values]), weights[1])

    Xw = np.zeros(X.shape)

    Xw[hits_b.values - 1] = X_b[range(len(hits_b.values))]
    Xw[hits_c.values - 1] = X_c[range(len(hits_c.values))]

    # Widen eps slightly with the unrolling step index.
    eps = eps + (i * 0.000005)

    _, labels = dbscan(Xw, eps=eps, min_samples=1, algorithm='auto', n_jobs=4)

    # Per-hit cluster sizes; cluster 0 and implausibly large clusters are zeroed
    # out so they are not treated as track candidates.
    unique, reverse, count = np.unique(labels, return_counts=True, return_inverse=True)
    c = count[reverse]
    c[np.where(labels == 0)] = 0
    if abs(rz_shift) < 0.1:
        c[np.where(c > 20)] = 0
    else:
        c[np.where(c > 8)] = 0
    return (labels, c)
Example #12
def get_features(sub, cluster_size=10):
    """
    Input: dataframe with hits long tracks
    Output: array with features of long track
    """
    hitst = sub.copy()
    X = np.column_stack([
        hitst.x.values, hitst.y.values, hitst.z.values,
        hitst.track_id.values * 1000000
    ])
    _, hitst['labels'] = dbscan(X,
                                eps=cluster_size,
                                min_samples=1,
                                algorithm='ball_tree',
                                metric='euclidean')
    gp = hitst.groupby('track_id').agg({
        'hit_id': 'count',
        'labels': 'nunique',
        'volume_id': 'min',
        'x': ['min', 'max', 'var'],
        'y': ['min', 'max', 'var'],
        'z': ['min', 'max', 'var', 'mean']
    })

    gp.columns = ["".join(t) for t in gp.columns.ravel()]
    gp = gp.rename(
        columns={
            'hit_idcount': 'nhits',
            'labelsnunique': 'nclusters',
            'volume_idmin': 'svolume'
        }).reset_index()
    gp['nhitspercluster'] = gp.nhits / gp.nclusters
    return gp
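A standalone sketch (the toy coordinates are an assumption, not taken from the original code) of the trick used above: appending track_id * 1000000 as an extra coordinate keeps hits from different tracks far apart, so DBSCAN can only form clusters within a single track:

import numpy as np
from sklearn.cluster import dbscan

xyz = np.array([[0., 0., 0.], [1., 1., 1.], [100., 100., 100.]])
track_id = np.array([1, 1, 2])
X_toy = np.column_stack([xyz, track_id * 1000000])
_, labels = dbscan(X_toy, eps=10, min_samples=1)
print(labels.tolist())   # [0, 0, 1] -- the two tracks never share a cluster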
Example #13
def test_dbscan_callable():
    # Tests the DBSCAN algorithm with a callable metric.
    # Parameters chosen specifically for this task.
    # Different eps to other test, because distance is not normalised.
    eps = 0.8
    min_samples = 10
    # metric is the function reference, not the string key.
    metric = distance.euclidean
    # Compute DBSCAN
    # parameters chosen for task
    core_samples, labels = dbscan(X,
                                  metric=metric,
                                  eps=eps,
                                  min_samples=min_samples,
                                  algorithm='ball_tree')

    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, n_clusters)

    db = DBSCAN(metric=metric,
                eps=eps,
                min_samples=min_samples,
                algorithm='ball_tree')
    labels = db.fit(X).labels_

    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)
Example #14
    def predict(self, dfh):
        print("len(dfh) : {0}".format(len(dfh)))

        if ("rt" not in dfh.columns):
            dfh["rt"] = np.sqrt(dfh['x'].values**2 + dfh['y'].values**2)

        z = dfh['z'].values
        rt = dfh["rt"].values
        r = np.sqrt(dfh['x']**2 + dfh['y']**2 + dfh['z']**2)
        a0 = np.arctan2(dfh['y'].values, dfh['x'].values)
        layer_id = dfh['layer_id'].values.astype(np.float32)

        sys.stderr.write("dbscan for each parameters\n")
        scan_labels_list = []
        for (dj, di) in tqdm(product(self.djs, self.dis),
                             total=len(self.djs) * len(self.dis)):
            ar = a0 + di * rt
            zr = (z + dj) / rt * 0.1
            if (self.param_type == 0):
                params = [ar, zr]
            elif (self.param_type == 1):
                params = [np.sin(ar), np.cos(ar), zr * 10, 1 / (10 * zr)]
            elif (self.param_type == 2):
                params = [np.sin(ar), np.cos(ar), zr]
            else:
                raise RuntimeError("invalid param_type.")

            if (self.weight is None):
                w = np.array([1.0 for _ in params])
            else:
                w = np.array(self.weight)

            ss = StandardScaler()
            data1 = ss.fit_transform(np.column_stack(params))
            data2 = w[np.newaxis, :] * data1

            _, scan_label = dbscan(
                data2,
                eps=self.eps,
                min_samples=1,
            )
            scan_labels_list.append(scan_label)

        sys.stderr.write("clustering\n")
        dfh["s1"] = dfh.hit_id
        dfh["N1"] = 1
        for scan_labels in scan_labels_list:
            dfh["s2"] = scan_labels
            dfh["N2"] = dfh.groupby('s2')['s2'].transform('count')
            maxs1 = np.max(dfh.s1)
            dfh.s1 = np.where((dfh.N2 > dfh.N1) & (dfh.N2 < 20),
                              dfh.s2 + maxs1, dfh.s1)
            dfh['s1'] = dfh['s1'].astype('int64')
            dfh['N1'] = dfh.groupby('s1')['s1'].transform('count')
        labels = dfh['s1']

        return labels
Example #15
    def predict(self, hits, weights):
        x = hits.x.values
        y = hits.y.values
        z = self.rz_scale * hits.z.values

        r = np.sqrt(x**2 + y**2)
        d = np.sqrt(x**2 + y**2 + z**2)
        a = np.arctan2(y, x)
        zr = z / r
        dr = d / r
        hits['d'] = d

        w0, w1, w2, w3, w4 = weights

        ss = StandardScaler()

        results = []
        dzi = -0.00010
        for step in [11]:  #range(21): #0.00060/121/-60
            dz = dzi + (step * 0.00001)
            f0 = w0 * (a + (dz * z * np.sign(z)))
            f1 = w1 * (zr)
            f2 = w2 * (f0 / zr)
            f3 = w3 * (1 / zr)
            f4 = w4 * (f2 + f3)

            X = ss.fit_transform(np.column_stack([f0, f1, f2, f3, f4]))

            eps = self.eps - (abs(step - 10) * 0.000015)

            _, labels = dbscan(X,
                               eps=eps,
                               min_samples=1,
                               algorithm='auto',
                               n_jobs=4)

            unique, reverse, count = np.unique(labels,
                                               return_counts=True,
                                               return_inverse=True)
            c = count[reverse]
            c[np.where(labels == 0)] = 0
            c[np.where(c > 20)] = 0
            results.append((labels, c))

        labels, counts = results[0]

        # Merge the per-step results: a hit keeps the label from the step that
        # placed it in the largest valid cluster.
        for i in range(1, len(results)):
            l, c = results[i]
            idx = np.where((c - counts > 0))[0]
            labels[idx] = l[idx] + labels.max()
            counts[idx] = c[idx]

        return labels
Example #16
def find_labels(params):
    hits, dz = params
    a = hits['phi'].values
    z = hits['z'].values
    zr = hits['zr'].values
    # Unroll the helix angle in proportion to |z|, then cluster the hits in the
    # standardized (cos, sin, z/r) feature space.
    aa = a + np.sign(z) * dz * z

    f0 = np.cos(aa)
    f1 = np.sin(aa)
    f2 = zr
    X = StandardScaler().fit_transform(np.column_stack([f0, f1, f2]))

    _, l = dbscan(X, eps=0.0045, min_samples=1, n_jobs=4)
    # Shift labels so that track ids start at 1 instead of 0.
    return l + 1
Example #17
def seed_tracks(event_id, df, start_d, end_d, ax):
    seed = df.loc[df.d > start_d]
    seed = seed.loc[seed.d < end_d]
    N = len(seed)

    p = seed[['particle_id']].values.astype(np.int64)
    x, y, z, r, a, cosa, sina, phi = seed[[
        'x', 'y', 'z', 'r', 'a', 'cosa', 'sina', 'phi'
    ]].values.astype(np.float32).T

    particle_ids = np.unique(p)
    particle_ids = particle_ids[particle_ids != 0]
    num_particle_ids = len(particle_ids)

    # do dbscan here =======================================
    data = np.column_stack([a, z / r * 0.1])

    _, l = dbscan(
        data,
        eps=0.01,
        min_samples=1,
    )

    #print(len(truth))
    #print(len(seed))
    #print(len(submission))
    #print(len(l))

    seed['l'] = pd.Series(l, index=seed.index)
    #print(seed)
    submission = pd.DataFrame(columns=['event_id', 'hit_id', 'track_id'],
                              data=np.column_stack(([
                                  int(event_id),
                              ] * len(seed), seed.hit_id.values,
                                                    l))).astype(int)

    score = score_event_fast(seed, submission)
    print(score)

    predicted_tracks, counts = np.unique(l, return_counts=True)
    predicted_tracks = predicted_tracks[counts > 1]

    for predicted_track in predicted_tracks[::100]:
        track_hits = seed[seed.l == predicted_track]
        ax.plot(xs=track_hits.a, ys=track_hits.r, zs=track_hits.z)
Example #18
def get_base_partitioning(distance_matrix, eps=0.5, min_samples=2):
  """ Gets the base partitioning from the distance matrix using DBScan
    algorithm.

  Args:
    distance_matrix: a list of lists with the distances of references.

  Returns:
    A list of integers from 0 to k - 1, each one representing a block for the
      reference represented by the index.
  """
  labels = dbscan(np.array(distance_matrix), metric='precomputed', eps=eps, 
      min_samples=min_samples)
  next_label = max(labels[1]) + 1
  for i in range(len(labels[1])):
    if labels[1][i] == -1:
      labels[1][i] = next_label
      next_label += 1
  return labels[1].tolist(), number_of_clusters(labels[1])
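A minimal standalone sketch (the toy distance matrix is an assumption, not from the original code) of the relabeling step in get_base_partitioning above: DBSCAN noise points (label -1) are turned into singleton blocks:

import numpy as np
from sklearn.cluster import dbscan

D_toy = np.array([[0.0, 0.1, 0.9],
                  [0.1, 0.0, 0.9],
                  [0.9, 0.9, 0.0]])   # the third reference is far from the first two
_, labels = dbscan(D_toy, metric='precomputed', eps=0.5, min_samples=2)
next_label = labels.max() + 1
for i in range(len(labels)):
    if labels[i] == -1:
        labels[i] = next_label
        next_label += 1
print(labels.tolist())   # [0, 0, 1]: the isolated reference became its own block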
Example #19
def test_dbscan_similarity():
    # Tests the DBSCAN algorithm with a similarity array.
    # Parameters chosen specifically for this task.
    eps = 0.15
    min_samples = 10
    # Compute similarities
    D = distance.squareform(distance.pdist(X))
    D /= np.max(D)
    # Compute DBSCAN
    core_samples, labels = dbscan(D, metric="precomputed", eps=eps, min_samples=min_samples)
    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - (1 if -1 in labels else 0)

    assert_equal(n_clusters_1, n_clusters)

    db = DBSCAN(metric="precomputed", eps=eps, min_samples=min_samples)
    labels = db.fit(D).labels_

    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)
Example #20
def test_dbscan_feature():
    # Tests the DBSCAN algorithm with a feature vector array.
    # Parameters chosen specifically for this task.
    # Different eps to other test, because distance is not normalised.
    eps = 0.8
    min_samples = 10
    metric = "euclidean"
    # Compute DBSCAN
    # parameters chosen for task
    core_samples, labels = dbscan(X, metric=metric, eps=eps, min_samples=min_samples)

    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, n_clusters)

    db = DBSCAN(metric=metric, eps=eps, min_samples=min_samples)
    labels = db.fit(X).labels_

    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)
Example #21
def test_dbscan_balltree():
    # Tests the DBSCAN algorithm with balltree for neighbor calculation.
    eps = 0.8
    min_samples = 10

    D = pairwise_distances(X)
    core_samples, labels = dbscan(D,
                                  metric="precomputed",
                                  eps=eps,
                                  min_samples=min_samples)

    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, n_clusters)

    db = DBSCAN(p=2.0, eps=eps, min_samples=min_samples, algorithm='ball_tree')
    labels = db.fit(X).labels_

    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)

    db = DBSCAN(p=2.0, eps=eps, min_samples=min_samples, algorithm='kd_tree')
    labels = db.fit(X).labels_

    n_clusters_3 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_3, n_clusters)

    db = DBSCAN(p=1.0, eps=eps, min_samples=min_samples, algorithm='ball_tree')
    labels = db.fit(X).labels_

    n_clusters_4 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_4, n_clusters)

    db = DBSCAN(leaf_size=20,
                eps=eps,
                min_samples=min_samples,
                algorithm='ball_tree')
    labels = db.fit(X).labels_

    n_clusters_5 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_5, n_clusters)
Example #22
def test_dbscan_callable():
    # Tests the DBSCAN algorithm with a callable metric.
    # Parameters chosen specifically for this task.
    # Different eps to other test, because distance is not normalised.
    eps = 0.8
    min_samples = 10
    # metric is the function reference, not the string key.
    metric = distance.euclidean
    # Compute DBSCAN
    # parameters chosen for task
    core_samples, labels = dbscan(X, metric=metric, eps=eps, min_samples=min_samples, algorithm="ball_tree")

    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, n_clusters)

    db = DBSCAN(metric=metric, eps=eps, min_samples=min_samples, algorithm="ball_tree")
    labels = db.fit(X).labels_

    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)
Example #23
def get_base_partitioning(distance_matrix, eps=0.5, min_samples=2):
    """ Gets the base partitioning from the distance matrix using DBScan
    algorithm.

  Args:
    distance_matrix: a list of lists with the distances of references.

  Returns:
    A list of integers from 0 to k - 1, each one representing a block for the
      reference represented by the index.
  """
    labels = dbscan(np.array(distance_matrix),
                    metric='precomputed',
                    eps=eps,
                    min_samples=min_samples)
    next_label = max(labels[1]) + 1
    for i in range(len(labels[1])):
        if labels[1][i] == -1:
            labels[1][i] = next_label
            next_label += 1
    return labels[1].tolist(), number_of_clusters(labels[1])
Example #24
def test_dbscan_feature():
    # Tests the DBSCAN algorithm with a feature vector array.
    # Parameters chosen specifically for this task.
    # Different eps to other test, because distance is not normalised.
    eps = 0.8
    min_samples = 10
    metric = 'euclidean'
    # Compute DBSCAN
    # parameters chosen for task
    core_samples, labels = dbscan(X, metric=metric, eps=eps,
                                  min_samples=min_samples)

    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, n_clusters)

    db = DBSCAN(metric=metric, eps=eps, min_samples=min_samples)
    labels = db.fit(X).labels_

    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)
Example #25
def test_dbscan_similarity():
    # Tests the DBSCAN algorithm with a similarity array.
    # Parameters chosen specifically for this task.
    eps = 0.15
    min_samples = 10
    # Compute similarities
    D = distance.squareform(distance.pdist(X))
    D /= np.max(D)
    # Compute DBSCAN
    core_samples, labels = dbscan(D, metric="precomputed", eps=eps,
                                  min_samples=min_samples)
    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - (1 if -1 in labels else 0)

    assert_equal(n_clusters_1, n_clusters)

    db = DBSCAN(metric="precomputed", eps=eps, min_samples=min_samples)
    labels = db.fit(D).labels_

    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)
Example #26
def img_association(network, propagate_loader, min_sample=4, eps=0,
                    rerank=False, k1=20, k2=6, intra_id_reinitialize=False):

    network.eval()
    print('Start Inference...')
    features = []
    global_labels = []
    all_cams = []

    with torch.no_grad():
        for c, data in enumerate(propagate_loader):
            images = data[0]
            g_label = data[3]
            cam = data[4]

            embed_feat = network(images)
            features.append(embed_feat.cpu())

            global_labels.append(g_label)
            all_cams.append(cam)

    features = torch.cat(features, dim=0).numpy()
    global_labels = torch.cat(global_labels, dim=0).numpy()
    all_cams = torch.cat(all_cams, dim=0).numpy()
    print('  features: shape= {}'.format(features.shape))

    # if needed, average camera-style transferred image features
    new_features = []
    new_cams = []
    for glab in np.unique(global_labels):
        idx = np.where(global_labels == glab)[0]
        new_features.append(np.mean(features[idx], axis=0))
        new_cams.append(all_cams[idx])

    new_features = np.array(new_features)
    new_cams = np.array(new_cams).squeeze()
    del features, all_cams

    # compute distance W
    new_features = new_features / np.linalg.norm(new_features, axis=1, keepdims=True)  # l2-normalize
    if rerank:
        W = faiss_compute_jaccard_dist(torch.from_numpy(new_features), k1=k1, k2=k2)
    else:
        W = cdist(new_features, new_features, 'euclidean')
    print('  distance matrix: shape= {}'.format(W.shape))

    # self-similarity for association
    print('  perform image grouping...')
    _, updated_label = dbscan(W, eps=eps, min_samples=min_sample, metric='precomputed', n_jobs=8)
    print('  eps in cluster: {:.3f}'.format(eps))
    print('  updated_label: num_class= {}, {}/{} images are associated.'
          .format(updated_label.max() + 1, len(updated_label[updated_label >= 0]), len(updated_label)))

    if intra_id_reinitialize:
        print('re-computing initialized intra-ID feature...')
        intra_id_features = []
        intra_id_labels = []
        for cc in np.unique(new_cams):
            percam_ind = np.where(new_cams == cc)[0]
            percam_feature = new_features[percam_ind, :]
            percam_label = updated_label[percam_ind]
            percam_class_num = len(np.unique(percam_label[percam_label >= 0]))
            percam_id_feature = np.zeros((percam_class_num, percam_feature.shape[1]), dtype=np.float32)
            cnt = 0
            for lbl in np.unique(percam_label):
                if lbl >= 0:
                    ind = np.where(percam_label == lbl)[0]
                    id_feat = np.mean(percam_feature[ind], axis=0)
                    percam_id_feature[cnt, :] = id_feat
                    intra_id_labels.append(lbl)
                    cnt += 1
            percam_id_feature = percam_id_feature / np.linalg.norm(percam_id_feature, axis=1, keepdims=True)
            intra_id_features.append(torch.from_numpy(percam_id_feature))
        return updated_label, intra_id_features
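A minimal standalone sketch (random features and the eps value are assumptions) of the grouping step above, mirroring the non-rerank branch: cluster l2-normalized features through their pairwise euclidean distances with a precomputed-metric DBSCAN:

import numpy as np
from scipy.spatial.distance import cdist
from sklearn.cluster import dbscan

feats = np.random.RandomState(0).rand(50, 8)
feats = feats / np.linalg.norm(feats, axis=1, keepdims=True)   # l2-normalize
W = cdist(feats, feats, 'euclidean')
_, updated_label = dbscan(W, eps=0.5, min_samples=4, metric='precomputed')
print('num_class =', updated_label.max() + 1)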
Example #27
def copac(X,
          k=10,
          mu=5,
          eps=0.5,
          alpha=0.85,
          metric='euclidean',
          metric_params=None,
          algorithm='auto',
          leaf_size=30,
          p=None,
          n_jobs=1,
          sample_weight=None):
    """Perform COPAC clustering from vector array.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        A feature array.
    k : int, optional, default=10
        Size of local neighborhood for local correlation dimensionality.
        The paper suggests k >= 3 * n_features.
    mu : int, optional, default=5
        Minimum number of points in a cluster, with mu <= k.
    eps : float, optional, default=0.5
        Neighborhood predicate, so that neighbors are closer than `eps`.
    alpha : float in ]0,1[, optional, default=0.85
        Threshold of how much variance needs to be explained by Eigenvalues.
        Assumed to be robust in range 0.8 <= alpha <= 0.9 [see Ref.]
    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by sklearn.metrics.pairwise.pairwise_distances
        for its metric parameter.
        If metric is "precomputed", `X` is assumed to be a distance matrix and
        must be square.
    metric_params : dict, optional
        Additional keyword arguments for the metric function.
    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        The algorithm to be used by the scikit-learn NearestNeighbors module
        to compute pointwise distances and find nearest neighbors.
        See NearestNeighbors module documentation for details.
    leaf_size : int, optional (default = 30)
        Leaf size passed to BallTree or cKDTree. This can affect the speed
        of the construction and query, as well as the memory required
        to store the tree. The optimal value depends
        on the nature of the problem.
    p : float, optional
        The power of the Minkowski metric to be used to calculate distance
        between points.
    n_jobs : int, optional, default=1
        Number of parallel processes. Use all cores with n_jobs=-1.
    sample_weight : None
        Currently ignored

    Returns
    -------
    labels : array [n_samples]
        Cluster labels for each point. Noisy samples are given the label -1.

    References
    ----------
    Achtert, E., Böhm, C., Kriegel, H.-P., Kröger, P., Zimek, A.:
    Robust, Complete, and Efficient Correlation Clustering.
    In Proceedings of the Seventh SIAM International Conference
    on Data Mining, April 26-28, 2007, Minneapolis, Minnesota,
    USA (2007), pp. 413–418.
    """
    X = check_array(X)
    n, d = X.shape
    y = -np.ones(n, dtype=int)
    if n_jobs == -1:
        n_jobs = cpu_count()

    # Calculating M^ just once requires more memory, but saves computation
    lambda_ = np.zeros(n, dtype=int)
    M_hat = list()

    # Get nearest neighbors
    nn = NearestNeighbors(n_neighbors=k,
                          metric=metric,
                          algorithm=algorithm,
                          leaf_size=leaf_size,
                          metric_params=metric_params,
                          p=p,
                          n_jobs=n_jobs)
    nn.fit(X)
    knns = nn.kneighbors(return_distance=False)
    for P, knn in enumerate(knns):
        N_P = X[knn]

        # Covariance matrix of the local correlation neighborhood N_P
        Sigma = np.cov(N_P[:, :], rowvar=False, ddof=0)

        # Decompose spsd matrix, and sort Eigenvalues descending
        E, V = LA.eigh(Sigma)
        E = np.sort(E)[::-1]

        # Local correlation dimension
        explanation_portion = np.cumsum(E) / E.sum()
        lambda_P = np.searchsorted(explanation_portion, alpha, side='left')
        lambda_P += 1
        lambda_[P] = lambda_P
        # Correlation distance matrix
        E_hat = (np.arange(1, d + 1) > lambda_P).astype(int)
        M_hat.append(V @ np.diag(E_hat) @ V.T)

    # Group points by corr. dim.
    argsorted = np.argsort(lambda_)
    edges, _ = np.histogram(lambda_[argsorted], bins=np.arange(1, d + 2))
    Ds = np.split(argsorted, np.cumsum(edges))
    # Loop over partitions according to local corr. dim.
    max_label = 0
    used_y = np.zeros_like(y, dtype=int)
    for D in Ds:
        n_D = D.shape[0]
        cdist_P = -np.ones(n_D * (n_D - 1) // 2, dtype=float)
        cdist_Q = -np.ones((n_D, n_D), dtype=float)
        start = 0
        # Calculate triu part of distance matrix
        for i in range(0, n_D - 1):
            p = D[i]
            # Vectorized inner loop
            q = D[i + 1:n_D]
            stop = start + n_D - i - 1
            cdist_P[start:stop] = _cdist(X[p], X[q], M_hat[p])
            start = stop
        # Calculate tril part of distance matrix
        for i in range(1, n_D):
            q = D[i]
            p = D[0:i]
            cdist_Q[i, :i] = _cdist(X[q], X[p], M_hat[q])
        # Extract tril to 1D array
        # TODO simplify...
        cdist_Q = cdist_Q.T[np.triu_indices_from(cdist_Q, k=1)]
        cdist = np.block([[cdist_P], [cdist_Q]])
        # Square root of the higher value of cdist_P, cdist_Q
        cdist = np.sqrt(cdist.max(axis=0))

        # Perform DBSCAN with full distance matrix
        cdist = squareform(cdist)
        clust = dbscan(X=cdist,
                       eps=eps,
                       min_samples=mu,
                       metric='precomputed',
                       n_jobs=n_jobs)
        _, labels = clust
        # Each DBSCAN run is unaware of previous ones,
        # so we need to keep track of previous cluster IDs
        y_D = labels + max_label
        new_labels = np.unique(labels[labels >= 0]).size
        max_label += new_labels
        # Set cluster labels in `y`
        y[D] = y_D
        used_y[D] += 1
    assert np.all(used_y == 1), "Not all samples were handled exactly once!"
    return y
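The _cdist helper used above is not included in this example; a plausible sketch of it (an assumption based on the squared correlation distance from the COPAC paper, not the original implementation):

import numpy as np

def _cdist(p_vec, Q, M_hat_p):
    # Squared correlation distance from point p_vec to each row of Q, measured
    # under the projection matrix M_hat_p associated with p_vec.
    diff = Q - p_vec[np.newaxis, :]            # shape (n_q, d)
    return ((diff @ M_hat_p) * diff).sum(axis=1)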
Example #28
def test_weighted_dbscan():
    # ensure sample_weight is validated
    assert_raises(ValueError, dbscan, [[0], [1]], sample_weight=[2])
    assert_raises(ValueError, dbscan, [[0], [1]], sample_weight=[2, 3, 4])

    # ensure sample_weight has an effect
    assert_array_equal([], dbscan([[0], [1]], sample_weight=None,
                                  min_samples=6)[0])
    assert_array_equal([], dbscan([[0], [1]], sample_weight=[5, 5],
                                  min_samples=6)[0])
    assert_array_equal([0], dbscan([[0], [1]], sample_weight=[6, 5],
                                   min_samples=6)[0])
    assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[6, 6],
                                      min_samples=6)[0])

    # points within eps of each other:
    assert_array_equal([0, 1], dbscan([[0], [1]], eps=1.5,
                                      sample_weight=[5, 1], min_samples=6)[0])
    # and effect of non-positive and non-integer sample_weight:
    assert_array_equal([], dbscan([[0], [1]], sample_weight=[5, 0],
                                  eps=1.5, min_samples=6)[0])
    assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[5.9, 0.1],
                                      eps=1.5, min_samples=6)[0])
    assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[6, 0],
                                      eps=1.5, min_samples=6)[0])
    assert_array_equal([], dbscan([[0], [1]], sample_weight=[6, -1],
                                  eps=1.5, min_samples=6)[0])

    # for non-negative sample_weight, cores should be identical to repetition
    rng = np.random.RandomState(42)
    sample_weight = rng.randint(0, 5, X.shape[0])
    core1, label1 = dbscan(X, sample_weight=sample_weight)
    assert_equal(len(label1), len(X))

    X_repeated = np.repeat(X, sample_weight, axis=0)
    core_repeated, label_repeated = dbscan(X_repeated)
    core_repeated_mask = np.zeros(X_repeated.shape[0], dtype=bool)
    core_repeated_mask[core_repeated] = True
    core_mask = np.zeros(X.shape[0], dtype=bool)
    core_mask[core1] = True
    assert_array_equal(np.repeat(core_mask, sample_weight), core_repeated_mask)

    # sample_weight should work with precomputed distance matrix
    D = pairwise_distances(X)
    core3, label3 = dbscan(D, sample_weight=sample_weight,
                           metric='precomputed')
    assert_array_equal(core1, core3)
    assert_array_equal(label1, label3)

    # sample_weight should work with estimator
    est = DBSCAN().fit(X, sample_weight=sample_weight)
    core4 = est.core_sample_indices_
    label4 = est.labels_
    assert_array_equal(core1, core4)
    assert_array_equal(label1, label4)

    est = DBSCAN()
    label5 = est.fit_predict(X, sample_weight=sample_weight)
    core5 = est.core_sample_indices_
    assert_array_equal(core1, core5)
    assert_array_equal(label1, label5)
    assert_array_equal(label1, est.labels_)
Example #29
def study_dbscan_for_tracklet_seeding():

    ## load an event ---
    event_id = '000001029'

    data_dir = '/root/share/project/kaggle/cern/data/__download__/train_100_events'
    #detectors = pd.read_csv('/root/share/project/kaggle/cern/data/__download__/detectors.csv')
    particles = pd.read_csv(data_dir + '/event%s-particles.csv' % event_id)
    hits = pd.read_csv(data_dir + '/event%s-hits.csv' % event_id)
    truth = pd.read_csv(data_dir + '/event%s-truth.csv' % event_id)
    #cells = pd.read_csv(data_dir + '/event%s-cells.csv'%event_id)

    truth = truth.merge(hits, on=['hit_id'], how='left')
    truth = truth.merge(particles, on=['particle_id'], how='left')

    #--------------------------------------------------------
    df = truth.copy()
    df = df.assign(r=np.sqrt(df.x**2 + df.y**2))
    df = df.assign(d=np.sqrt(df.x**2 + df.y**2 + df.z**2))
    df = df.assign(a=np.arctan2(df.y, df.x))
    df = df.assign(cosa=np.cos(df.a))
    df = df.assign(sina=np.sin(df.a))
    df = df.assign(phi=np.arctan2(df.z, df.r))
    df = df.assign(momentum=np.sqrt(df.px**2 + df.py**2 + df.pz**2))
    df.loc[df.particle_id == 0, 'momentum'] = 0

    #df = df.loc[df.z>500] # consider dataset subset
    #df = df.loc[df.r<50 ] ## 0.04397/0.04750  (0.92569)

    df = df.loc[df.z > 500]
    df = df.loc[(df.r > 50) & (df.r < 100)]  ## 0.05259/0.05808  (0.90551)

    #df = df.loc[df.z>500]
    #df = df.loc[df.r<100] ## 0.09417/0.10557  (0.89195)

    #df = df.loc[(df.a>0) & (df.a<0.5)]
    #df = df.loc[(df.a>0) & (df.a<1)]

    # df = df.loc[df.z>500] # consider dataset subset
    # df = df.loc[(df.r>50) & (df.r<100)]

    #df = df.loc[(df.z>0) &(df.z<500)]
    #df = df.loc[df.r<200 ]
    #df = df.loc[(df.a>0) & (df.a<0.5)]
    #df = df.loc[(df.z>df.r)]
    #df = df.loc[(df.r>50) & (df.r<100) ]

    #-------------------------------------------------------
    N = len(df)

    layer_id = df['layer_id'].values.astype(np.float32)
    momentum = df['momentum'].values.astype(np.float32)
    p = df[['particle_id']].values.astype(np.int64)
    x, y, z, r, a, cosa, sina, phi = df[[
        'x', 'y', 'z', 'r', 'a', 'cosa', 'sina', 'phi'
    ]].values.astype(np.float32).T

    particle_ids = np.unique(p)
    particle_ids = particle_ids[particle_ids != 0]
    num_particle_ids = len(particle_ids)

    # do xxx =======================================
    #color = plt.cm.hsv( (z-z.min()) / (z.max()-z.min()))
    color = plt.cm.hsv(
        (layer_id - layer_id.min()) / (layer_id.max() + 1 - layer_id.min()))

    plot3d_particles(ax3d1, particle_ids, p, a, r, z, z)
    ax3d1.scatter(a, r, z, c=color, s=64, edgecolors='none')
    #plt.show()

    dj = 0
    di = 0
    EPS = 1e-12
    #if 1:

    candidates = []
    for dj in np.arange(-20, 20 + EPS, 10):
        for di in np.arange(-0.003, 0.003 + EPS, 0.00025):
            ar = a + di * r
            zr = (z + dj) / r * 0.1
            data2 = np.column_stack([ar, zr])

            _, l = dbscan(
                data2,
                eps=0.0025,
                min_samples=1,
            )
            track_ids = np.unique(l)
            track_ids = track_ids[track_ids != 0]
            neighbour = [np.where(l == t)[0] for t in track_ids]

            unique, inverse, c = np.unique(l,
                                           return_counts=True,
                                           return_inverse=True)
            unique = unique[unique != 0]
            c = c[inverse]
            c[l == 0] = 0

            for u in unique:
                candidate = np.where(l == u)[0]
                candidates.append(candidate)

    #---
    #<todo>
    #fix angle discontinuity problem here ...

    #-----
    #sort
    count = np.array([len(candidate) for candidate in candidates])
    sort = np.argsort(-count)
    candidates = [candidates[s] for s in sort]

    #show
    max_label = 1
    label = np.zeros(N, np.int32)
    count = np.zeros(N, np.int32)

    for candidate in candidates:
        n = candidate
        L = len(n)
        #print(L)

        #---- filtering (secret sauce) ----------
        #if L<3: continue
        n = n[np.argsort(np.fabs(z[n]))]

        layer_id0 = layer_id[n[:-1]]
        layer_id1 = layer_id[n[1:]]
        ld = layer_id1 - layer_id0
        if np.any(ld > 2): continue

        m = count[n].max()
        if L < m: continue

        #---- filtering ----------------------

        count[n] = L
        label[n] = max_label
        max_label += 1

        ## show:
        if L >= 3:
            #c = np.random.uniform(0,1,3)#[0,0,0]
            c = [0, 0, 0]

            #ax3d1.clear()
            #plot_particles(ax3d1, particle_ids, p, a,r,zr, z)
            #ax3d1.scatter(ar, r,  zr, c=color, s=64, edgecolors='none')
            ax3d1.plot(a[n],
                       r[n],
                       z[n],
                       '.-',
                       color=c,
                       markersize=5,
                       linewidth=1)
            #ax3d1.plot(a[[n[0],n[-1]]],r[[n[0],n[-1]]],zr[[n[0],n[-1]]],'-',  color=[1,0,0], markersize=5,  linewidth=1)

        #plt.pause(0.01)
        #plt.waitforbuttonpress(-1)
        #plt.show()

    ##-###################################################################################
    submission = pd.DataFrame(columns=['event_id', 'hit_id', 'track_id'],
                              data=np.column_stack(([
                                  int(event_id),
                              ] * len(df), df.hit_id.values,
                                                    label))).astype(int)
    score1 = score_event(df, submission)
    score2, results = cpmp_fast_score(df, submission)

    #print results
    max_score = df.weight.sum()
    print('max_score = df.weight.sum() = %0.5f' % max_score)
    print('score1= %0.5f  (%0.5f)' % (score1 * max_score, score1))
    print('score2= %0.5f  (%0.5f)' % (score2, score2 / max_score))

    plt.show()
    print('end')
    exit(0)
Example #30
def study_dbscan_for_tracklet_seeding():

    ## load an event ---
    event_id = '000001029'

    path_to_train = "data/train_1"
    particles = pd.read_csv(path_to_train +
                            '/event%s-particles.csv' % event_id)
    hits = pd.read_csv(path_to_train + '/event%s-hits.csv' % event_id)
    truth = pd.read_csv(path_to_train + '/event%s-truth.csv' % event_id)
    #cells = pd.read_csv(path_to_train + '/event%s-cells.csv'%event_id)

    truth = truth.merge(hits, on=['hit_id'], how='left')
    truth = truth.merge(particles, on=['particle_id'], how='left')

    #--------------------------------------------------------
    df = truth.copy()
    df = df.assign(r=np.sqrt(df.x**2 + df.y**2))
    df = df.assign(d=np.sqrt(df.x**2 + df.y**2 + df.z**2))
    df = df.assign(a=np.arctan2(df.y, df.x))
    df = df.assign(cosa=np.cos(df.a))
    df = df.assign(sina=np.sin(df.a))
    df = df.assign(phi=np.arctan2(df.z, df.r))
    df = df.assign(momentum=np.sqrt(df.px**2 + df.py**2 + df.pz**2))
    df.loc[df.particle_id == 0, 'momentum'] = 0

    df = df.loc[df.z > 500]  # consider dataset subset
    df = df.loc[df.r < 50]
    N = len(df)

    #-------------------------------------------------------
    momentum = df[['momentum']].values.astype(np.float32)
    p = df[['particle_id']].values.astype(np.int64)
    x, y, z, r, a, cosa, sina, phi = df[[
        'x', 'y', 'z', 'r', 'a', 'cosa', 'sina', 'phi'
    ]].values.astype(np.float32).T

    particle_ids = np.unique(p)
    particle_ids = particle_ids[particle_ids != 0]
    num_particle_ids = len(particle_ids)

    # do dbscan here =======================================
    data = np.column_stack([a, z / r * 0.1])

    _, l = dbscan(
        data,
        eps=0.01,
        min_samples=1,
    )

    submission = pd.DataFrame(columns=['event_id', 'hit_id', 'track_id'],
                              data=np.column_stack(([
                                  int(event_id),
                              ] * len(df), df.hit_id.values, l))).astype(int)
    #score1 = score_event(df, submission)
    #print(df)
    #print(submission)
    score2, results = cpmp_fast_score(df, submission)

    #print results
    #max_score = df.weight.sum()
    #print('max_score = df.weight.sum() = %0.5f'%max_score)
    #print('score1= %0.5f  (%0.5f)'%(score1*max_score,score1))
    #print('score2= %0.5f  (%0.5f)'%(score2,score2/max_score))

    ## analyse the results here =============================
    d0, d1 = data.T
    track_ids = np.unique(l)
    track_ids = track_ids[track_ids != 0]
    num_track_ids = len(track_ids)

    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111)
    fig.patch.set_facecolor('white')

    fig1 = plt.figure(figsize=(8, 8))
    ax1 = fig1.add_subplot(111)
    ax1 = Axes3D(fig1)
    fig1.patch.set_facecolor('white')

    def show_ax():
        ax1.set_xlabel('a', fontsize=16)
        ax1.set_ylabel('r', fontsize=16)
        ax1.set_zlabel('z', fontsize=16)
        ax.set_xlabel('a', fontsize=16)
        ax.set_ylabel('z/r', fontsize=16)
        # ax.grid()
        # ax.set_aspect('equal', 'box')

        plt.show()

    ## 0. show data:
    if False:
        ax.clear()
        ax1.clear()
        ax.plot(d0,
                d1,
                '.',
                color=[0.75, 0.75, 0.75],
                markersize=3,
                linewidth=0)
        ax1.plot(a,
                 r,
                 z,
                 '.',
                 color=[0.75, 0.75, 0.75],
                 markersize=3,
                 linewidth=0)
        show_ax()

    ## 1. show GT:
    if True:

        ax.clear()
        ax1.clear()
        ax.plot(d0,
                d1,
                '.',
                color=[0.75, 0.75, 0.75],
                markersize=3,
                linewidth=0)
        ax1.plot(a,
                 r,
                 z,
                 '.',
                 color=[0.75, 0.75, 0.75],
                 markersize=3,
                 linewidth=0)

        ax.set_title('Ground truth')
        ax1.set_title('Ground truth')

        ax1.set_xlabel('a', fontsize=16)
        ax1.set_ylabel('r', fontsize=16)
        ax1.set_zlabel('z', fontsize=16)
        ax.set_xlabel('a', fontsize=16)
        ax.set_ylabel('z/r', fontsize=16)

        for n in range(0, num_particle_ids, 1):
            particle_id = particle_ids[n]
            t = np.where(p == particle_id)[0]
            #if momentum[t[0]]<min_momentum: continue
            t = t[np.argsort(np.fabs(z[t]))]

            if np.fabs(a[t[0]] - a[t[-1]]) > 1: continue
            d = ((x[t[0]] - x[t[-1]])**2 + (y[t[0]] - y[t[-1]])**2 +
                 (z[t[0]] - z[t[-1]])**2)**0.5
            if d < 10: continue

            ###print(n, particle_id)
            color = np.random.uniform(0, 1, (3))

            #ax.clear()
            #ax1.clear()

            ax.plot(data[t, 0],
                    data[t, 1],
                    '.',
                    color=color,
                    markersize=5,
                    linewidth=0)
            ax1.plot(a[t],
                     r[t],
                     z[t],
                     '.-',
                     color=color,
                     markersize=5,
                     linewidth=1)
            #ax1.plot(a[h],r[h], z[h], 'o',  color=[0,0,0], markersize=8,  linewidth=1, mfc='none')

            #ax1.view_init(0, (ax_n*3)%360)
            #ax_n += 1

            #fig1.savefig('/root/share/project/kaggle/cern/results/yy/%05d.png'%ax_n)
            #plt.pause(0.01)
            #plt.waitforbuttonpress(-1)

        #show_ax()

    ## 2. show dbscan prediction:
    if True:
        fig_ = plt.figure(figsize=(8, 8))
        ax_ = fig_.add_subplot(111, )
        fig_.patch.set_facecolor('white')

        fig1_ = plt.figure(figsize=(8, 8))
        ax1_ = fig1_.add_subplot(111, projection='3d')
        fig1_.patch.set_facecolor('white')

        ax_.clear()
        ax1_.clear()
        ax_.plot(d0,
                 d1,
                 '.',
                 color=[0.75, 0.75, 0.75],
                 markersize=3,
                 linewidth=0)
        ax1_.plot(a,
                  r,
                  z,
                  '.',
                  color=[0.75, 0.75, 0.75],
                  markersize=3,
                  linewidth=0)

        ax_.set_title('DBSCAN Prediction')
        ax1_.set_title('DBSCAN Prediction')

        ax1_.set_xlabel('a', fontsize=16)
        ax1_.set_ylabel('r', fontsize=16)
        ax1_.set_zlabel('z', fontsize=16)
        ax_.set_xlabel('a', fontsize=16)
        ax_.set_ylabel('z/r', fontsize=16)

        for n in range(0, num_track_ids, 1):
            track_id = track_ids[n]
            t = np.where(l == track_id)[0]
            #if momentum[t[0]]<min_momentum: continue
            t = t[np.argsort(np.fabs(z[t]))]

            if np.fabs(a[t[0]] - a[t[-1]]) > 1: continue
            d = ((x[t[0]] - x[t[-1]])**2 + (y[t[0]] - y[t[-1]])**2 +
                 (z[t[0]] - z[t[-1]])**2)**0.5
            if d < 10: continue

            ###print(n, track_id)
            color = np.random.uniform(0, 1, (3))

            #ax.clear()
            #ax1.clear()

            ax_.plot(data[t, 0],
                     data[t, 1],
                     '.',
                     color=color,
                     markersize=5,
                     linewidth=0)
            ax1_.plot(a[t],
                      r[t],
                      z[t],
                      '.-',
                      color=color,
                      markersize=5,
                      linewidth=1)
            #ax1.plot(a[h],r[h], z[h], 'o',  color=[0,0,0], markersize=8,  linewidth=1, mfc='none')

            #ax1.view_init(0, (ax_n*3)%360)
            #ax_n += 1

            #fig1.savefig('/root/share/project/kaggle/cern/results/yy/%05d.png'%ax_n)
            #plt.pause(0.01)
            #plt.waitforbuttonpress(-1)

        #show_ax()
        #plt.show()

    ################################################################################################

    # analysis ...
    ## <to be updated> ...

    results = results.assign(
        detected=(results.count_both > results.count_particle)
        & (results.count_both > results.count_track))

    detected = results.loc[results.detected == True]
    missed = results.loc[(results.detected == False) &
                         (results.count_track < results.count_particle * 0.5)]
    fp = results.loc[(results.detected == False)
                     & (results.count_track > results.count_particle * 0.5)]

    detected = np.unique(detected.particle_id.values)
    missed = np.unique(missed.particle_id.values)
    fp = np.unique(fp.track_id.values)

    detected = detected[detected != 0]
    missed = missed[missed != 0]
    fp = fp[fp != 0]

    num_detected = len(detected)
    num_missed = len(missed)
    num_fp = len(fp)

    # show detected, missed, and false-positive tracks
    for (p, q) in [(p, detected), (p, missed), (l, fp)]:
        fig_ = plt.figure(figsize=(8, 8))
        ax_ = fig_.add_subplot(111, )
        fig_.patch.set_facecolor('white')

        fig1_ = plt.figure(figsize=(8, 8))
        ax1_ = fig1_.add_subplot(111, projection='3d')
        fig1_.patch.set_facecolor('white')

        ax_.clear()
        ax1_.clear()
        ax_.plot(d0,
                 d1,
                 '.',
                 color=[0.75, 0.75, 0.75],
                 markersize=3,
                 linewidth=0)
        ax1_.plot(a,
                  r,
                  z,
                  '.',
                  color=[0.75, 0.75, 0.75],
                  markersize=3,
                  linewidth=0)

        ax1_.set_xlabel('a', fontsize=16)
        ax1_.set_ylabel('r', fontsize=16)
        ax1_.set_zlabel('z', fontsize=16)
        ax_.set_xlabel('a', fontsize=16)
        ax_.set_ylabel('z/r', fontsize=16)

        for n in range(0, len(q), 1):
            t = np.where(p == q[n])[0]
            #if momentum[t[0]]<min_momentum: continue
            t = t[np.argsort(np.fabs(z[t]))]

            if np.fabs(a[t[0]] - a[t[-1]]) > 1: continue
            d = ((x[t[0]] - x[t[-1]])**2 + (y[t[0]] - y[t[-1]])**2 +
                 (z[t[0]] - z[t[-1]])**2)**0.5
            if d < 10: continue

            ##print(n, track_id)
            color = np.random.uniform(0, 1, (3))

            #ax.clear()
            #ax1.clear()

            ax_.plot(data[t, 0],
                     data[t, 1],
                     '.',
                     color=color,
                     markersize=5,
                     linewidth=0)
            ax1_.plot(a[t],
                      r[t],
                      z[t],
                      '.-',
                      color=color,
                      markersize=5,
                      linewidth=1)
            #plt.pause(0.01)

    plt.show()

    zz = 0

    exit(0)
Example #31
def make_data(
        a,
        zr,
        z,
        my_layer_id,
        p,
        # a_limit=(1.0,3.0), zr_limit=(4.0,7.0),
        a_limit=(1.0, 2.0),
        zr_limit=(4.0, 5.0),
        depth=6):
    a0, a1 = a_limit
    zr0, zr1 = zr_limit

    idx = np.where((a >= a0) & (a < a1) & (zr >= zr0) & (zr < zr1))[0]
    aa, zzr, zz = a[idx], zr[idx], z[idx] / 1000
    ll = my_layer_id[idx]
    pp = p[idx]

    data3 = np.column_stack((aa, zzr, zz))
    L = len(data3)

    pairs = []
    for d in range(depth - 1):
        i0 = np.where(ll == d)[0]
        i1 = np.where(ll == d + 1)[0]

        L0 = len(i0)
        L1 = len(i1)
        if L0 == 0: continue
        if L1 == 0: continue

        q0 = data3[i0]
        q1 = data3[i1]
        qq0 = np.repeat(q0.reshape(L0, 1, 3), L1, axis=1).reshape(-1, 3)
        qq1 = np.repeat(q1.reshape(1, L1, 3), L0, axis=0).reshape(-1, 3)
        ii0 = np.repeat(i0.reshape(L0, 1), L1, axis=1).reshape(-1, 1)
        ii1 = np.repeat(i1.reshape(1, L1), L0, axis=0).reshape(-1, 1)

        unit = qq1 - qq0
        unit = unit / np.sqrt((unit**2).sum(1, keepdims=True))
        ii = np.zeros((L0 * L1, 1), np.int32)

        pair = np.concatenate((ii0, ii1, ii, qq0, qq1, unit), 1)
        pairs.append(pair)

    P = len(pairs)
    M = 0
    for p in pairs:
        dM = len(p)
        p[:, 2] = np.arange(M, M + dM)
        M += dM

    distance = np.full((M, M), 100, np.float32)  # 100 acts as an 'infinite' distance
    for d in range(P - 1):
        for a in pairs[d]:
            ai0, ai1, ai = a[:3].astype(np.int32)
            ap, aq, aunit = np.split(a[3:], 3)
            if ((np.fabs(aunit[0]) > 0.25) | (np.fabs(aunit[1]) > 0.25)):
                continue

            b = pairs[d + 1]
            i = (np.where((b[:, 0] == ai1)))[0]

            bi = (b[:, 2][i]).astype(np.int32)
            dis = np.sqrt(((b[:, -3:][i] - aunit)**2).sum(1))
            distance[ai, bi] = dis

    print('dbscan')
    # cluster hit pairs whose unit direction vectors are similar, using the
    # precomputed pair-to-pair distance matrix built above
    _, l = dbscan(distance, eps=0.080, min_samples=1, metric='precomputed')
    cluster_id = np.unique(l + 1)
    #cluster_id = cluster_id[cluster_id!=0]
    num_cluster_id = len(cluster_id)

    ## draw clustering results -----------------------------------
    print('draw clustering results')
    pairs_flat = np.vstack(pairs)

    AX3d1.clear()
    AX3d1.scatter(aa,
                  zzr,
                  zz,
                  c=plt.cm.gnuplot(ll / depth),
                  s=16,
                  edgecolors='none')
    plot3d_particles(AX3d1,
                     aa,
                     zzr,
                     zz,
                     zz,
                     pp,
                     subsample=1,
                     color=[0, 0, 0],
                     linewidth=4)

    for id in cluster_id:
        #AX3d1.clear()
        #AX3d1.scatter(aa, zzr,  zz, c=plt.cm.gnuplot( ll/depth ), s=16, edgecolors='none')

        t = np.where(l == id)
        t0 = pairs_flat[t, 0].astype(np.int32).reshape(-1)
        t1 = pairs_flat[t, 1].astype(np.int32).reshape(-1)
        t = np.unique(np.concatenate((t0, t1)))
        #if len(t0)<3: continue

        color = np.random.uniform(0, 1, 3)
        #AX3d1.plot(aa[t0], zzr[t0],  zz[t0],'.-', color=color, markersize=15) #edgecolors=
        #AX3d1.plot(aa[t1], zzr[t1],  zz[t1],'.-', color=color, markersize=15)
        AX3d1.plot(aa[t], zzr[t], zz[t], '.-', color=color, markersize=15)
        #plt.pause(0.01)
        #plt.waitforbuttonpress(-1)
    plt.show()

    return 0
Example #32
def test_dbscan_badargs(args):
    # Test bad argument values: these should all raise ValueErrors
    with pytest.raises(ValueError):
        dbscan(X, **args)
Example #33
    def predict(self, dfh):
        print("size(dfh): {0}".format(len(dfh)))

        if ("rt" not in dfh.columns):
            dfh["rt"] = np.sqrt(dfh['x'].values**2 + dfh['y'].values**2)

        z = dfh['z'].values
        rt = dfh["rt"].values
        a0 = np.arctan2(dfh['y'].values, dfh['x'].values)
        layer_id = dfh['layer_id'].values.astype(np.float32)

        sys.stderr.write("dbscan for each (z,a) shifting\n")
        scan_labels = []
        for (dj, di) in tqdm(product(self.djs, self.dis),
                             total=len(self.djs) * len(self.dis)):
            ar = a0 + di * rt
            zr = (z + dj) / rt * 0.1
            data2 = np.column_stack([ar, zr])

            _, scan_label = dbscan(
                data2,
                eps=0.0025,
                min_samples=1,
            )
            scan_labels.append(scan_label)

        sys.stderr.write("make candidates\n")
        candidates = []
        for scan_label in tqdm(scan_labels):
            l = scan_label
            unique = np.unique(l)
            for u in unique:
                candidate = np.where(l == u)[0]
                candidates.append(candidate)

        print("# of candidates : {0}".format(len(candidates)))
        count = np.array([len(candidate) for candidate in candidates])
        sort = np.argsort(-count)
        candidates = [candidates[s] for s in sort]

        max_label = 1
        N = len(dfh)
        label = np.zeros(N, np.int32)
        count = np.zeros(N, np.int32)

        sys.stderr.write("calculate clustering label from candidates\n")
        for candidate in tqdm(candidates):
            n = candidate
            L = len(n)

            n = n[np.argsort(np.fabs(z[n]))]
            layer_id0 = layer_id[n[:-1]]
            layer_id1 = layer_id[n[1:]]
            ld = layer_id1 - layer_id0
            if np.any(ld > 2): continue

            m = count[n].max()
            if L < m: continue

            count[n] = L
            label[n] = max_label
            max_label += 1

        return label