Example #1
def test_dbscan_core_samples_toy():
    X = [[0], [2], [3], [4], [6], [8], [10]]
    n_samples = len(X)

    for algorithm in ['brute', 'kd_tree', 'ball_tree']:
        # Degenerate case: every sample is a core sample, either with its own
        # cluster or including other close core samples.
        core_samples, labels = dbscan(X, algorithm=algorithm, eps=1,
                                      min_samples=1)
        assert_array_equal(core_samples, np.arange(n_samples))
        assert_array_equal(labels, [0, 1, 1, 1, 2, 3, 4])

        # With eps=1 and min_samples=2 only the 3 samples from the denser area
        # are core samples. All other points are isolated and considered noise.
        core_samples, labels = dbscan(X, algorithm=algorithm, eps=1,
                                      min_samples=2)
        assert_array_equal(core_samples, [1, 2, 3])
        assert_array_equal(labels, [-1, 0, 0, 0, -1, -1, -1])

        # Only the sample in the middle of the dense area is core. Its two
        # neighbors are edge samples. Remaining samples are noise.
        core_samples, labels = dbscan(X, algorithm=algorithm, eps=1,
                                      min_samples=3)
        assert_array_equal(core_samples, [2])
        assert_array_equal(labels, [-1, 0, 0, 0, -1, -1, -1])

        # It's no longer possible to extract core samples with eps=1:
        # everything is noise.
        core_samples, labels = dbscan(X, algorithm=algorithm, eps=1,
                                      min_samples=4)
        assert_array_equal(core_samples, [])
        assert_array_equal(labels, -np.ones(n_samples))
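For reference, the function form used above and the DBSCAN estimator class are interchangeable; a minimal standalone sketch (not part of the original test) checking this for the min_samples=2 case:

import numpy as np
from sklearn.cluster import DBSCAN, dbscan

X_toy = [[0], [2], [3], [4], [6], [8], [10]]
core, labels = dbscan(X_toy, eps=1, min_samples=2)
est = DBSCAN(eps=1, min_samples=2).fit(X_toy)
assert np.array_equal(est.core_sample_indices_, core)   # [1, 2, 3]
assert np.array_equal(est.labels_, labels)              # [-1, 0, 0, 0, -1, -1, -1]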
Example #2
def test_dbscan_sparse():
    core_sparse, labels_sparse = dbscan(sparse.lil_matrix(X),
                                        eps=.8,
                                        min_samples=10)
    core_dense, labels_dense = dbscan(X, eps=.8, min_samples=10)
    assert_array_equal(core_dense, core_sparse)
    assert_array_equal(labels_dense, labels_sparse)
Example #3
def test_dbscan_sparse():
    core_sparse, labels_sparse = dbscan(sparse.lil_matrix(X), eps=.8,
                                        min_samples=10, random_state=0)
    core_dense, labels_dense = dbscan(X, eps=.8, min_samples=10,
                                      random_state=0)
    assert_array_equal(core_dense, core_sparse)
    assert_array_equal(labels_dense, labels_sparse)
Example #4
def test_boundaries():
    # ensure min_samples is inclusive of core point
    core, _ = dbscan([[0], [1]], eps=2, min_samples=2)
    assert_in(0, core)
    # ensure eps is inclusive of circumference
    core, _ = dbscan([[0], [1], [1]], eps=1, min_samples=2)
    assert_in(0, core)
    core, _ = dbscan([[0], [1], [1]], eps=.99, min_samples=2)
    assert_not_in(0, core)
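The same boundary behaviour, restated as a standalone sketch with plain asserts for readers without the nose-style assert_in / assert_not_in helpers:

from sklearn.cluster import dbscan

core, _ = dbscan([[0], [1]], eps=2, min_samples=2)
assert 0 in core           # min_samples counts the point itself
core, _ = dbscan([[0], [1], [1]], eps=1, min_samples=2)
assert 0 in core           # points exactly at distance eps are neighbours
core, _ = dbscan([[0], [1], [1]], eps=.99, min_samples=2)
assert 0 not in core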
Example #5
def test_dbscan_sparse_precomputed():
    D = pairwise_distances(X)
    core_sparse, labels_sparse = dbscan(sparse.lil_matrix(D),
                                        eps=.8,
                                        min_samples=10,
                                        metric='precomputed')
    core_dense, labels_dense = dbscan(X, eps=.8, min_samples=10)
    assert_array_equal(core_dense, core_sparse)
    assert_array_equal(labels_dense, labels_sparse)
Example #6
def test_dbscan_sparse_precomputed():
    D = pairwise_distances(X)
    nn = NearestNeighbors(radius=0.9).fit(X)
    D_sparse = nn.radius_neighbors_graph(mode="distance")
    # Ensure it is sparse not merely on diagonals:
    assert D_sparse.nnz < D.shape[0] * (D.shape[0] - 1)
    core_sparse, labels_sparse = dbscan(D_sparse, eps=0.8, min_samples=10, metric="precomputed")
    core_dense, labels_dense = dbscan(D, eps=0.8, min_samples=10, metric="precomputed")
    assert_array_equal(core_dense, core_sparse)
    assert_array_equal(labels_dense, labels_sparse)
Example #7
def test_dbscan_input_not_modified(use_sparse, metric):
    # test that the input is not modified by dbscan
    X = np.random.RandomState(0).rand(10, 10)
    X = sparse.csr_matrix(X) if use_sparse else X
    X_copy = X.copy()
    dbscan(X, metric=metric)

    if use_sparse:
        assert_array_equal(X.toarray(), X_copy.toarray())
    else:
        assert_array_equal(X, X_copy)
Example #8
def test_dbscan_sparse_precomputed():
    D = pairwise_distances(X)
    nn = NearestNeighbors(radius=.9).fit(X)
    D_sparse = nn.radius_neighbors_graph(mode='distance')
    # Ensure it is sparse not merely on diagonals:
    assert D_sparse.nnz < D.shape[0] * (D.shape[0] - 1)
    core_sparse, labels_sparse = dbscan(D_sparse,
                                        eps=.8,
                                        min_samples=10,
                                        metric='precomputed')
    core_dense, labels_dense = dbscan(D, eps=.8, min_samples=10,
                                      metric='precomputed')
    assert_array_equal(core_dense, core_sparse)
    assert_array_equal(labels_dense, labels_sparse)
Example #9
def test_dbscan_sparse_precomputed_different_eps():
    # test that precomputed neighbors graph is filtered if computed with
    # a radius larger than DBSCAN's eps.
    lower_eps = 0.2
    nn = NearestNeighbors(radius=lower_eps).fit(X)
    D_sparse = nn.radius_neighbors_graph(X, mode='distance')
    dbscan_lower = dbscan(D_sparse, eps=lower_eps, metric='precomputed')

    higher_eps = lower_eps + 0.7
    nn = NearestNeighbors(radius=higher_eps).fit(X)
    D_sparse = nn.radius_neighbors_graph(X, mode='distance')
    dbscan_higher = dbscan(D_sparse, eps=lower_eps, metric='precomputed')

    assert_array_equal(dbscan_lower[0], dbscan_higher[0])
    assert_array_equal(dbscan_lower[1], dbscan_higher[1])
Example #10
def test_dbscan_balltree():
    # Tests the DBSCAN algorithm with balltree for neighbor calculation.
    eps = 0.8
    min_samples = 10

    D = pairwise_distances(X)
    core_samples, labels = dbscan(D, metric="precomputed", eps=eps, min_samples=min_samples)

    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, n_clusters)

    db = DBSCAN(p=2.0, eps=eps, min_samples=min_samples, algorithm="ball_tree")
    labels = db.fit(X).labels_

    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)

    db = DBSCAN(p=2.0, eps=eps, min_samples=min_samples, algorithm="kd_tree")
    labels = db.fit(X).labels_

    n_clusters_3 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_3, n_clusters)

    db = DBSCAN(p=1.0, eps=eps, min_samples=min_samples, algorithm="ball_tree")
    labels = db.fit(X).labels_

    n_clusters_4 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_4, n_clusters)

    db = DBSCAN(leaf_size=20, eps=eps, min_samples=min_samples, algorithm="ball_tree")
    labels = db.fit(X).labels_

    n_clusters_5 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_5, n_clusters)
Example #11
def angle_predict(hits, m, i, rz_shift, eps, weights):
    # Unroll the helix: shift the azimuthal angle by an amount that grows with
    # the radius (the /180*3.141 factor is a pi/180 degree-to-radian conversion).
    aa = hits.a + m * (hits.r + 0.000005 * (hits.r ** 2)) / 1000 * (i / 2) / 180 * 3.141

    hits['f0'] = np.sin(aa)
    hits['f1'] = np.cos(aa)

    hits_b = hits[hits.type == 'b']['hit_id']
    hits_c = hits[hits.type == 'c']['hit_id']

    # Standardize the features, then weight them differently for type 'b' and
    # type 'c' hits (rows are looked up by hit_id - 1).
    ss = StandardScaler()
    X = ss.fit_transform(np.column_stack([hits.f0.values, hits.f1.values,
                                          hits.z1.values, hits.z2.values,
                                          hits.xr.values, hits.yr.values]))

    X_b = np.multiply(np.vstack([X[ex - 1] for ex in hits_b.values]), weights[0])
    X_c = np.multiply(np.vstack([X[ex - 1] for ex in hits_c.values]), weights[1])

    Xw = np.zeros(X.shape)

    Xw[hits_b.values - 1] = X_b[range(len(hits_b.values))]
    Xw[hits_c.values - 1] = X_c[range(len(hits_c.values))]

    # Widen eps slightly with the unrolling step index.
    eps = eps + (i * 0.000005)

    _, labels = dbscan(Xw, eps=eps, min_samples=1, algorithm='auto', n_jobs=4)

    # Per-hit cluster sizes; cluster 0 and implausibly large clusters are zeroed
    # out so they are not treated as track candidates.
    unique, reverse, count = np.unique(labels, return_counts=True, return_inverse=True)
    c = count[reverse]
    c[np.where(labels == 0)] = 0
    if abs(rz_shift) < 0.1:
        c[np.where(c > 20)] = 0
    else:
        c[np.where(c > 8)] = 0
    return (labels, c)
Example #12
def get_features(sub, cluster_size=10):
    """
    Input: dataframe with hits long tracks
    Output: array with features of long track
    """
    hitst = sub.copy()
    X = np.column_stack([
        hitst.x.values, hitst.y.values, hitst.z.values,
        hitst.track_id.values * 1000000
    ])
    _, hitst['labels'] = dbscan(X,
                                eps=cluster_size,
                                min_samples=1,
                                algorithm='ball_tree',
                                metric='euclidean')
    gp = hitst.groupby('track_id').agg({
        'hit_id': 'count',
        'labels': 'nunique',
        'volume_id': 'min',
        'x': ['min', 'max', 'var'],
        'y': ['min', 'max', 'var'],
        'z': ['min', 'max', 'var', 'mean']
    })

    gp.columns = ["".join(t) for t in gp.columns.ravel()]
    gp = gp.rename(
        columns={
            'hit_idcount': 'nhits',
            'labelsnunique': 'nclusters',
            'volume_idmin': 'svolume'
        }).reset_index()
    gp['nhitspercluster'] = gp.nhits / gp.nclusters
    return gp
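A standalone sketch (the toy coordinates are an assumption, not taken from the original code) of the trick used above: appending track_id * 1000000 as an extra coordinate keeps hits from different tracks far apart, so DBSCAN can only form clusters within a single track:

import numpy as np
from sklearn.cluster import dbscan

xyz = np.array([[0., 0., 0.], [1., 1., 1.], [100., 100., 100.]])
track_id = np.array([1, 1, 2])
X_toy = np.column_stack([xyz, track_id * 1000000])
_, labels = dbscan(X_toy, eps=10, min_samples=1)
print(labels.tolist())   # [0, 0, 1] -- the two tracks never share a cluster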
Example #13
def test_dbscan_callable():
    # Tests the DBSCAN algorithm with a callable metric.
    # Parameters chosen specifically for this task.
    # Different eps to other test, because distance is not normalised.
    eps = 0.8
    min_samples = 10
    # metric is the function reference, not the string key.
    metric = distance.euclidean
    # Compute DBSCAN
    # parameters chosen for task
    core_samples, labels = dbscan(X,
                                  metric=metric,
                                  eps=eps,
                                  min_samples=min_samples,
                                  algorithm='ball_tree')

    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, n_clusters)

    db = DBSCAN(metric=metric,
                eps=eps,
                min_samples=min_samples,
                algorithm='ball_tree')
    labels = db.fit(X).labels_

    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)
Example #14
    def predict(self, dfh):
        print("len(dfh) : {0}".format(len(dfh)))

        if ("rt" not in dfh.columns):
            dfh["rt"] = np.sqrt(dfh['x'].values**2 + dfh['y'].values**2)

        z = dfh['z'].values
        rt = dfh["rt"].values
        r = np.sqrt(dfh['x']**2 + dfh['y']**2 + dfh['z']**2)
        a0 = np.arctan2(dfh['y'].values, dfh['x'].values)
        layer_id = dfh['layer_id'].values.astype(np.float32)

        sys.stderr.write("dbscan for each parameters\n")
        scan_labels_list = []
        for (dj, di) in tqdm(product(self.djs, self.dis),
                             total=len(self.djs) * len(self.dis)):
            ar = a0 + di * rt
            zr = (z + dj) / rt * 0.1
            if (self.param_type == 0):
                params = [ar, zr]
            elif (self.param_type == 1):
                params = [np.sin(ar), np.cos(ar), zr * 10, 1 / (10 * zr)]
            elif (self.param_type == 2):
                params = [np.sin(ar), np.cos(ar), zr]
            else:
                raise RuntimeError("invalid param_type.")

            if (self.weight is None):
                w = np.array([1.0 for _ in params])
            else:
                w = np.array(self.weight)

            ss = StandardScaler()
            data1 = ss.fit_transform(np.column_stack(params))
            data2 = w[np.newaxis, :] * data1

            _, scan_label = dbscan(
                data2,
                eps=self.eps,
                min_samples=1,
            )
            scan_labels_list.append(scan_label)

        sys.stderr.write("clustering\n")
        dfh["s1"] = dfh.hit_id
        dfh["N1"] = 1
        for scan_labels in scan_labels_list:
            dfh["s2"] = scan_labels
            dfh["N2"] = dfh.groupby('s2')['s2'].transform('count')
            maxs1 = np.max(dfh.s1)
            dfh.s1 = np.where((dfh.N2 > dfh.N1) & (dfh.N2 < 20),
                              dfh.s2 + maxs1, dfh.s1)
            dfh['s1'] = dfh['s1'].astype('int64')
            dfh['N1'] = dfh.groupby('s1')['s1'].transform('count')
        labels = dfh['s1']

        return labels
Example #15
    def predict(self, hits, weights):
        x = hits.x.values
        y = hits.y.values
        z = self.rz_scale * hits.z.values

        r = np.sqrt(x**2 + y**2)
        d = np.sqrt(x**2 + y**2 + z**2)
        a = np.arctan2(y, x)
        zr = z / r
        dr = d / r
        hits['d'] = d

        w0, w1, w2, w3, w4 = weights

        ss = StandardScaler()

        results = []
        dzi = -0.00010
        for step in [11]:  #range(21): #0.00060/121/-60
            dz = dzi + (step * 0.00001)
            f0 = w0 * (a + (dz * z * np.sign(z)))
            f1 = w1 * (zr)
            f2 = w2 * (f0 / zr)
            f3 = w3 * (1 / zr)
            f4 = w4 * (f2 + f3)

            X = ss.fit_transform(np.column_stack([f0, f1, f2, f3, f4]))

            eps = self.eps - (abs(step - 10) * 0.000015)

            _, labels = dbscan(X,
                               eps=eps,
                               min_samples=1,
                               algorithm='auto',
                               n_jobs=4)

            unique, reverse, count = np.unique(labels,
                                               return_counts=True,
                                               return_inverse=True)
            c = count[reverse]
            c[np.where(labels == 0)] = 0
            c[np.where(c > 20)] = 0
            results.append((labels, c))

        labels, counts = results[0]

        # Merge the per-step results: a hit keeps the label from the step that
        # placed it in the largest valid cluster.
        for i in range(1, len(results)):
            l, c = results[i]
            idx = np.where((c - counts > 0))[0]
            labels[idx] = l[idx] + labels.max()
            counts[idx] = c[idx]

        return labels
Example #16
def find_labels(params):
    hits, dz = params
    a = hits['phi'].values
    z = hits['z'].values
    zr = hits['zr'].values
    # Unroll the helix angle in proportion to |z|, then cluster the hits in the
    # standardized (cos, sin, z/r) feature space.
    aa = a + np.sign(z) * dz * z

    f0 = np.cos(aa)
    f1 = np.sin(aa)
    f2 = zr
    X = StandardScaler().fit_transform(np.column_stack([f0, f1, f2]))

    _, l = dbscan(X, eps=0.0045, min_samples=1, n_jobs=4)
    # Shift labels so that track ids start at 1 instead of 0.
    return l + 1
Example #17
def seed_tracks(event_id, df, start_d, end_d, ax):
    seed = df.loc[df.d > start_d]
    seed = seed.loc[seed.d < end_d]
    N = len(seed)

    p = seed[['particle_id']].values.astype(np.int64)
    x, y, z, r, a, cosa, sina, phi = seed[[
        'x', 'y', 'z', 'r', 'a', 'cosa', 'sina', 'phi'
    ]].values.astype(np.float32).T

    particle_ids = np.unique(p)
    particle_ids = particle_ids[particle_ids != 0]
    num_particle_ids = len(particle_ids)

    # do dbscan here =======================================
    data = np.column_stack([a, z / r * 0.1])

    _, l = dbscan(
        data,
        eps=0.01,
        min_samples=1,
    )

    #print(len(truth))
    #print(len(seed))
    #print(len(submission))
    #print(len(l))

    seed['l'] = pd.Series(l, index=seed.index)
    #print(seed)
    submission = pd.DataFrame(columns=['event_id', 'hit_id', 'track_id'],
                              data=np.column_stack(([
                                  int(event_id),
                              ] * len(seed), seed.hit_id.values,
                                                    l))).astype(int)

    score = score_event_fast(seed, submission)
    print(score)

    predicted_tracks, counts = np.unique(l, return_counts=True)
    predicted_tracks = predicted_tracks[counts > 1]

    for predicted_track in predicted_tracks[::100]:
        track_hits = seed[seed.l == predicted_track]
        ax.plot(xs=track_hits.a, ys=track_hits.r, zs=track_hits.z)
Example #18
def get_base_partitioning(distance_matrix, eps=0.5, min_samples=2):
  """ Gets the base partitioning from the distance matrix using DBScan
    algorithm.

  Args:
    distance_matrix: a list of lists with the distances of references.

  Returns:
    A list of integers from 0 to k - 1, each one representing a block for the
      reference represented by the index.
  """
  labels = dbscan(np.array(distance_matrix), metric='precomputed', eps=eps, 
      min_samples=min_samples)
  next_label = max(labels[1]) + 1
  for i in range(len(labels[1])):
    if labels[1][i] == -1:
      labels[1][i] = next_label
      next_label += 1
  return labels[1].tolist(), number_of_clusters(labels[1])
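A minimal standalone sketch (the toy distance matrix is an assumption, not from the original code) of the relabeling step in get_base_partitioning above: DBSCAN noise points (label -1) are turned into singleton blocks:

import numpy as np
from sklearn.cluster import dbscan

D_toy = np.array([[0.0, 0.1, 0.9],
                  [0.1, 0.0, 0.9],
                  [0.9, 0.9, 0.0]])   # the third reference is far from the first two
_, labels = dbscan(D_toy, metric='precomputed', eps=0.5, min_samples=2)
next_label = labels.max() + 1
for i in range(len(labels)):
    if labels[i] == -1:
        labels[i] = next_label
        next_label += 1
print(labels.tolist())   # [0, 0, 1]: the isolated reference became its own block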
Example #19
def test_dbscan_similarity():
    # Tests the DBSCAN algorithm with a similarity array.
    # Parameters chosen specifically for this task.
    eps = 0.15
    min_samples = 10
    # Compute similarities
    D = distance.squareform(distance.pdist(X))
    D /= np.max(D)
    # Compute DBSCAN
    core_samples, labels = dbscan(D, metric="precomputed", eps=eps, min_samples=min_samples)
    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - (1 if -1 in labels else 0)

    assert_equal(n_clusters_1, n_clusters)

    db = DBSCAN(metric="precomputed", eps=eps, min_samples=min_samples)
    labels = db.fit(D).labels_

    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)
Example #20
def test_dbscan_feature():
    # Tests the DBSCAN algorithm with a feature vector array.
    # Parameters chosen specifically for this task.
    # Different eps to other test, because distance is not normalised.
    eps = 0.8
    min_samples = 10
    metric = "euclidean"
    # Compute DBSCAN
    # parameters chosen for task
    core_samples, labels = dbscan(X, metric=metric, eps=eps, min_samples=min_samples)

    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, n_clusters)

    db = DBSCAN(metric=metric, eps=eps, min_samples=min_samples)
    labels = db.fit(X).labels_

    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)
Example #21
def test_dbscan_balltree():
    # Tests the DBSCAN algorithm with balltree for neighbor calculation.
    eps = 0.8
    min_samples = 10

    D = pairwise_distances(X)
    core_samples, labels = dbscan(D,
                                  metric="precomputed",
                                  eps=eps,
                                  min_samples=min_samples)

    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, n_clusters)

    db = DBSCAN(p=2.0, eps=eps, min_samples=min_samples, algorithm='ball_tree')
    labels = db.fit(X).labels_

    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)

    db = DBSCAN(p=2.0, eps=eps, min_samples=min_samples, algorithm='kd_tree')
    labels = db.fit(X).labels_

    n_clusters_3 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_3, n_clusters)

    db = DBSCAN(p=1.0, eps=eps, min_samples=min_samples, algorithm='ball_tree')
    labels = db.fit(X).labels_

    n_clusters_4 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_4, n_clusters)

    db = DBSCAN(leaf_size=20,
                eps=eps,
                min_samples=min_samples,
                algorithm='ball_tree')
    labels = db.fit(X).labels_

    n_clusters_5 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_5, n_clusters)
Example #22
def test_dbscan_callable():
    # Tests the DBSCAN algorithm with a callable metric.
    # Parameters chosen specifically for this task.
    # Different eps to other test, because distance is not normalised.
    eps = 0.8
    min_samples = 10
    # metric is the function reference, not the string key.
    metric = distance.euclidean
    # Compute DBSCAN
    # parameters chosen for task
    core_samples, labels = dbscan(X, metric=metric, eps=eps, min_samples=min_samples, algorithm="ball_tree")

    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, n_clusters)

    db = DBSCAN(metric=metric, eps=eps, min_samples=min_samples, algorithm="ball_tree")
    labels = db.fit(X).labels_

    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)
Example #23
def get_base_partitioning(distance_matrix, eps=0.5, min_samples=2):
    """ Gets the base partitioning from the distance matrix using DBScan
    algorithm.

  Args:
    distance_matrix: a list of lists with the distances of references.

  Returns:
    A list of integers from 0 to k - 1, each one representing a block for the
      reference represented by the index.
  """
    labels = dbscan(np.array(distance_matrix),
                    metric='precomputed',
                    eps=eps,
                    min_samples=min_samples)
    next_label = max(labels[1]) + 1
    for i in range(len(labels[1])):
        if labels[1][i] == -1:
            labels[1][i] = next_label
            next_label += 1
    return labels[1].tolist(), number_of_clusters(labels[1])
Example #24
def test_dbscan_feature():
    # Tests the DBSCAN algorithm with a feature vector array.
    # Parameters chosen specifically for this task.
    # Different eps to other test, because distance is not normalised.
    eps = 0.8
    min_samples = 10
    metric = 'euclidean'
    # Compute DBSCAN
    # parameters chosen for task
    core_samples, labels = dbscan(X, metric=metric, eps=eps,
                                  min_samples=min_samples)

    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, n_clusters)

    db = DBSCAN(metric=metric, eps=eps, min_samples=min_samples)
    labels = db.fit(X).labels_

    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)
Example #25
def test_dbscan_similarity():
    # Tests the DBSCAN algorithm with a similarity array.
    # Parameters chosen specifically for this task.
    eps = 0.15
    min_samples = 10
    # Compute similarities
    D = distance.squareform(distance.pdist(X))
    D /= np.max(D)
    # Compute DBSCAN
    core_samples, labels = dbscan(D, metric="precomputed", eps=eps,
                                  min_samples=min_samples)
    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - (1 if -1 in labels else 0)

    assert_equal(n_clusters_1, n_clusters)

    db = DBSCAN(metric="precomputed", eps=eps, min_samples=min_samples)
    labels = db.fit(D).labels_

    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)
Example #26
def img_association(network, propagate_loader, min_sample=4, eps=0,
                    rerank=False, k1=20, k2=6, intra_id_reinitialize=False):

    network.eval()
    print('Start Inference...')
    features = []
    global_labels = []
    all_cams = []

    with torch.no_grad():
        for c, data in enumerate(propagate_loader):
            images = data[0]
            g_label = data[3]
            cam = data[4]

            embed_feat = network(images)
            features.append(embed_feat.cpu())

            global_labels.append(g_label)
            all_cams.append(cam)

    features = torch.cat(features, dim=0).numpy()
    global_labels = torch.cat(global_labels, dim=0).numpy()
    all_cams = torch.cat(all_cams, dim=0).numpy()
    print('  features: shape= {}'.format(features.shape))

    # if needed, average camera-style transferred image features
    new_features = []
    new_cams = []
    for glab in np.unique(global_labels):
        idx = np.where(global_labels == glab)[0]
        new_features.append(np.mean(features[idx], axis=0))
        new_cams.append(all_cams[idx])

    new_features = np.array(new_features)
    new_cams = np.array(new_cams).squeeze()
    del features, all_cams

    # compute distance W
    new_features = new_features / np.linalg.norm(new_features, axis=1, keepdims=True)  # l2-normalize
    if rerank:
        W = faiss_compute_jaccard_dist(torch.from_numpy(new_features), k1=k1, k2=k2)
    else:
        W = cdist(new_features, new_features, 'euclidean')
    print('  distance matrix: shape= {}'.format(W.shape))

    # self-similarity for association
    print('  perform image grouping...')
    _, updated_label = dbscan(W, eps=eps, min_samples=min_sample, metric='precomputed', n_jobs=8)
    print('  eps in cluster: {:.3f}'.format(eps))
    print('  updated_label: num_class= {}, {}/{} images are associated.'
          .format(updated_label.max() + 1, len(updated_label[updated_label >= 0]), len(updated_label)))

    if intra_id_reinitialize:
        print('re-computing initialized intra-ID feature...')
        intra_id_features = []
        intra_id_labels = []
        for cc in np.unique(new_cams):
            percam_ind = np.where(new_cams == cc)[0]
            percam_feature = new_features[percam_ind, :]
            percam_label = updated_label[percam_ind]
            percam_class_num = len(np.unique(percam_label[percam_label >= 0]))
            percam_id_feature = np.zeros((percam_class_num, percam_feature.shape[1]), dtype=np.float32)
            cnt = 0
            for lbl in np.unique(percam_label):
                if lbl >= 0:
                    ind = np.where(percam_label == lbl)[0]
                    id_feat = np.mean(percam_feature[ind], axis=0)
                    percam_id_feature[cnt, :] = id_feat
                    intra_id_labels.append(lbl)
                    cnt += 1
            percam_id_feature = percam_id_feature / np.linalg.norm(percam_id_feature, axis=1, keepdims=True)
            intra_id_features.append(torch.from_numpy(percam_id_feature))
        return updated_label, intra_id_features
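A minimal standalone sketch (random features and the eps value are assumptions) of the grouping step above, mirroring the non-rerank branch: cluster l2-normalized features through their pairwise euclidean distances with a precomputed-metric DBSCAN:

import numpy as np
from scipy.spatial.distance import cdist
from sklearn.cluster import dbscan

feats = np.random.RandomState(0).rand(50, 8)
feats = feats / np.linalg.norm(feats, axis=1, keepdims=True)   # l2-normalize
W = cdist(feats, feats, 'euclidean')
_, updated_label = dbscan(W, eps=0.5, min_samples=4, metric='precomputed')
print('num_class =', updated_label.max() + 1)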
Example #27
def copac(X,
          k=10,
          mu=5,
          eps=0.5,
          alpha=0.85,
          metric='euclidean',
          metric_params=None,
          algorithm='auto',
          leaf_size=30,
          p=None,
          n_jobs=1,
          sample_weight=None):
    """Perform COPAC clustering from vector array.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        A feature array.
    k : int, optional, default=10
        Size of local neighborhood for local correlation dimensionality.
        The paper suggests k >= 3 * n_features.
    mu : int, optional, default=5
        Minimum number of points in a cluster, with mu <= k.
    eps : float, optional, default=0.5
        Neighborhood predicate, so that neighbors are closer than `eps`.
    alpha : float in ]0,1[, optional, default=0.85
        Threshold of how much variance needs to be explained by Eigenvalues.
        Assumed to be robust in range 0.8 <= alpha <= 0.9 [see Ref.]
    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by sklearn.metrics.pairwise.pairwise_distances
        for its metric parameter.
        If metric is "precomputed", `X` is assumed to be a distance matrix and
        must be square.
    metric_params : dict, optional
        Additional keyword arguments for the metric function.
    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        The algorithm to be used by the scikit-learn NearestNeighbors module
        to compute pointwise distances and find nearest neighbors.
        See NearestNeighbors module documentation for details.
    leaf_size : int, optional (default = 30)
        Leaf size passed to BallTree or cKDTree. This can affect the speed
        of the construction and query, as well as the memory required
        to store the tree. The optimal value depends
        on the nature of the problem.
    p : float, optional
        The power of the Minkowski metric to be used to calculate distance
        between points.
    n_jobs : int, optional, default=1
        Number of parallel processes. Use all cores with n_jobs=-1.
    sample_weight : None
        Currently ignored

    Returns
    -------
    labels : array [n_samples]
        Cluster labels for each point. Noisy samples are given the label -1.

    References
    ----------
    Achtert, E., Böhm, C., Kriegel, H.-P., Kröger, P., Zimek, A.:
    Robust, Complete, and Efficient Correlation Clustering.
    In Proceedings of the Seventh SIAM International Conference
    on Data Mining, April 26-28, 2007, Minneapolis, Minnesota,
    USA (2007), pp. 413–418.
    """
    X = check_array(X)
    n, d = X.shape
    y = -np.ones(n, dtype=int)
    if n_jobs == -1:
        n_jobs = cpu_count()

    # Calculating M^ just once requires more memory, but saves computation
    lambda_ = np.zeros(n, dtype=int)
    M_hat = list()

    # Get nearest neighbors
    nn = NearestNeighbors(n_neighbors=k,
                          metric=metric,
                          algorithm=algorithm,
                          leaf_size=leaf_size,
                          metric_params=metric_params,
                          p=p,
                          n_jobs=n_jobs)
    nn.fit(X)
    knns = nn.kneighbors(return_distance=False)
    for P, knn in enumerate(knns):
        N_P = X[knn]

        # Covariance matrix of the local correlation neighborhood N_P
        Sigma = np.cov(N_P[:, :], rowvar=False, ddof=0)

        # Decompose spsd matrix, and sort Eigenvalues descending
        E, V = LA.eigh(Sigma)
        E = np.sort(E)[::-1]

        # Local correlation dimension
        explanation_portion = np.cumsum(E) / E.sum()
        lambda_P = np.searchsorted(explanation_portion, alpha, side='left')
        lambda_P += 1
        lambda_[P] = lambda_P
        # Correlation distance matrix
        E_hat = (np.arange(1, d + 1) > lambda_P).astype(int)
        M_hat.append(V @ np.diag(E_hat) @ V.T)

    # Group points by corr. dim.
    argsorted = np.argsort(lambda_)
    edges, _ = np.histogram(lambda_[argsorted], bins=np.arange(1, d + 2))
    Ds = np.split(argsorted, np.cumsum(edges))
    # Loop over partitions according to local corr. dim.
    max_label = 0
    used_y = np.zeros_like(y, dtype=int)
    for D in Ds:
        n_D = D.shape[0]
        cdist_P = -np.ones(n_D * (n_D - 1) // 2, dtype=float)
        cdist_Q = -np.ones((n_D, n_D), dtype=float)
        start = 0
        # Calculate triu part of distance matrix
        for i in range(0, n_D - 1):
            p = D[i]
            # Vectorized inner loop
            q = D[i + 1:n_D]
            stop = start + n_D - i - 1
            cdist_P[start:stop] = _cdist(X[p], X[q], M_hat[p])
            start = stop
        # Calculate tril part of distance matrix
        for i in range(1, n_D):
            q = D[i]
            p = D[0:i]
            cdist_Q[i, :i] = _cdist(X[q], X[p], M_hat[q])
        # Extract tril to 1D array
        # TODO simplify...
        cdist_Q = cdist_Q.T[np.triu_indices_from(cdist_Q, k=1)]
        cdist = np.block([[cdist_P], [cdist_Q]])
        # Square root of the higher value of cdist_P, cdist_Q
        cdist = np.sqrt(cdist.max(axis=0))

        # Perform DBSCAN with full distance matrix
        cdist = squareform(cdist)
        clust = dbscan(X=cdist,
                       eps=eps,
                       min_samples=mu,
                       metric='precomputed',
                       n_jobs=n_jobs)
        _, labels = clust
        # Each DBSCAN run is unaware of previous ones,
        # so we need to keep track of previous cluster IDs
        y_D = labels + max_label
        new_labels = np.unique(labels[labels >= 0]).size
        max_label += new_labels
        # Set cluster labels in `y`
        y[D] = y_D
        used_y[D] += 1
    assert np.all(used_y == 1), "Not all samples were handled exactly once!"
    return y
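The _cdist helper used above is not included in this example; a plausible sketch of it (an assumption based on the squared correlation distance from the COPAC paper, not the original implementation):

import numpy as np

def _cdist(p_vec, Q, M_hat_p):
    # Squared correlation distance from point p_vec to each row of Q, measured
    # under the projection matrix M_hat_p associated with p_vec.
    diff = Q - p_vec[np.newaxis, :]            # shape (n_q, d)
    return ((diff @ M_hat_p) * diff).sum(axis=1)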
Example #28
def test_weighted_dbscan():
    # ensure sample_weight is validated
    assert_raises(ValueError, dbscan, [[0], [1]], sample_weight=[2])
    assert_raises(ValueError, dbscan, [[0], [1]], sample_weight=[2, 3, 4])

    # ensure sample_weight has an effect
    assert_array_equal([], dbscan([[0], [1]], sample_weight=None,
                                  min_samples=6)[0])
    assert_array_equal([], dbscan([[0], [1]], sample_weight=[5, 5],
                                  min_samples=6)[0])
    assert_array_equal([0], dbscan([[0], [1]], sample_weight=[6, 5],
                                   min_samples=6)[0])
    assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[6, 6],
                                      min_samples=6)[0])

    # points within eps of each other:
    assert_array_equal([0, 1], dbscan([[0], [1]], eps=1.5,
                                      sample_weight=[5, 1], min_samples=6)[0])
    # and effect of non-positive and non-integer sample_weight:
    assert_array_equal([], dbscan([[0], [1]], sample_weight=[5, 0],
                                  eps=1.5, min_samples=6)[0])
    assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[5.9, 0.1],
                                      eps=1.5, min_samples=6)[0])
    assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[6, 0],
                                      eps=1.5, min_samples=6)[0])
    assert_array_equal([], dbscan([[0], [1]], sample_weight=[6, -1],
                                  eps=1.5, min_samples=6)[0])

    # for non-negative sample_weight, cores should be identical to repetition
    rng = np.random.RandomState(42)
    sample_weight = rng.randint(0, 5, X.shape[0])
    core1, label1 = dbscan(X, sample_weight=sample_weight)
    assert_equal(len(label1), len(X))

    X_repeated = np.repeat(X, sample_weight, axis=0)
    core_repeated, label_repeated = dbscan(X_repeated)
    core_repeated_mask = np.zeros(X_repeated.shape[0], dtype=bool)
    core_repeated_mask[core_repeated] = True
    core_mask = np.zeros(X.shape[0], dtype=bool)
    core_mask[core1] = True
    assert_array_equal(np.repeat(core_mask, sample_weight), core_repeated_mask)

    # sample_weight should work with precomputed distance matrix
    D = pairwise_distances(X)
    core3, label3 = dbscan(D, sample_weight=sample_weight,
                           metric='precomputed')
    assert_array_equal(core1, core3)
    assert_array_equal(label1, label3)

    # sample_weight should work with estimator
    est = DBSCAN().fit(X, sample_weight=sample_weight)
    core4 = est.core_sample_indices_
    label4 = est.labels_
    assert_array_equal(core1, core4)
    assert_array_equal(label1, label4)

    est = DBSCAN()
    label5 = est.fit_predict(X, sample_weight=sample_weight)
    core5 = est.core_sample_indices_
    assert_array_equal(core1, core5)
    assert_array_equal(label1, label5)
    assert_array_equal(label1, est.labels_)
Example #29
def study_dbscan_for_tracklet_seeding():

    ## load an event ---
    event_id = '000001029'

    data_dir = '/root/share/project/kaggle/cern/data/__download__/train_100_events'
    #detectors = pd.read_csv('/root/share/project/kaggle/cern/data/__download__/detectors.csv')
    particles = pd.read_csv(data_dir + '/event%s-particles.csv' % event_id)
    hits = pd.read_csv(data_dir + '/event%s-hits.csv' % event_id)
    truth = pd.read_csv(data_dir + '/event%s-truth.csv' % event_id)
    #cells = pd.read_csv(data_dir + '/event%s-cells.csv'%event_id)

    truth = truth.merge(hits, on=['hit_id'], how='left')
    truth = truth.merge(particles, on=['particle_id'], how='left')

    #--------------------------------------------------------
    df = truth.copy()
    df = df.assign(r=np.sqrt(df.x**2 + df.y**2))
    df = df.assign(d=np.sqrt(df.x**2 + df.y**2 + df.z**2))
    df = df.assign(a=np.arctan2(df.y, df.x))
    df = df.assign(cosa=np.cos(df.a))
    df = df.assign(sina=np.sin(df.a))
    df = df.assign(phi=np.arctan2(df.z, df.r))
    df = df.assign(momentum=np.sqrt(df.px**2 + df.py**2 + df.pz**2))
    df.loc[df.particle_id == 0, 'momentum'] = 0

    #df = df.loc[df.z>500] # consider dataset subset
    #df = df.loc[df.r<50 ] ## 0.04397/0.04750  (0.92569)

    df = df.loc[df.z > 500]
    df = df.loc[(df.r > 50) & (df.r < 100)]  ## 0.05259/0.05808  (0.90551)

    #df = df.loc[df.z>500]
    #df = df.loc[df.r<100] ## 0.09417/0.10557  (0.89195)

    #df = df.loc[(df.a>0) & (df.a<0.5)]
    #df = df.loc[(df.a>0) & (df.a<1)]

    # df = df.loc[df.z>500] # consider dataset subset
    # df = df.loc[(df.r>50) & (df.r<100)]

    #df = df.loc[(df.z>0) &(df.z<500)]
    #df = df.loc[df.r<200 ]
    #df = df.loc[(df.a>0) & (df.a<0.5)]
    #df = df.loc[(df.z>df.r)]
    #df = df.loc[(df.r>50) & (df.r<100) ]

    #-------------------------------------------------------
    N = len(df)

    layer_id = df['layer_id'].values.astype(np.float32)
    momentum = df['momentum'].values.astype(np.float32)
    p = df[['particle_id']].values.astype(np.int64)
    x, y, z, r, a, cosa, sina, phi = df[[
        'x', 'y', 'z', 'r', 'a', 'cosa', 'sina', 'phi'
    ]].values.astype(np.float32).T

    particle_ids = np.unique(p)
    particle_ids = particle_ids[particle_ids != 0]
    num_particle_ids = len(particle_ids)

    # do xxx =======================================
    #color = plt.cm.hsv( (z-z.min()) / (z.max()-z.min()))
    color = plt.cm.hsv(
        (layer_id - layer_id.min()) / (layer_id.max() + 1 - layer_id.min()))

    plot3d_particles(ax3d1, particle_ids, p, a, r, z, z)
    ax3d1.scatter(a, r, z, c=color, s=64, edgecolors='none')
    #plt.show()

    dj = 0
    di = 0
    EPS = 1e-12
    #if 1:

    candidates = []
    for dj in np.arange(-20, 20 + EPS, 10):
        for di in np.arange(-0.003, 0.003 + EPS, 0.00025):
            ar = a + di * r
            zr = (z + dj) / r * 0.1
            data2 = np.column_stack([ar, zr])

            _, l = dbscan(
                data2,
                eps=0.0025,
                min_samples=1,
            )
            track_ids = np.unique(l)
            track_ids = track_ids[track_ids != 0]
            neighbour = [np.where(l == t)[0] for t in track_ids]

            unique, inverse, c = np.unique(l,
                                           return_counts=True,
                                           return_inverse=True)
            unique = unique[unique != 0]
            c = c[inverse]
            c[l == 0] = 0

            for u in unique:
                candidate = np.where(l == u)[0]
                candidates.append(candidate)

    #---
    #<todo>
    #fix angle discontinuity problem here ...

    #-----
    #sort
    count = np.array([len(candidate) for candidate in candidates])
    sort = np.argsort(-count)
    candidates = [candidates[s] for s in sort]

    #show
    max_label = 1
    label = np.zeros(N, np.int32)
    count = np.zeros(N, np.int32)

    for candidate in candidates:
        n = candidate
        L = len(n)
        #print(L)

        #---- filtering (secret sauce) ----------
        #if L<3: continue
        n = n[np.argsort(np.fabs(z[n]))]

        layer_id0 = layer_id[n[:-1]]
        layer_id1 = layer_id[n[1:]]
        ld = layer_id1 - layer_id0
        if np.any(ld > 2): continue

        m = count[n].max()
        if L < m: continue

        #---- filtering ----------------------

        count[n] = L
        label[n] = max_label
        max_label += 1

        ## show:
        if L >= 3:
            #c = np.random.uniform(0,1,3)#[0,0,0]
            c = [0, 0, 0]

            #ax3d1.clear()
            #plot_particles(ax3d1, particle_ids, p, a,r,zr, z)
            #ax3d1.scatter(ar, r,  zr, c=color, s=64, edgecolors='none')
            ax3d1.plot(a[n],
                       r[n],
                       z[n],
                       '.-',
                       color=c,
                       markersize=5,
                       linewidth=1)
            #ax3d1.plot(a[[n[0],n[-1]]],r[[n[0],n[-1]]],zr[[n[0],n[-1]]],'-',  color=[1,0,0], markersize=5,  linewidth=1)

        #plt.pause(0.01)
        #plt.waitforbuttonpress(-1)
        #plt.show()

    ##-###################################################################################
    submission = pd.DataFrame(columns=['event_id', 'hit_id', 'track_id'],
                              data=np.column_stack(([
                                  int(event_id),
                              ] * len(df), df.hit_id.values,
                                                    label))).astype(int)
    score1 = score_event(df, submission)
    score2, results = cpmp_fast_score(df, submission)

    #print results
    max_score = df.weight.sum()
    print('max_score = df.weight.sum() = %0.5f' % max_score)
    print('score1= %0.5f  (%0.5f)' % (score1 * max_score, score1))
    print('score2= %0.5f  (%0.5f)' % (score2, score2 / max_score))

    plt.show()
    print('end')
    exit(0)
Example #30
def study_dbscan_for_tracklet_seeding():

    ## load an event ---
    event_id = '000001029'

    path_to_train = "data/train_1"
    particles = pd.read_csv(path_to_train +
                            '/event%s-particles.csv' % event_id)
    hits = pd.read_csv(path_to_train + '/event%s-hits.csv' % event_id)
    truth = pd.read_csv(path_to_train + '/event%s-truth.csv' % event_id)
    #cells = pd.read_csv(path_to_train + '/event%s-cells.csv'%event_id)

    truth = truth.merge(hits, on=['hit_id'], how='left')
    truth = truth.merge(particles, on=['particle_id'], how='left')

    #--------------------------------------------------------
    df = truth.copy()
    df = df.assign(r=np.sqrt(df.x**2 + df.y**2))
    df = df.assign(d=np.sqrt(df.x**2 + df.y**2 + df.z**2))
    df = df.assign(a=np.arctan2(df.y, df.x))
    df = df.assign(cosa=np.cos(df.a))
    df = df.assign(sina=np.sin(df.a))
    df = df.assign(phi=np.arctan2(df.z, df.r))
    df = df.assign(momentum=np.sqrt(df.px**2 + df.py**2 + df.pz**2))
    df.loc[df.particle_id == 0, 'momentum'] = 0

    df = df.loc[df.z > 500]  # consider dataset subset
    df = df.loc[df.r < 50]
    N = len(df)

    #-------------------------------------------------------
    momentum = df[['momentum']].values.astype(np.float32)
    p = df[['particle_id']].values.astype(np.int64)
    x, y, z, r, a, cosa, sina, phi = df[[
        'x', 'y', 'z', 'r', 'a', 'cosa', 'sina', 'phi'
    ]].values.astype(np.float32).T

    particle_ids = np.unique(p)
    particle_ids = particle_ids[particle_ids != 0]
    num_particle_ids = len(particle_ids)

    # do dbscan here =======================================
    data = np.column_stack([a, z / r * 0.1])

    _, l = dbscan(
        data,
        eps=0.01,
        min_samples=1,
    )

    submission = pd.DataFrame(columns=['event_id', 'hit_id', 'track_id'],
                              data=np.column_stack(([
                                  int(event_id),
                              ] * len(df), df.hit_id.values, l))).astype(int)
    #score1 = score_event(df, submission)
    #print(df)
    #print(submission)
    score2, results = cpmp_fast_score(df, submission)

    #print results
    #max_score = df.weight.sum()
    #print('max_score = df.weight.sum() = %0.5f'%max_score)
    #print('score1= %0.5f  (%0.5f)'%(score1*max_score,score1))
    #print('score2= %0.5f  (%0.5f)'%(score2,score2/max_score))

    ## analyse the results here =============================
    d0, d1 = data.T
    track_ids = np.unique(l)
    track_ids = track_ids[track_ids != 0]
    num_track_ids = len(track_ids)

    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111)
    fig.patch.set_facecolor('white')

    fig1 = plt.figure(figsize=(8, 8))
    ax1 = fig1.add_subplot(111)
    ax1 = Axes3D(fig1)
    fig1.patch.set_facecolor('white')

    def show_ax():
        ax1.set_xlabel('a', fontsize=16)
        ax1.set_ylabel('r', fontsize=16)
        ax1.set_zlabel('z', fontsize=16)
        ax.set_xlabel('a', fontsize=16)
        ax.set_ylabel('z/r', fontsize=16)
        # ax.grid()
        # ax.set_aspect('equal', 'box')

        plt.show()

    ## 0. show data:
    if False:
        ax.clear()
        ax1.clear()
        ax.plot(d0,
                d1,
                '.',
                color=[0.75, 0.75, 0.75],
                markersize=3,
                linewidth=0)
        ax1.plot(a,
                 r,
                 z,
                 '.',
                 color=[0.75, 0.75, 0.75],
                 markersize=3,
                 linewidth=0)
        show_ax()

    ## 1. show GT:
    if True:

        ax.clear()
        ax1.clear()
        ax.plot(d0,
                d1,
                '.',
                color=[0.75, 0.75, 0.75],
                markersize=3,
                linewidth=0)
        ax1.plot(a,
                 r,
                 z,
                 '.',
                 color=[0.75, 0.75, 0.75],
                 markersize=3,
                 linewidth=0)

        ax.set_title('Ground truth')
        ax1.set_title('Ground truth')

        ax1.set_xlabel('a', fontsize=16)
        ax1.set_ylabel('r', fontsize=16)
        ax1.set_zlabel('z', fontsize=16)
        ax.set_xlabel('a', fontsize=16)
        ax.set_ylabel('z/r', fontsize=16)

        for n in range(0, num_particle_ids, 1):
            particle_id = particle_ids[n]
            t = np.where(p == particle_id)[0]
            #if momentum[t[0]]<min_momentum: continue
            t = t[np.argsort(np.fabs(z[t]))]

            if np.fabs(a[t[0]] - a[t[-1]]) > 1: continue
            d = ((x[t[0]] - x[t[-1]])**2 + (y[t[0]] - y[t[-1]])**2 +
                 (z[t[0]] - z[t[-1]])**2)**0.5
            if d < 10: continue

            ###print(n, particle_id)
            color = np.random.uniform(0, 1, (3))

            #ax.clear()
            #ax1.clear()

            ax.plot(data[t, 0],
                    data[t, 1],
                    '.',
                    color=color,
                    markersize=5,
                    linewidth=0)
            ax1.plot(a[t],
                     r[t],
                     z[t],
                     '.-',
                     color=color,
                     markersize=5,
                     linewidth=1)
            #ax1.plot(a[h],r[h], z[h], 'o',  color=[0,0,0], markersize=8,  linewidth=1, mfc='none')

            #ax1.view_init(0, (ax_n*3)%360)
            #ax_n += 1

            #fig1.savefig('/root/share/project/kaggle/cern/results/yy/%05d.png'%ax_n)
            #plt.pause(0.01)
            #plt.waitforbuttonpress(-1)

        #show_ax()

    ## 2. show dbscan prediction:
    if True:
        fig_ = plt.figure(figsize=(8, 8))
        ax_ = fig_.add_subplot(111, )
        fig_.patch.set_facecolor('white')

        fig1_ = plt.figure(figsize=(8, 8))
        ax1_ = fig1_.add_subplot(111, projection='3d')
        fig1_.patch.set_facecolor('white')

        ax_.clear()
        ax1_.clear()
        ax_.plot(d0,
                 d1,
                 '.',
                 color=[0.75, 0.75, 0.75],
                 markersize=3,
                 linewidth=0)
        ax1_.plot(a,
                  r,
                  z,
                  '.',
                  color=[0.75, 0.75, 0.75],
                  markersize=3,
                  linewidth=0)

        ax_.set_title('DBSCAN Prediction')
        ax1_.set_title('DBSCAN Prediction')

        ax1_.set_xlabel('a', fontsize=16)
        ax1_.set_ylabel('r', fontsize=16)
        ax1_.set_zlabel('z', fontsize=16)
        ax_.set_xlabel('a', fontsize=16)
        ax_.set_ylabel('z/r', fontsize=16)

        for n in range(0, num_track_ids, 1):
            track_id = track_ids[n]
            t = np.where(l == track_id)[0]
            #if momentum[t[0]]<min_momentum: continue
            t = t[np.argsort(np.fabs(z[t]))]

            if np.fabs(a[t[0]] - a[t[-1]]) > 1: continue
            d = ((x[t[0]] - x[t[-1]])**2 + (y[t[0]] - y[t[-1]])**2 +
                 (z[t[0]] - z[t[-1]])**2)**0.5
            if d < 10: continue

            ###print(n, track_id)
            color = np.random.uniform(0, 1, (3))

            #ax.clear()
            #ax1.clear()

            ax_.plot(data[t, 0],
                     data[t, 1],
                     '.',
                     color=color,
                     markersize=5,
                     linewidth=0)
            ax1_.plot(a[t],
                      r[t],
                      z[t],
                      '.-',
                      color=color,
                      markersize=5,
                      linewidth=1)
            #ax1.plot(a[h],r[h], z[h], 'o',  color=[0,0,0], markersize=8,  linewidth=1, mfc='none')

            #ax1.view_init(0, (ax_n*3)%360)
            #ax_n += 1

            #fig1.savefig('/root/share/project/kaggle/cern/results/yy/%05d.png'%ax_n)
            #plt.pause(0.01)
            #plt.waitforbuttonpress(-1)

        #show_ax()
        #plt.show()

    ################################################################################################

    # analysis ...
    ## <to be updated> ...

    results = results.assign(
        detected=(results.count_both > results.count_particle)
        & (results.count_both > results.count_track))

    detected = results.loc[results.detected == True]
    missed = results.loc[(results.detected == False) &
                         (results.count_track < results.count_particle * 0.5)]
    fp = results.loc[(results.detected == False)
                     & (results.count_track > results.count_particle * 0.5)]

    detected = np.unique(detected.particle_id.values)
    missed = np.unique(missed.particle_id.values)
    fp = np.unique(fp.track_id.values)

    detected = detected[detected != 0]
    missed = missed[missed != 0]
    fp = fp[fp != 0]

    num_detected = len(detected)
    num_missed = len(missed)
    num_fp = len(fp)

    # show detected, missed, and false-positive tracks
    for (p, q) in [(p, detected), (p, missed), (l, fp)]:
        fig_ = plt.figure(figsize=(8, 8))
        ax_ = fig_.add_subplot(111, )
        fig_.patch.set_facecolor('white')

        fig1_ = plt.figure(figsize=(8, 8))
        ax1_ = fig1_.add_subplot(111, projection='3d')
        fig1_.patch.set_facecolor('white')

        ax_.clear()
        ax1_.clear()
        ax_.plot(d0,
                 d1,
                 '.',
                 color=[0.75, 0.75, 0.75],
                 markersize=3,
                 linewidth=0)
        ax1_.plot(a,
                  r,
                  z,
                  '.',
                  color=[0.75, 0.75, 0.75],
                  markersize=3,
                  linewidth=0)

        ax1_.set_xlabel('a', fontsize=16)
        ax1_.set_ylabel('r', fontsize=16)
        ax1_.set_zlabel('z', fontsize=16)
        ax_.set_xlabel('a', fontsize=16)
        ax_.set_ylabel('z/r', fontsize=16)

        for n in range(0, len(q), 1):
            t = np.where(p == q[n])[0]
            #if momentum[t[0]]<min_momentum: continue
            t = t[np.argsort(np.fabs(z[t]))]

            if np.fabs(a[t[0]] - a[t[-1]]) > 1: continue
            d = ((x[t[0]] - x[t[-1]])**2 + (y[t[0]] - y[t[-1]])**2 +
                 (z[t[0]] - z[t[-1]])**2)**0.5
            if d < 10: continue

            ##print(n, track_id)
            color = np.random.uniform(0, 1, (3))

            #ax.clear()
            #ax1.clear()

            ax_.plot(data[t, 0],
                     data[t, 1],
                     '.',
                     color=color,
                     markersize=5,
                     linewidth=0)
            ax1_.plot(a[t],
                      r[t],
                      z[t],
                      '.-',
                      color=color,
                      markersize=5,
                      linewidth=1)
            #plt.pause(0.01)

    plt.show()

    zz = 0

    exit(0)
Example #31
def make_data(
        a,
        zr,
        z,
        my_layer_id,
        p,
        # a_limit=(1.0,3.0), zr_limit=(4.0,7.0),
        a_limit=(1.0, 2.0),
        zr_limit=(4.0, 5.0),
        depth=6):
    a0, a1 = a_limit
    zr0, zr1 = zr_limit

    idx = np.where((a >= a0) & (a < a1) & (zr >= zr0) & (zr < zr1))[0]
    aa, zzr, zz = a[idx], zr[idx], z[idx] / 1000
    ll = my_layer_id[idx]
    pp = p[idx]

    data3 = np.column_stack((aa, zzr, zz))
    L = len(data3)

    pairs = []
    for d in range(depth - 1):
        i0 = np.where(ll == d)[0]
        i1 = np.where(ll == d + 1)[0]

        L0 = len(i0)
        L1 = len(i1)
        if L0 == 0: continue
        if L1 == 0: continue

        q0 = data3[i0]
        q1 = data3[i1]
        qq0 = np.repeat(q0.reshape(L0, 1, 3), L1, axis=1).reshape(-1, 3)
        qq1 = np.repeat(q1.reshape(1, L1, 3), L0, axis=0).reshape(-1, 3)
        ii0 = np.repeat(i0.reshape(L0, 1), L1, axis=1).reshape(-1, 1)
        ii1 = np.repeat(i1.reshape(1, L1), L0, axis=0).reshape(-1, 1)

        unit = qq1 - qq0
        unit = unit / np.sqrt((unit**2).sum(1, keepdims=True))
        ii = np.zeros((L0 * L1, 1), np.int32)

        pair = np.concatenate((ii0, ii1, ii, qq0, qq1, unit), 1)
        pairs.append(pair)

    P = len(pairs)
    M = 0
    for p in pairs:
        dM = len(p)
        p[:, 2] = np.arange(M, M + dM)
        M += dM

    distance = np.full((M, M), 100, np.float32)  # 100 acts as an 'infinite' distance
    for d in range(P - 1):
        for a in pairs[d]:
            ai0, ai1, ai = a[:3].astype(np.int32)
            ap, aq, aunit = np.split(a[3:], 3)
            if ((np.fabs(aunit[0]) > 0.25) | (np.fabs(aunit[1]) > 0.25)):
                continue

            b = pairs[d + 1]
            i = (np.where((b[:, 0] == ai1)))[0]

            bi = (b[:, 2][i]).astype(np.int32)
            dis = np.sqrt(((b[:, -3:][i] - aunit)**2).sum(1))
            distance[ai, bi] = dis

    print('dbscan')
    # cluster hit pairs whose unit direction vectors are similar, using the
    # precomputed pair-to-pair distance matrix built above
    _, l = dbscan(distance, eps=0.080, min_samples=1, metric='precomputed')
    cluster_id = np.unique(l + 1)
    #cluster_id = cluster_id[cluster_id!=0]
    num_cluster_id = len(cluster_id)

    ## draw clustering results -----------------------------------
    print('draw clustering results')
    pairs_flat = np.vstack(pairs)

    AX3d1.clear()
    AX3d1.scatter(aa,
                  zzr,
                  zz,
                  c=plt.cm.gnuplot(ll / depth),
                  s=16,
                  edgecolors='none')
    plot3d_particles(AX3d1,
                     aa,
                     zzr,
                     zz,
                     zz,
                     pp,
                     subsample=1,
                     color=[0, 0, 0],
                     linewidth=4)

    for id in cluster_id:
        #AX3d1.clear()
        #AX3d1.scatter(aa, zzr,  zz, c=plt.cm.gnuplot( ll/depth ), s=16, edgecolors='none')

        t = np.where(l == id)
        t0 = pairs_flat[t, 0].astype(np.int32).reshape(-1)
        t1 = pairs_flat[t, 1].astype(np.int32).reshape(-1)
        t = np.unique(np.concatenate((t0, t1)))
        #if len(t0)<3: continue

        color = np.random.uniform(0, 1, 3)
        #AX3d1.plot(aa[t0], zzr[t0],  zz[t0],'.-', color=color, markersize=15) #edgecolors=
        #AX3d1.plot(aa[t1], zzr[t1],  zz[t1],'.-', color=color, markersize=15)
        AX3d1.plot(aa[t], zzr[t], zz[t], '.-', color=color, markersize=15)
        #plt.pause(0.01)
        #plt.waitforbuttonpress(-1)
    plt.show()

    return 0
Example #32
def test_dbscan_badargs(args):
    # Test bad argument values: these should all raise ValueErrors
    with pytest.raises(ValueError):
        dbscan(X, **args)
Example #33
    def predict(self, dfh):
        print("size(dfh): {0}".format(len(dfh)))

        if ("rt" not in dfh.columns):
            dfh["rt"] = np.sqrt(dfh['x'].values**2 + dfh['y'].values**2)

        z = dfh['z'].values
        rt = dfh["rt"].values
        a0 = np.arctan2(dfh['y'].values, dfh['x'].values)
        layer_id = dfh['layer_id'].values.astype(np.float32)

        sys.stderr.write("dbscan for each (z,a) shifting\n")
        scan_labels = []
        for (dj, di) in tqdm(product(self.djs, self.dis),
                             total=len(self.djs) * len(self.dis)):
            ar = a0 + di * rt
            zr = (z + dj) / rt * 0.1
            data2 = np.column_stack([ar, zr])

            _, scan_label = dbscan(
                data2,
                eps=0.0025,
                min_samples=1,
            )
            scan_labels.append(scan_label)

        sys.stderr.write("make candidates\n")
        candidates = []
        for scan_label in tqdm(scan_labels):
            l = scan_label
            unique = np.unique(l)
            for u in unique:
                candidate = np.where(l == u)[0]
                candidates.append(candidate)

        print("# of candidates : {0}".format(len(candidates)))
        count = np.array([len(candidate) for candidate in candidates])
        sort = np.argsort(-count)
        candidates = [candidates[s] for s in sort]

        max_label = 1
        N = len(dfh)
        label = np.zeros(N, np.int32)
        count = np.zeros(N, np.int32)

        sys.stderr.write("calculate clustering label from candidates\n")
        for candidate in tqdm(candidates):
            n = candidate
            L = len(n)

            n = n[np.argsort(np.fabs(z[n]))]
            layer_id0 = layer_id[n[:-1]]
            layer_id1 = layer_id[n[1:]]
            ld = layer_id1 - layer_id0
            if np.any(ld > 2): continue

            m = count[n].max()
            if L < m: continue

            count[n] = L
            label[n] = max_label
            max_label += 1

        return label