Example #1
    def execute(self, namespace):
        from sklearn.cluster import dbscan

        inp = namespace[self.inputName]
        mapped = tabular.mappingFilter(inp)

        # Note that sklearn gives unclustered points a label of -1, and cluster labels start at 0.
        try:
            core_samp, dbLabels = dbscan(np.vstack(
                [inp[k] for k in self.columns]).T,
                                         self.searchRadius,
                                         self.minClumpSize,
                                         n_jobs=self.numberOfJobs)
            multiproc = True
        except:
            core_samp, dbLabels = dbscan(
                np.vstack([inp[k] for k in self.columns]).T, self.searchRadius,
                self.minClumpSize)
            multiproc = False

        if multiproc:
            logger.info('using dbscan multiproc version')
        else:
            logger.info('falling back to dbscan single-threaded version')

        # shift dbscan labels up by one to match existing convention that a clumpID of 0 corresponds to unclumped
        mapped.addColumn('dbscanClumpID', dbLabels + 1)

        # propagate metadata, if present
        try:
            mapped.mdh = inp.mdh
        except AttributeError:
            pass

        namespace[self.outputName] = mapped
Example #2
    def execute(self, namespace):
        from sklearn.cluster import dbscan

        inp = namespace[self.inputName]
        mapped = tabular.MappingFilter(inp)

        # Note that sklearn gives unclustered points a label of -1, and cluster labels start at 0.
        if self.multithreaded:
            core_samp, dbLabels = dbscan(np.vstack(
                [inp[k] for k in self.columns]).T,
                                         self.searchRadius,
                                         self.minClumpSize,
                                         n_jobs=self.numberOfJobs)
        else:
            # NB: try-except from Christian's multithreaded example removed, as I think we should see failure here
            core_samp, dbLabels = dbscan(
                np.vstack([inp[k] for k in self.columns]).T, self.searchRadius,
                self.minClumpSize)

        # shift dbscan labels up by one to match existing convention that a clumpID of 0 corresponds to unclumped
        mapped.addColumn(str(self.clumpColumnName), dbLabels + 1)

        # propagate metadata, if present
        try:
            mapped.mdh = inp.mdh
        except AttributeError:
            pass

        namespace[self.outputName] = mapped
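The two snippets above rely on sklearn's labelling convention: dbscan marks noise points with -1 and numbers clusters from 0, so adding 1 maps noise to 0 and clusters to 1..n, matching the "clumpID of 0 means unclumped" convention mentioned in the comments. A minimal, self-contained sketch of that shift on toy 1-D data (not part of the module above):

import numpy as np
from sklearn.cluster import dbscan

points = np.array([[0.0], [0.1], [0.2], [10.0], [10.1], [50.0]])
_, labels = dbscan(points, eps=0.5, min_samples=2)
print(labels)      # e.g. [ 0  0  0  1  1 -1]   (-1 marks noise)
print(labels + 1)  # e.g. [ 1  1  1  2  2  0]   (0 now means "unclumped")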
Example #3
def test_dbscan_sparse():
    core_sparse, labels_sparse = dbscan(sparse.lil_matrix(X),
                                        eps=.8,
                                        min_samples=10)
    core_dense, labels_dense = dbscan(X, eps=.8, min_samples=10)
    assert_array_equal(core_dense, core_sparse)
    assert_array_equal(labels_dense, labels_sparse)
Example #4
def test_dbscan_core_samples_toy(algorithm):
    X = [[0], [2], [3], [4], [6], [8], [10]]
    n_samples = len(X)

    # Degenerate case: every sample is a core sample, either with its own
    # cluster or including other close core samples.
    core_samples, labels = dbscan(X, algorithm=algorithm, eps=1, min_samples=1)
    assert_array_equal(core_samples, np.arange(n_samples))
    assert_array_equal(labels, [0, 1, 1, 1, 2, 3, 4])

    # With eps=1 and min_samples=2 only the 3 samples from the denser area
    # are core samples. All other points are isolated and considered noise.
    core_samples, labels = dbscan(X, algorithm=algorithm, eps=1, min_samples=2)
    assert_array_equal(core_samples, [1, 2, 3])
    assert_array_equal(labels, [-1, 0, 0, 0, -1, -1, -1])

    # Only the sample in the middle of the dense area is core. Its two
    # neighbors are edge samples. Remaining samples are noise.
    core_samples, labels = dbscan(X, algorithm=algorithm, eps=1, min_samples=3)
    assert_array_equal(core_samples, [2])
    assert_array_equal(labels, [-1, 0, 0, 0, -1, -1, -1])

    # It's no longer possible to extract core samples with eps=1:
    # everything is noise.
    core_samples, labels = dbscan(X, algorithm=algorithm, eps=1, min_samples=4)
    assert_array_equal(core_samples, [])
    assert_array_equal(labels, np.full(n_samples, -1.))
Example #5
def test_boundaries():
    # ensure min_samples is inclusive of core point
    core, _ = dbscan([[0], [1]], eps=2, min_samples=2)
    assert 0 in core
    # ensure eps is inclusive of circumference
    core, _ = dbscan([[0], [1], [1]], eps=1, min_samples=2)
    assert 0 in core
    core, _ = dbscan([[0], [1], [1]], eps=.99, min_samples=2)
    assert 0 not in core
Example #6
def test_dbscan_input_not_modified(use_sparse, metric):
    # test that the input is not modified by dbscan
    X = np.random.RandomState(0).rand(10, 10)
    X = sparse.csr_matrix(X) if use_sparse else X
    X_copy = X.copy()
    dbscan(X, metric=metric)

    if use_sparse:
        assert_array_equal(X.toarray(), X_copy.toarray())
    else:
        assert_array_equal(X, X_copy)
Example #7
def test_dbscan_sparse_precomputed(include_self):
    D = pairwise_distances(X)
    nn = NearestNeighbors(radius=0.9).fit(X)
    X_ = X if include_self else None
    D_sparse = nn.radius_neighbors_graph(X=X_, mode="distance")
    # Ensure it is sparse not merely on diagonals:
    assert D_sparse.nnz < D.shape[0] * (D.shape[0] - 1)
    core_sparse, labels_sparse = dbscan(
        D_sparse, eps=0.8, min_samples=10, metric="precomputed"
    )
    core_dense, labels_dense = dbscan(
        D, eps=0.8, min_samples=10, metric="precomputed"
    )
    assert_array_equal(core_dense, core_sparse)
    assert_array_equal(labels_dense, labels_sparse)
Example #8
def test_dbscan_sparse_precomputed_different_eps():
    # test that precomputed neighbors graph is filtered if computed with
    # a radius larger than DBSCAN's eps.
    lower_eps = 0.2
    nn = NearestNeighbors(radius=lower_eps).fit(X)
    D_sparse = nn.radius_neighbors_graph(X, mode='distance')
    dbscan_lower = dbscan(D_sparse, eps=lower_eps, metric='precomputed')

    higher_eps = lower_eps + 0.7
    nn = NearestNeighbors(radius=higher_eps).fit(X)
    D_sparse = nn.radius_neighbors_graph(X, mode='distance')
    dbscan_higher = dbscan(D_sparse, eps=lower_eps, metric='precomputed')

    assert_array_equal(dbscan_lower[0], dbscan_higher[0])
    assert_array_equal(dbscan_lower[1], dbscan_higher[1])
Example #9
def run_dbscan(geodata: pd.DataFrame, eps: int, minpts: int) -> pd.DataFrame:
    """
    Run the DBSCAN algorithm using the user-supplied epsilon (in metres) and minimum number of points.
    """

    eps = eps / 1000

    kms_per_radian = 6371.0088
    epsilon = eps / kms_per_radian
    minsamples = minpts
    radians = np.radians(geodata[['x_coordinate', 'y_coordinate']])

    # DBSCAN
    preds = dbscan(radians,
                   eps=epsilon,
                   min_samples=minsamples,
                   algorithm='ball_tree',
                   metric='haversine')[1]
    dbscan_coords = np.append(radians, preds.reshape(-1, 1), axis=1)
    pd.DataFrame(dbscan_coords).plot(x=1,
                                     y=0,
                                     kind="scatter",
                                     c=2,
                                     colorbar=True,
                                     title="DBSCAN (eps= 15m, min_points=5)",
                                     marker="+",
                                     colormap="tab20b")

    geodata['Cluster'] = pd.DataFrame(dbscan_coords)[2]

    return geodata
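This example, the next one, and Example #13 all rely on the same unit conversion: with metric='haversine' the distances are angles in radians on a unit sphere, so an epsilon given in metres is divided by 1000 (metres to kilometres) and then by the Earth's mean radius in kilometres. A small sketch of that arithmetic (the helper name is mine, purely illustrative):

kms_per_radian = 6371.0088  # mean Earth radius in km

def eps_in_radians(eps_metres: float) -> float:
    # metres -> kilometres -> radians on the unit sphere
    return eps_metres / 1000.0 / kms_per_radian

print(eps_in_radians(15))  # ~2.35e-06, the value actually passed as eps for a 15 m radius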
Example #10
def get_cluster_assignments(radius_meter: float, min_measures: int, coordinates: List[List[float]], weights):
    km_radian = 6371.0088  # Conversion: kilometers per radian (mean Earth radius in km)
    epsilon = radius_meter/1000/km_radian
    #return DBSCAN(metric='haversine', algorithm='ball_tree', eps=epsilon, min_samples=min_measures).fit(
    #    radians(coordinates)).labels_
    return dbscan(radians(coordinates), eps=epsilon, min_samples=min_measures, metric='haversine', algorithm='ball_tree',
        sample_weight=weights)[1] # returns: tuple(core_samples, labels)
Example #11
def smooth_eye_position(eye_position, threshold=2):
    x_pos, y_pos = eye_position[:, 0], eye_position[:, 1]

    X = np.stack((x_pos, y_pos, np.linspace(0, len(x_pos) / 2, len(x_pos)))).T
    clusters = cluster.dbscan(X,
                              eps=threshold,
                              min_samples=3,
                              metric='minkowski',
                              p=2)

    move_events = np.where(clusters[1][1:] > clusters[1][:-1])[0] + 1

    len_chunks = [move_events[0]] + list(move_events[1:] - move_events[:-1])
    len_chunks.append(len(x_pos) - move_events[-1])

    eye_x_positions = np.split(x_pos, move_events)
    eye_y_positions = np.split(y_pos, move_events)

    mean_x_pos = np.array(list(map(np.mean, eye_x_positions)))
    mean_y_pos = np.array(list(map(np.mean, eye_y_positions)))

    x_pos_smooth = np.concatenate(
        [[x_pos] * len_chunk
         for x_pos, len_chunk in zip(mean_x_pos, len_chunks)])
    y_pos_smooth = np.concatenate(
        [[y_pos] * len_chunk
         for y_pos, len_chunk in zip(mean_y_pos, len_chunks)])

    return np.stack((x_pos_smooth, y_pos_smooth)).T
Example #12
    def homog_lev_series(obj, eps=eps, min_samples=min_samples):
        name = obj.name

        original = obj.copy()
        obj = obj.drop_duplicates()
        data = obj.tolist()

        def lev_metric(x, y):
            i, j = int(x[0]), int(y[0])
            return levenshtein(data[i], data[j])

        X = np.arange(len(data)).reshape(-1, 1)
        labels = dbscan(X, metric=lev_metric, eps=eps,
                        min_samples=min_samples)[1]

        x = pd.DataFrame({
            'A': obj.reset_index(drop=True),
            'B': pd.Series(labels)
        })
        y = x.drop_duplicates('B')
        y = y[~(y.B == -1)]
        y.columns = ['C', 'B']
        x = x.merge(y, on='B', how='left')
        x['C'] = np.where(x.C.isnull(), x.A, x.C)

        results = pd.DataFrame({'A': original})
        results = results.merge(x[['A', 'C']], on='A', how='left')
        out = results.C.rename(name)

        return out
Example #13
def run_dbscan(geodata: pd.DataFrame) -> pd.DataFrame:

    kms_per_radian = 6371.0088
    epsilon = 0.015 / kms_per_radian
    minsamples = 5
    radians = np.radians(geodata[['x_coordinate', 'y_coordinate']])

    # DBSCAN
    preds = dbscan(radians,
                   eps=epsilon,
                   min_samples=minsamples,
                   algorithm='ball_tree',
                   metric='haversine')[1]
    dbscan_coords = np.append(radians, preds.reshape(-1, 1), axis=1)
    pd.DataFrame(dbscan_coords).plot(x=1,
                                     y=0,
                                     kind="scatter",
                                     c=2,
                                     colorbar=True,
                                     title="DBSCAN (eps= 15m, min_points=5)",
                                     marker="+",
                                     colormap="tab20b")

    geodata['Cluster'] = pd.DataFrame(dbscan_coords)[2]

    return geodata
Example #14
def categoryAnalysis(count=1000,
                     dbScanCount=100,
                     buckets=100,
                     startEps=100,
                     samples=3):
    matrix, counter = loadMatrixPickle(count, buckets)
    sys.exit()

    #run Dbscan
    start = time.time()

    db = dbscan(matrix.matrix[:dbScanCount],
                eps=startEps,
                algorithm='kd_tree',
                min_samples=samples,
                n_jobs=-1)
    print(
        "DBScan: eps= %.3f, min_samples=%d, %d clusters generated, %.2f%% noise, %d articles"
        % (startEps, samples, len(set(db[1])),
           100.0 * list(db[1]).count(-1) / count, len(matrix.matrix)))
    print("DBScan time: %.2fs" % (time.time() - start))

    start = time.time()

    totalDb = fastCluster(dbScanCount, db, matrix.matrix)
    print("fastCluster time: %.2fs" % (time.time() - start))

    start = time.time()

    clusterCategoryCounter, totalCategoryCounter = getCategoryAppearanceRates(
        matrix, totalDb, counter)
    findHighCategoryRates(clusterCategoryCounter, totalCategoryCounter,
                          totalDb, count, counter)

    print("Category analysing time: %.2fs" % (time.time() - start))
Example #15
def split_eye_events(eye_tracking, eps=2):
    """
    Split the record where the eye moves. Detection done with clustering on X,Y and time of the eye position.

    params:
        - eye_tracking: Eye tracking array of the ellipse fit, in shape (t, (x,y,width,height,angle))
        - eps: Distance to detect eye movements. Adjust this parameter if results are not satisfying
        - kind: kind of interpolation in {'linear', 'cubic', 'quintic'}
    return:
        - move_indexes, blink_indexes, noise_indexes
    """
    x_pos = np.array(eye_tracking[:, 0])

    X = np.stack((x_pos, np.linspace(0, len(x_pos), len(x_pos)) * .5)).T
    clusters = cluster.dbscan(X,
                              eps=eps,
                              min_samples=5,
                              metric='minkowski',
                              p=2)
    move_indexes = np.where(clusters[1][1:] > clusters[1][:-1])[0] + 1

    noise_indexes = np.where(clusters[1] == -1)[0]
    blink_indexes = np.where(x_pos == 0)[0]

    return move_indexes, blink_indexes, noise_indexes
Example #16
def post_process_y(y, eps=0.06, min_samples=2):
    # no postprocessing
    if min_samples > y.shape[1]:
        return y
    for i in range(y.shape[0]):
        row = y[i]
        row = row.reshape(-1, 1)
        _, labels = dbscan(row,
                           eps=eps,
                           min_samples=min_samples,
                           metric='euclidean')
        minusonecluster = sum(labels == -1)
        clusters = len(np.unique(labels))
        print(row, labels)
        # one of the unique clusters is -1
        # these are clusters by themselves
        # and we do not touch them(we are
        # confident about the result)
        if minusonecluster > 0:
            clusters = clusters - 1
        for j in range(clusters):
            indices = (labels == j)
            # we are not sure about the
            # ranking of the elements of
            # this cluster. We prefer to
            # assign them equal probability
            row[indices] = np.mean(row[indices])
        row = row.reshape(1, -1)
        y[i] = row
    return y
Example #17
def size_hist(parts, params, eps=1.0, sp=0):
    """
  Finds clusters in the list of particles

  Parameters
  ----------
  parts
    a list of particle objects to be clustered
  params
    a dict of configuration values
  eps
    the separation distance to use for identifying clusters
  sp
    the species to identify clusters in
  """
    # Extract the position vectors
    D = np.zeros((len(parts), 3))
    ps = 0
    for p in range(len(parts)):
        # Check if the particle is of the desired specie
        if sp == 0 or parts[p].sp == sp:
            D[ps] = parts[p].x
            ps += 1
    # Truncate zeros if we didn't cluster everything
    D = D[:ps]

    [core, labels] = dbscan(D, eps=eps, min_samples=1)
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print('Found', n_clusters_, 'clusters')
    # Sizes of each cluster
    cluster_sizes = np.bincount(labels)
    # Number of clusters for given size
    size_hist = np.bincount(cluster_sizes)
    return size_hist[1:]
Example #18
def get_dbscan_clusters_mask(mask, N, coins_hsv):
    # Generate data points for clustering
    data_points = np.empty(shape=(N, 3))
    for i in range(0, N):
        local_mask = np.where(mask == i, 255, 0)
        local_mask = local_mask.astype(np.uint8)

        (m1, m2, m3,
         _) = cv2.mean(coins_hsv,
                       local_mask)  # hsv gives better results than lab

        data_points[i] = [m1, m2, m3]

    core_samples, labels = dbscan(data_points, 16.5, 1)

    for x in range(0, mask.shape[0]):
        for y in range(0, mask.shape[1]):
            label = labels[mask[x, y]]

            if label == -1:  # noise is background
                label = 0

            mask[x, y] = label

    mask = np.where(mask == 0, 0, 255)

    mask = mask.astype(np.uint8)

    print("There are {} clusters".format(np.max(labels)))

    return mask
Example #19
def dbscan_labels(pointcloud, epsilon, minpoints, rgb_weight=0, algorithm="ball_tree"):
    """
    Find an array of point-labels of clusters found by the DBSCAN algorithm.

    Parameters
    ----------
    pointcloud : pcl.PointCloud
        Input pointcloud.
    epsilon : float
        Neighborhood radius for DBSCAN.
    minpoints : integer
        Minimum neighborhood density for DBSCAN.
    rgb_weight : float, optional
        If non-zero, cluster on color information as well as location;
        specifies the relative weight of the RGB components to spatial
        coordinates in distance computations.
        (RGB values have wildly different scales than spatial coordinates.)

    Returns
    -------
    labels : Sequence
        A sequence of labels per point. Label -1 indicates a point does not
        belong to any cluster, other labels indicate the cluster number a
        point belongs to.
    """

    if rgb_weight > 0:
        X = pointcloud.to_array()
        X[:, 3:] *= rgb_weight
    else:
        X = pointcloud

    _, labels = dbscan(X, eps=epsilon, min_samples=minpoints, algorithm=algorithm)
    return np.asarray(labels)
Example #20
def dbscan(threshold, matrix, taxa, revert=False, min_samples=1):
    """
    Compute DBSCAN cluster analysis.
    """
    if not taxa:
        taxa = list(range(1, len(matrix) + 1))

    core_samples, labels = cluster.dbscan(matrix,
                                          eps=threshold,
                                          min_samples=min_samples,
                                          metric='precomputed')

    # change to our internal cluster style
    idx = max(labels) + 1
    if idx == 0: idx += 1
    for i, c in enumerate(labels):
        if c == -1:
            labels[i] = idx
            idx += 1

    # check for revert
    if revert:
        return dict(zip(range(len(taxa)), labels))

    # return stuff
    clr = {}
    for i, t in enumerate(taxa):
        try:
            clr[labels[i]] += [t]
        except KeyError:
            clr[labels[i]] = [t]

    return clr
Example #21
def size_hist(parts, params, eps=1.0, sp=0):
  """
  Finds clusters in the list of particles

  Parameters
  ----------
  parts
    a list of particle objects to be clustered
  params
    a dict of configuration values
  eps
    the separation distance to use for identifying clusters
  sp
    the species to identify clusters in
  """
  # Extract the position vectors
  D = np.zeros((len(parts),3))
  ps = 0
  for p in range(len(parts)):
    # Check if the particle is of the desired specie
    if sp == 0 or parts[p].sp == sp:
      D[ps] = parts[p].x 
      ps += 1
  # Truncate zeros if we didn't cluster everything
  D = D[:ps]

  [core, labels] =  dbscan( D, eps=eps, min_samples=1)
  n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
  print('Found',n_clusters_,'clusters')
  # Sizes of each cluster
  cluster_sizes = np.bincount(labels)
  # Number of clusters for given size
  size_hist = np.bincount( cluster_sizes )
  return size_hist[1:]
Example #22
    def step(self, state, a, o):
        """
            state should be new agent internal state after taking
            action a and observing observation o.
            state should contain probability distribution of next observation.
        """
        try:
            if state.type() == 'torch.FloatTensor':
                state = state.detach().numpy()
        except AttributeError:
            pass
        if self.h and len(self.s) > self.h_len:
            self.h = self.h[1:]
            self.s = self.s[1:]
            self.s_labels = self.s_labels[1:]
        self.s = np.append(self.s, [state], axis=0)
        self.h.append([a, o])
        self.s_labels = dbscan([state.flatten() for state in self.s])[1]
        self.observed = np.zeros((max(self.s_labels) + 1, self.a, self.o))
        for i in range(max(self.s_labels) + 1):
            state_indices = [j for j in range(len(self.s)) if self.s_labels[j] == i]
            clust_mean = np.mean([self.s[j] for j in state_indices], axis=0)
            self.observed[i] = clust_mean
            self.update_act()
            self.update_exp()

        self.calc_chisquare()
Example #23
def test_dbscan_callable():
    # Tests the DBSCAN algorithm with a callable metric.
    # Parameters chosen specifically for this task.
    # Different eps to other test, because distance is not normalised.
    eps = 0.8
    min_samples = 10
    # metric is the function reference, not the string key.
    metric = distance.euclidean
    # Compute DBSCAN
    # parameters chosen for task
    core_samples, labels = dbscan(X,
                                  metric=metric,
                                  eps=eps,
                                  min_samples=min_samples,
                                  algorithm='ball_tree')

    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert n_clusters_1 == n_clusters

    db = DBSCAN(metric=metric,
                eps=eps,
                min_samples=min_samples,
                algorithm='ball_tree')
    labels = db.fit(X).labels_

    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert n_clusters_2 == n_clusters
Example #24
    def do_clustering(self, coeffs):
        """
        Do DBSCAN clustering on the corrections.

        :param coeffs: Triplet of distance coefficients, corresponding to the sensitivity of the clustering to point
            separation along 1) x-axis (time), 2) y-axis (correction) and 3) slope (drift rate)
        :type coeffs: tuple(float, float, float)
        :return: Results of sklearn.cluster.dbscan (refer to third party documentation)
        """
        sec_per_week = 7 * 24 * 3600

        def _temporalDist2DSlope(p0, p1, coeffs):
            return math.sqrt((coeffs[0] * (p1[0] - p0[0]))**2 +
                             (coeffs[1] * sec_per_week * (p1[1] - p0[1]))**2 +
                             (coeffs[2] * sec_per_week * sec_per_week *
                              (p1[2] - p0[2]))**2)

        data = np.column_stack(
            (self.correction_times_clean, self.corrections_clean,
             self.corrections_slope))
        ind, ids = dbscan(
            data,
            eps=2 * sec_per_week,
            min_samples=7,
            metric=lambda p0, p1: _temporalDist2DSlope(p0, p1, coeffs))
        return ind, ids
Example #25
def segment_by_dbscan(binary_img: np.ndarray,
                      eps: float = 5.0,
                      min_samples: int = 10) -> List[np.ndarray]:
    """Use DBSCAN clustering to segment binary image.

    Parameters
    ----------
    binary_img: np.ndarray
        binary image, a 2D array containing 0s and 1s (obtained by thresholding
        original image converted to grayscale).
    eps: float
        the epsilon parameter of DBSCAN.
    min_samples: int
        minimum number of pixels each cluster (object) must contain in order to
        be considered a valid object.

    Returns
    -------
    list
        List of coordinate arrays where the n-th entry is the array of
        positions of the pixels belonging to the n-th segmented object.
    """
    indices = np.nonzero(binary_img)
    if len(indices[0]) == 0:
        return []
    xy = np.vstack((indices[1], indices[0])).T
    core, labels = cluster.dbscan(xy,
                                  eps=eps,
                                  min_samples=min_samples,
                                  metric='euclidean',
                                  algorithm='auto')
    unique_labels = set(labels)
    unique_labels.discard(-1)  # -1 is the noise label
    return [xy[labels == label] for label in sorted(unique_labels)]
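A short usage sketch for segment_by_dbscan on a tiny synthetic binary image (the image contents and parameter values are invented here, and the snippet assumes the module-level imports of the example above are in place):

import numpy as np

img = np.zeros((20, 20), dtype=np.uint8)
img[2:5, 2:5] = 1        # first blob, 9 pixels
img[12:18, 10:16] = 1    # second blob, 36 pixels

objects = segment_by_dbscan(img, eps=2.0, min_samples=4)
print(len(objects))                    # expected: 2 segmented objects
print([obj.shape for obj in objects])  # (n_pixels, 2) coordinate arrays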
Example #26
 def cluster(self, x: List[T]) -> List[int]:
     # NB(nkansal96): The selection of `eps` here is arbitrary and most likely wrong. Until this is fixed,
     #   you should use `AffinityCluster`
     _, mapping = dbscan(self.get_distance_matrix(x),
                         eps=2,
                         min_samples=0,
                         metric="precomputed")
     return mapping
Example #27
def get_clusters(X, sent_coll, eps=DEFAULT_EPS):
    db = dbscan(X, eps=eps, min_samples=3, metric='cosine', 
                algorithm='brute')[1]

    d = defaultdict(list)
    for i in range(len(db)):
        d[db[i]].append(sent_coll[i])
    return d
Example #28
def get_clusters(X, sent_coll, eps=DEFAULT_EPS, min_samp=3):
    from scipy.optimize import minimize_scalar
    new_eps = minimize_scalar(lambda x: -len(set(dbscan(X,
                                                        eps=x,
                                                        min_samples=min_samp,
                                                        metric='cosine',
                                                        algorithm='brute')[1])),
                              method='bounded',
                              bounds=[0, 1]).x

    db = dbscan(X, eps=new_eps, min_samples=min_samp, metric='cosine', 
                algorithm='brute')[1]

    d = defaultdict(list)
    for i in range(len(db)):
        d[db[i]].append(sent_coll[i])
    return d, new_eps
Example #29
def get_dbscan_data(mdf, eps, npts):
    np.random.seed(42)
    res = dbscan(mdf[["LatRad", "LonRad"]].values,
                 eps=eps * 1e-5,
                 min_samples=npts,
                 metric='haversine')
    df3 = mdf.copy()
    df3["cluster"] = res[1]
    return df3
Example #30
def DBSCAN_clust(d, words, epsilon):
    if VERBOSE:
        print 'Running DBSCAN!'
    core, labels = dbscan(d, eps=epsilon, metric='precomputed')
    cluster_assignments = labels
    nclust = max(cluster_assignments)
    assignments = pd.DataFrame({'word':words, 'cluster':labels})
    csizes, indices = eval_assignments(assignments, nclust, None)
    return assignments, csizes, indices
Example #31
def cluster_into_spots(df, init_eps=150, levels=2, threshold=0.1):
    start_points = list()
    end_points = list()
    length = len(df)

    for i in range(length):
        start_points.append([df['start_lat'].iloc[i], df['start_lon'].iloc[i]])
        end_points.append([df['end_lat'].iloc[i], df['end_lon'].iloc[i]])

    points = np.radians(np.vstack([start_points, end_points]))

    haversine = DistanceMetric.get_metric('haversine')
    dist = haversine.pairwise(points) * R

    clusters = dbscan(dist, metric='precomputed', min_samples=1,
                      eps=init_eps)[1]
    clusters = np.array(clusters, dtype=np.object)

    for _ in range(levels):
        init_eps = init_eps * 0.5
        counts = dict(Counter(clusters))
        for key in counts:
            if counts[key] > threshold * length:
                idxs = np.where(clusters == key)[0]
                dist = haversine.pairwise(points[idxs]) * R
                inner_clusters = dbscan(dist,
                                        metric='precomputed',
                                        min_samples=1,
                                        eps=init_eps)[1]
                for i, idx in enumerate(idxs):
                    clusters[idx] = "{}_{}".format(clusters[idx],
                                                   inner_clusters[i])

    start_clusters = list()
    end_clusters = list()
    for i, cluster in enumerate(clusters):
        if i < length:
            start_clusters.append(clusters[i])
        else:
            end_clusters.append(clusters[i % length + length])

    df['start_cluster'] = start_clusters
    df['end_cluster'] = end_clusters
    return df
Example #32
 def run(self, src):
   x = self.table(src)
   settings = self.settings
   print(settings)
   cl = dbscan(eps=settings.eps, min_samples=settings.min_pts, algorithm="brute")
   y = cl.fit_predict(x)
   clusters = {}
   for x_i, y_i in zip(x, y):
     clusters[y_i] = clusters.get(y_i, []) + [x_i]
   return clusters
Example #33
def mergeKeypoints(keypoints, eps):
    if len(keypoints) < 2:
        return keypoints
    points = np.array([keypoint.pt for keypoint in keypoints])
    sizes = np.array([keypoint.size for keypoint in keypoints])
    # http://scikit-learn.org/stable/modules/generated/sklearn.cluster.dbscan.html
    _, pointsLabels = dbscan(points, eps = eps, min_samples = 1, metric = 'euclidean')
    clustersPredicates = ((pointsLabels == label) for label in range(0, max(pointsLabels) + 1))
    mergedKeypoints = [createCentroid(points[predicate], sizes[predicate], eps) for predicate in clustersPredicates]
    return mergedKeypoints
Example #34
    def test_dbscan(self):
        iris = datasets.load_iris()
        df = pdml.ModelFrame(iris)

        result = df.cluster.dbscan()
        expected = cluster.dbscan(iris.data)

        self.assertEqual(len(result), 2)
        self.assert_numpy_array_almost_equal(result[0], expected[0])
        self.assertTrue(isinstance(result[1], pdml.ModelSeries))
        self.assert_index_equal(result[1].index, df.index)
        self.assert_numpy_array_equal(result[1].values, expected[1])
Example #35
 def run(self, src):
     x = self.table(src)
     settings = self.settings
     print(settings)
     cl = dbscan(eps=settings.eps,
                 min_samples=settings.min_pts,
                 algorithm="brute")
     y = cl.fit_predict(x)
     clusters = {}
     for x_i, y_i in zip(x, y):
         clusters[y_i] = clusters.get(y_i, []) + [x_i]
     return clusters
Example #36
    def test_dbscan(self):
        iris = datasets.load_iris()
        df = pdml.ModelFrame(iris)

        result = df.cluster.dbscan()
        expected = cluster.dbscan(iris.data)

        self.assertEqual(len(result), 2)
        self.assert_numpy_array_almost_equal(result[0], expected[0])
        self.assertIsInstance(result[1], pdml.ModelSeries)
        tm.assert_index_equal(result[1].index, df.index)
        tm.assert_numpy_array_equal(result[1].values, expected[1])
Example #37
def dbscan_func(data_seg):
    point_cloud = data_seg[0]
    eps = data_seg[1]
    min_samples = data_seg[2]
    metric = data_seg[3]
    algorithm = data_seg[4]
    i_slice = data_seg[5]
    num_points = len(point_cloud)
    print("Starting to cluster slice {} with a total of {} points".format(i_slice, num_points))
    oldtime = time()
    result = dbscan(point_cloud, eps=eps, min_samples=min_samples, metric=metric, algorithm=algorithm)
    print("Finished clustering slice {}. Time for slice: {} sec".format(i_slice, time() - oldtime))
    core_sample_indices = result[0]
    labels = result[1]
    return core_sample_indices, labels, i_slice
Example #38
def Dbscan(embeddings, id_word, word_id, eps, min_size):
  coreSamples, labels = dbscan(embeddings, eps, min_size)
  # group clusters
  clusters = {}
  for i, label in enumerate(labels):
    if label not in clusters:
      clusters[label] = []
    clusters[label].append(id_word[i].encode('utf-8'))
  # output
  print(len(clusters) - 1)
  for c in clusters.iterkeys():
    if c < 0: continue          # -1 is noise
    print(' '.join([str(x) for x in embeddings[int(c)]]))
  print()
  # show clusters
  for c, words in clusters.iteritems():
    print(c, ' '.join(words))
Example #39
def dbscan(
        threshold,
        matrix,
        taxa,
        revert = False,
        min_samples = 1
        ):
    """
    Compute DBSCAN cluster analysis.
    """
    if not taxa:
        taxa = list(range(1,len(matrix)+1))

    core_samples,labels = cluster.dbscan(
            matrix,
            eps=threshold,
            min_samples = min_samples,
            metric = 'precomputed'
            )

    # change to our internal cluster style
    idx = max(labels)+1
    if idx == 0: idx += 1
    for i,c in enumerate(labels):
        if c == -1:
            labels[i] = idx
            idx += 1

    # check for revert
    if revert:
        return dict(
                zip(
                    range(len(taxa)),
                    labels
                    )
                )

    # return stuff 
    clr = {}
    for i,t in enumerate(taxa):
        try:
            clr[labels[i]] += [t]
        except KeyError:
            clr[labels[i]] = [t]

    return clr
Example #40
def _cluster_core(sort_list, r, visited, final_list):

    from sklearn.cluster import dbscan
    from scipy.spatial.distance import euclidean

    pos = np.r_[[i[1] for i in sort_list]]
    if len(pos) >= 2:
        _, labels = dbscan(pos, eps=r, min_samples=2)
        pool = set()
        for i, p in enumerate(sort_list):
            if p[1] in pool:
                continue
            c = labels[i]
            if c==-1:
                continue
            sub = pos[labels==c]
            cen = p[1]
            rad = r
            Local = [p[1]]
            ini = -1
            while len(sub):
                out = []
                for q in sub:
                    if tuple(q) in pool:
                        continue
                    tmp = euclidean(q, cen)
                    if tmp<=rad:
                        Local.append(tuple(q))
                    else:
                        out.append(tuple(q))
                if len(out)==ini:
                    break
                ini = len(out)
                tmp = np.r_[Local]
                # assign centroid to a certain pixel
                cen = tuple(tmp.mean(axis=0).round().astype(int))
                rad = np.int(np.round(max([euclidean(cen,q) for q in Local]))) + r
                sub = np.r_[out]
            for q in Local:
                pool.add(q)
            final_list.append((p[1], cen, rad))
        
        visited.update(pool)
Example #41
def run_cluster(complPG, qfib, qsym,
                cl_radius=cl_radius, min_compl=min_compl):
    """
    """
    start = time.clock()                      # time this

    # # use transforms module for distance
    # quatDistance = lambda x, y: xf.quat_distance(x, y, qsym)

    # use compiled module for distance
    # just to be safe, must order qsym as C-contiguous
    qsym  = np.array(qsym.T, order='C').T
    quatDistance = lambda x, y: xfcapi.quat_distance(np.array(x, order='C'), \
                                                     np.array(y, order='C'), \
                                                     qsym)

    qfib_r = qfib[:, np.r_[complPG] > min_compl]

    print "Feeding %d orientations above %.1f%% to clustering" % (qfib_r.shape[1], 100*min_compl)

    if haveScikit:
        print "Using scikit..."
        pdist = pairwise_distances(qfib_r.T, metric=quatDistance, n_jobs=-1)
        core_samples, labels = dbscan(pdist, eps=d2r*cl_radius, min_samples=1, metric='precomputed')
        cl = np.array(labels, dtype=int) + 1
    else:
        print "Using fclusterdata with a tolerance of %f degrees..." % (cl_radius)
        cl = cluster.hierarchy.fclusterdata(qfib_r.T, d2r*cl_radius, criterion='distance', metric=quatDistance)

    nblobs = len(np.unique(cl))

    qbar = np.zeros((4, nblobs))
    for i in range(nblobs):
        npts = sum(cl == i + 1)
        # qbar[:, i] = mutil.unitVector(
        #     np.sum(qfib_r[:, cl == i + 1].reshape(4, npts), axis=1).reshape(4, 1)).flatten()
        qbar[:, i] = rot.quatAverage(qfib_r[:, cl == i + 1].reshape(4, npts),
                                     qsym).flatten()
    elapsed = (time.clock() - start)

    print "clustering took %f seconds" % (elapsed)
    return qbar, cl
Example #42
def vel_hist(parts, params, eps=1.0, sp=0):
  # Extract the position vectors
  D = np.zeros(( len(parts),3) )
  # Particle velocities
  V = np.zeros( (len(parts),3 ) )
  ps = 0
  for p in range(len(parts)):
    # Check if the particle is of the desired specie
    if sp == 0 or parts[p].sp == sp:
      D[ps] = parts[p].x 
      V[ps] = parts[p].v
      ps += 1
  # Truncate zeros if we didn't cluster everything
  D = D[:ps]
  # Make clusters based on position
  [core, labels] =  dbscan( D, eps=eps, min_samples=1)
  n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
  # The net velocities of each cluster
  cluster_vels = np.zeros( (n_clusters_, 3) );
  
  # Iterate all particles
  for p in range(ps):
    if labels[p] >= 0: # Make sure it was clustered
      # Add particle velocity to its cluster's net velocity
      cluster_vels[ labels[p] ] = np.add( cluster_vels[ labels[p] ], V[p] )

  # Magnitudes of cluster velocities
  normed_vels = np.apply_along_axis( np.linalg.norm, 1, cluster_vels )

  cluster_sizes = np.bincount(labels)
  # Number of clusters for given size
  size_hist = np.bincount( cluster_sizes )
  # Speed as a function of size
  speed_size = np.zeros( len(size_hist) )
  # Iterate through clusters
  for c in range(n_clusters_):
    speed_size[ cluster_sizes[c] ] = np.add(speed_size[ cluster_sizes[c] ], normed_vels[c])

  # Average
  speed_size = np.divide(speed_size[1:], size_hist[1:])

  return speed_size
Example #43
def run_cluster(compl, qfib, qsym, cfg, min_samples=None, compl_thresh=None, radius=None):
    """
    """
    algorithm = cfg.find_orientations.clustering.algorithm

    # check for override on completeness threshold
    if compl_thresh is not None:
        min_compl = compl_thresh

    # check for override on radius
    if radius is not None:
        cl_radius = radius

    start = time.clock() # time this

    num_above = sum(np.array(compl) > min_compl)
    if num_above == 0:
        # nothing to cluster
        qbar = cl = np.array([])
    elif num_above == 1:
        # short circuit
        qbar = qfib[:, np.array(compl) > min_compl]
        cl = [1]
    else:
        # use compiled module for distance
        # just to be safe, must order qsym as C-contiguous
        qsym  = np.array(qsym.T, order='C').T
        def quat_distance(x, y):
            return xfcapi.quat_distance(np.array(x, order='C'), np.array(y, order='C'), qsym)

        qfib_r = qfib[:, np.array(compl) > min_compl]

        num_ors = qfib_r.shape[1]

        if num_ors > 25000:
            if algorithm == 'sph-dbscan' or algorithm == 'fclusterdata':
                logger.info("falling back to euclidean DBSCAN")
                algorithm = 'ort-dbscan'
            #raise RuntimeError, \
            #    "Requested clustering of %d orientations, which would be too slow!" %qfib_r.shape[1]

        logger.info(
            "Feeding %d orientations above %.1f%% to clustering",
            num_ors, 100*min_compl
            )

        if algorithm == 'dbscan' and not have_sklearn:
            algorithm = 'fclusterdata'
            logger.warning(
                "sklearn >= 0.14 required for dbscan; using fclusterdata"
                )

        if algorithm == 'dbscan' or algorithm == 'ort-dbscan' or algorithm == 'sph-dbscan':
            # munge min_samples according to options
            if min_samples is None or cfg.find_orientations.use_quaternion_grid is not None:
                min_samples = 1

            if algorithm == 'sph-dbscan':
                logger.info("using spherical DBSCAN")
                # compute distance matrix
                pdist = pairwise_distances(
                    qfib_r.T, metric=quat_distance, n_jobs=1
                    )

                # run dbscan
                core_samples, labels = dbscan(
                    pdist,
                    eps=np.radians(cl_radius),
                    min_samples=min_samples,
                    metric='precomputed'
                    )
            else:
                if algorithm == 'ort-dbscan':
                    logger.info("using euclidean orthographic DBSCAN")
                    pts = qfib_r[1:, :].T
                    eps = 0.25*np.radians(cl_radius)
                else:
                    logger.info("using euclidean DBSCAN")
                    pts = qfib_r.T
                    eps = 0.5*np.radians(cl_radius)

                # run dbscan
                core_samples, labels = dbscan(
                    pts,
                    eps=eps,
                    min_samples=min_samples,
                    metric='minkowski', p=2,
                    )

            # extract cluster labels
            cl = np.array(labels, dtype=int) # convert to array
            noise_points = cl == -1 # index for marking noise
            cl += 1 # move index to 1-based instead of 0
            cl[noise_points] = -1 # re-mark noise as -1
            logger.info("dbscan found %d noise points", sum(noise_points))
        elif algorithm == 'fclusterdata':
            logger.info("using spherical fclusetrdata")
            cl = cluster.hierarchy.fclusterdata(
                qfib_r.T,
                np.radians(cl_radius),
                criterion='distance',
                metric=quat_distance
                )
        else:
            raise RuntimeError(
                "Clustering algorithm %s not recognized" % algorithm
                )

        # extract number of clusters
        if np.any(cl == -1):
            nblobs = len(np.unique(cl)) - 1
        else:
            nblobs = len(np.unique(cl))

        """ PERFORM AVERAGING TO GET CLUSTER CENTROIDS """
        qbar = np.zeros((4, nblobs))
        for i in range(nblobs):
            npts = sum(cl == i + 1)
            qbar[:, i] = rot.quatAverageCluster(
                qfib_r[:, cl == i + 1].reshape(4, npts), qsym
            ).flatten()
            pass
        pass

    if (algorithm == 'dbscan' or algorithm == 'ort-dbscan') \
      and qbar.size/4 > 1:
        logger.info("\tchecking for duplicate orientations...")
        cl = cluster.hierarchy.fclusterdata(
            qbar.T,
            np.radians(cl_radius),
            criterion='distance',
            metric=quat_distance)
        nblobs_new = len(np.unique(cl))
        if nblobs_new < nblobs:
            logger.info("\tfound %d duplicates within %f degrees" \
                        %(nblobs-nblobs_new, cl_radius))
            tmp = np.zeros((4, nblobs_new))
            for i in range(nblobs_new):
                npts = sum(cl == i + 1)
                tmp[:, i] = rot.quatAverageCluster(
                    qbar[:, cl == i + 1].reshape(4, npts), qsym
                ).flatten()
                pass
            qbar = tmp
            pass
        pass

    logger.info("clustering took %f seconds", time.clock() - start)
    logger.info(
        "Found %d orientation clusters with >=%.1f%% completeness"
        " and %2f misorientation",
        qbar.size/4,
        100.*min_compl,
        cl_radius
        )

    return np.atleast_2d(qbar), cl
Example #44
def clusterData(X):
	original=X
	X = StandardScaler().fit_transform(X)
	coreSamples, labels = dbscan(X, min_samples=5, eps=.07, p=4)
	#return original, labels
	return rearrangeLabels(original, labels)
Example #45

from Levenshtein import * 
  

import numpy as np

from sklearn.cluster import dbscan
data = ["ACCTCCTAGAAG", "ACCTACTAGAAGTT", "GAATATTAGGCCGA"]
def lev_metric(x, y):
     i, j = int(x[0]), int(y[0])     # extract indices
     return levenshtein(data[i], data[j])

X = np.arange(len(data)).reshape(-1, 1)
print(X)
#array([[0],
#       [1],
#       [2]])
dbscan(X, metric=lev_metric, eps=5, min_samples=2) 
#([0, 1], array([ 0,  0, -1]))
Example #46
def build_overlap_table(cfg, tol_mult=0.5):

    icfg = get_instrument_parameters(cfg)

    gt = np.loadtxt(
        os.path.join(cfg.analysis_dir, 'grains.out')
    )

    ngrains = len(gt)

    mat_list = cPickle.load(open(cfg.material.definitions, 'r'))
    mat_names = [mat_list[i].name for i in range(len(mat_list))]
    mat_dict = dict(zip(mat_names, mat_list))

    matl = mat_dict[cfg.material.active]

    pd = matl.planeData
    pd.exclusions = np.zeros(len(pd.exclusions), dtype=bool)
    pd.tThMax = np.radians(cfg.fit_grains.tth_max)
    pd.tThWidth = np.radians(cfg.fit_grains.tolerance.tth[-1])

    # for clustering...
    eps = tol_mult*np.radians(
        min(
            min(cfg.fit_grains.tolerance.eta),
            2*min(cfg.fit_grains.tolerance.omega)
        )
    )

    # merged two-theta indices
    tth_ranges_merged = pd.getMergedRanges()[0]
    pids = []
    for hklids in tth_ranges_merged:
        pids.append(
            [pd.hklDataList[hklids[i]]['hklID'] for i in range(len(hklids))]
        )

    # Make table of unit diffraction vectors
    st = []
    for i in range(ngrains):
        this_st = np.loadtxt(
            os.path.join(cfg.analysis_dir, 'spots_%05d.out' %i)
            )
        #... do all predicted?
        valid_spt = this_st[:, 0] >= 0
        #valid_spt = np.ones(len(this_st), dtype=bool)

        angs = this_st[valid_spt, 7:10]

        dvec = xfcapi.anglesToDVec(
            angs,
            chi=icfg['oscillation_stage']['chi']
        )

        # [ grainID, reflID, hklID, D_s[0], D_s[1], D_s[2], tth, eta, ome ]
        st.append(
            np.hstack([
                i*np.ones((sum(valid_spt), 1)),
                this_st[valid_spt, :2],
                dvec,
                angs,
            ])
        )

    # make overlap table
    # [[range_0], [range_1], ..., [range_n]]
    # range_0 = [grainIDs, reflIDs, hklIDs] that are within tol
    overlap_table = []
    ii = 0
    for pid in pids:
        print "processing ring set %d" %ii
        start0 = time.clock()
        tmp = []; a = []; b = []; c = []
        for j in range(len(pid)):
            a.append(
                np.vstack(
                    [st[i][st[i][:, 2] == pid[j], 3:6] for i in range(len(st))]
                )
            )
            b.append(
                np.vstack(
                    [st[i][st[i][:, 2] == pid[j], 0:3] for i in range(len(st))]
                )
            )
            c.append(
                np.vstack(
                    [st[i][st[i][:, 2] == pid[j], 6:9] for i in range(len(st))]
                )
            )
            pass
        a = np.vstack(a)  # unit diffraction vectors in sample frame
        b = np.vstack(b)  # [grainID, reflID, hklID]
        c = np.vstack(c)  # predicted angles [tth, eta, ome]
        if len(a) > 0:
            # run dbscan
            core_samples, labels = dbscan(
                a,
                eps=eps,
                min_samples=2,
                metric='minkowski', p=2,
            )
            cl, nblobs = postprocess_dbscan(labels)
            elapsed0 = time.clock() - start0
            print "\tdbscan took %.2f seconds" % elapsed0
            # import pdb; pdb.set_trace()
            print "\tcollapsing incidentals for %d candidates..." %nblobs
            start1 = time.clock()                      # time this
            for i in range(1, nblobs+1):
                # put in check on omega here
                these_angs = c[np.where(cl == i)[0], :]
                # local_cl = cluster.hierarchy.fclusterdata(
                #     these_angs[:, 1:],
                #     eps,
                #     criterion='distance',
                #     metric=adist
                #     )
                # local_nblobs = len(np.unique(local_cl))
                _, local_labels = dbscan(
                    these_angs[:, 1:],
                    eps=eps,
                    min_samples=2,
                    metric=adist,
                    n_jobs=-1,
                )
                local_cl, local_nblobs = postprocess_dbscan(local_labels)

                if local_nblobs < len(these_angs):
                    for j in range(1, local_nblobs + 1):
                        npts = sum(local_cl == j)
                        if npts >= 2:
                            cl_idx = np.where(local_cl == j)[0]
                            #import pdb; pdb.set_trace()
                            tmp.append(
                                b[np.where(cl == i)[0][cl_idx], :]
                            )
            elapsed1 = time.clock() - start1
            print "\tomega filtering took %.2f seconds" %elapsed1
        ii += 1
        overlap_table.append(tmp)
    return overlap_table
Example #47
def run_cluster(compl, qfib, qsym, cfg):
    """
    """
    cl_radius = cfg.find_orientations.clustering.radius
    min_compl = cfg.find_orientations.clustering.completeness
    algorithm = cfg.find_orientations.clustering.algorithm

    start = time.clock() # time this

    num_above = sum(np.array(compl) > min_compl)
    if num_above == 0:
        # nothing to cluster
        qbar = cl = np.array([])
    elif num_above == 1:
        # short circuit
        qbar = qfib[:, np.array(compl) > min_compl]
        cl = [1]
    else:
        # use compiled module for distance
        # just to be safe, must order qsym as C-contiguous
        qsym  = np.array(qsym.T, order='C').T
        quat_distance = lambda x, y: xfcapi.quat_distance(
            np.array(x, order='C'),
            np.array(y, order='C'),
            qsym
            )

        qfib_r = qfib[:, np.array(compl) > min_compl]

        logger.info(
            "Feeding %d orientations above %.1f%% to clustering",
            qfib_r.shape[1], 100*min_compl
            )

        if algorithm == 'dbscan' and not have_sklearn:
            algorithm = 'fclusterdata'
            logger.warning(
                "sklearn >= 0.14 required for dbscan, using fclusterdata"
                )
        if algorithm == 'dbscan':
            pdist = pairwise_distances(
                qfib_r.T, metric=quat_distance, n_jobs=-1
                )
            core_samples, labels = dbscan(
                pdist,
                eps=np.radians(cl_radius),
                min_samples=1,
                metric='precomputed'
                )
            cl = np.array(labels, dtype=int) + 1
        elif algorithm == 'fclusterdata':
            cl = cluster.hierarchy.fclusterdata(
                qfib_r.T,
                np.radians(cl_radius),
                criterion='distance',
                metric=quat_distance
                )
        else:
            raise RuntimeError(
                "Clustering algorithm %s not recognized" % algorithm
                )

        nblobs = len(np.unique(cl))

        qbar = np.zeros((4, nblobs))
        for i in range(nblobs):
            npts = sum(cl == i + 1)
            qbar[:, i] = rot.quatAverage(
                qfib_r[:, cl == i + 1].reshape(4, npts), qsym
                ).flatten()

    logger.info("clustering took %f seconds", time.clock() - start)
    logger.info(
        "Found %d orientation clusters with >=%.1f%% completeness"
        " and %2f misorientation",
        qbar.size/4,
        100.*min_compl,
        cl_radius
        )

    return np.atleast_2d(qbar), cl
Example #48
def build_overlap_table(cfg, tol_mult=0.5):
    
    icfg = get_instrument_parameters(cfg)
    
    gt = np.loadtxt(
        os.path.join(cfg.analysis_dir, 'grains.out')
    )
    
    ngrains = len(gt)
    
    mat_list = cPickle.load(open(cfg.material.definitions, 'r'))
    mat_names = [mat_list[i].name for i in range(len(mat_list))]
    mat_dict = dict(zip(mat_names, mat_list))
    
    matl = mat_dict[cfg.material.active]
    
    pd = matl.planeData
    pd.exclusions = np.zeros(len(pd.exclusions), dtype=bool)
    pd.tThMax = np.radians(cfg.fit_grains.tth_max)
    pd.tThWidth = np.radians(cfg.fit_grains.tolerance.tth[-1])
    
    # for clustering...
    eps = tol_mult*np.radians(
        min(
            min(cfg.fit_grains.tolerance.eta), 
            2*min(cfg.fit_grains.tolerance.omega)
        )
    )

    # merged two-theta indices
    tth_ranges_merged = pd.getMergedRanges()[0]
    pids = []
    for hklids in tth_ranges_merged:
        pids.append(
            [pd.hklDataList[hklids[i]]['hklID'] for i in range(len(hklids))]
        )
        
    # Make table of unit diffraction vectors
    st = []
    for i in range(ngrains):
        this_st = np.loadtxt(
            os.path.join(cfg.analysis_dir, 'spots_%05d.out' %i)
            )
        #... do all predicted?
        valid_spt = this_st[:, 0] >= 0
        #valid_spt = np.ones(len(this_st), dtype=bool)

        angs = this_st[valid_spt, 7:10]

        dvec = xfcapi.anglesToDVec(
            angs, 
            chi=icfg['oscillation_stage']['chi']
        )

        # [ grainID, reflID, hklID, D_s[0], D_s[1], D_s[2], tth, eta, ome ]
        st.append(
            np.hstack([
                i*np.ones((sum(valid_spt), 1)),
                this_st[valid_spt, :2], 
                dvec, 
                angs,
            ])
        )

    # make overlap table
    # [[range_0], [range_1], ..., [range_n]]
    # range_0 = [grainIDs, reflIDs, hklIDs] that are within tol
    overlap_table = []
    ii = 0
    for pid in pids:
        tmp = []; a = []; b = []; c = []
        for j in range(len(pid)):
            a.append(
                np.vstack(
                    [st[i][st[i][:, 2] == pid[j], 3:6] for i in range(len(st))]
                )
            )
            b.append(
                np.vstack(
                    [st[i][st[i][:, 2] == pid[j], 0:3] for i in range(len(st))]
                )
            )
            c.append(
                np.vstack(
                    [st[i][st[i][:, 2] == pid[j], 6:9] for i in range(len(st))]
                )
            )
            pass
        a = np.vstack(a)
        b = np.vstack(b)
        c = np.vstack(c)    
        if len(a) > 0:
            # run dbscan
            core_samples, labels = dbscan(
                a,
                eps=eps,
                min_samples=2,
                metric='minkowski', p=2,
            )
            
            cl = np.array(labels, dtype=int) # convert to array
            noise_points = cl == -1 # index for marking noise
            cl += 1 # move index to 1-based instead of 0
            cl[noise_points] = -1 # re-mark noise as -1
            
            # extract number of clusters
            if np.any(cl == -1):
                nblobs = len(np.unique(cl)) - 1
            else:
                nblobs = len(np.unique(cl))
            
            for i in range(1, nblobs+1):
                # put in check on omega here
                these_angs = c[np.where(cl == i)[0], :]
                local_cl = cluster.hierarchy.fclusterdata(
                    these_angs[:, 1:],
                    eps,
                    criterion='distance',
                    metric=adist
                    )
                local_nblobs = len(np.unique(local_cl))
                if local_nblobs < len(these_angs):
                    for j in range(1, local_nblobs + 1):
                        npts = sum(local_cl == j)
                        if npts >= 2:
                            cl_idx = np.where(local_cl == j)[0]
                            #import pdb; pdb.set_trace()
                            tmp.append(
                                b[np.where(cl == i)[0][cl_idx], :]
                            )
        print "processing ring set %d" %ii
        ii += 1
        overlap_table.append(tmp)
    return overlap_table
Example #49
def clustering(lats, longs, timestamps, ID, timestmp, multiPDF=False):
    """
    Clusters the GPS coordinates using DBSCAN
    :param timestmp:                The timestamp
    :param ID:                      The ID
    :param timestamps:              The timestamps of the GPS coordinates
    :param lats:                    The latitudes
    :param longs:                   The longitudes
    :return:                        A success flag and the calculated distance
    """
    folder = "out/"
    plotDir = folder + "plots/Walking Test Analysis"

    R = 6371  # Radius of the earth in km
    cartesianX = []
    cartesianY = []
    cartesianZ = []

    for lat, long in zip(lats, longs):
        # Convert to cartesian coordinates
        x = R * cos(lat) * cos(long)
        y = R * cos(lat) * sin(long)
        z = R * sin(lat)
        cartesianX.append(x)
        cartesianY.append(y)
        cartesianZ.append(z)

    combined = np.vstack((cartesianX, cartesianY, cartesianZ)).T
    (core_samples, labels) = dbscan(combined, eps=0.5)
    grouped = zip(labels, core_samples)
    nonGroupedPositions = []

    for (label, core_sample) in grouped:
        if label != -1:
            lat = lats[core_sample]
            long = longs[core_sample]
            stamp = timestamps[core_sample]
            nonGroupedPositions.append((lat, long, stamp))

    if len(nonGroupedPositions) > 0:
        y = zip(*nonGroupedPositions)[0]  # the latitudes
        x = zip(*nonGroupedPositions)[1]  # the longitudes
        t = zip(*nonGroupedPositions)[2]  # the timestamps
        x2, y2, newx2, newy2 = smooth(y, x, t)

        plt.plot(y2, x2, label="Linear Interpolation")
        plt.plot(newy2, newx2, label="Savgol Filter", color="r")
        distance = calcDistanceWalked(newy2, newx2)
        grouped = sorted(grouped, key=itemgetter(0))

        clusters = {}
        labels = []
        for key, group in groupby(grouped, key=itemgetter(0)):
            # group the clusters based on their label
            labels.append(key)
            clusters[key] = [el[1] for el in group]

        noise = False
        colors = plt.get_cmap("Spectral")(np.linspace(0, 1, len(clusters)))
        for label in labels:
            indices = clusters[label]
            latitudes = []
            longitudes = []
            size = 10
            alpha = 0.5
            lineWidth = 0.15
            for i in indices:
                latitudes.append(lats[i])
                longitudes.append(longs[i])
            if label == -1:
                # outliers are identified with a label of -1
                plt.plot(latitudes, longitudes, "o", markerfacecolor=almost_black, markeredgecolor=almost_black,
                         markersize=size, alpha=alpha, linewidth=lineWidth, label="Outlier")
                noise = True
            else:
                plt.plot(latitudes, longitudes, "o", markerfacecolor=colors[label], markeredgecolor=almost_black,
                         markersize=size, alpha=alpha, linewidth=lineWidth, label="Cluster %i" % (label + 1))

        plt.title("Timestamp: %s\n Number of clusters: %i\n Calculated distance: %i meters" % (
            timestmp, (len(clusters) - 1) if noise else len(clusters), round(distance)))
        plt.xlabel("Latitude")
        plt.ylabel("Longitude")
        fancyPlot()
        writeToPdf(ID, plotDir)
        return True, distance
    else:
        # DBSCAN gave back an empty array, therefore we cannot perform any smoothing or distance calculation
        return False, 0
Example #50
res
res[0]
res.keys()
len(res['content'])
res['content'].keys()
res['content']['statuses'].keys()
len(res['content']['statuses'])
res['content']['statuses'][0]
res['content']['statuses'][0]['text']
res['content']['statuses'][1]['text']
res['content']['statuses'][2]['text']
newtext = "\n".join(x['text'] for x in res['content']['statuses'])
len(newtext)
newtext[:100]
newtext[:600]
print(newtext[:600])
import summsnippets as summ2
tok =summ2.tokenizes(newtext)
tups = summ2.pos_tag(tok)
tups[:10]
tups[:20]
a = summ.make_sent_objs(tups)
X = summ.build_sent_matrix(a)
db = dbscan(X, eps=summ.DEFAULT_EPS, min_samples=3, metric='cosine', algorithm='brute')[1]
from sklearn.cluster import dbscan
db = dbscan(X, eps=summ2.DEFAULT_EPS, min_samples=3, metric='cosine', algorithm='brute')[1]
db
from collections import Countern
from collections import Counter
Counter(db)
Example #51
import numpy as np
from sklearn import cluster

# exercise 7.3.4
k=3
samples=np.array([
    (4,10),(7,10),(4,8),(6,8),(3,4),(10,5),(12,6),(11,4),(2,2),(5,2),(9,3),(12,3),
        ])
res = cluster.k_means(samples, k)
labels=res[1]
clus = [samples[labels==i] for i in range(k)]
print 'cluster i, N, SUM, SUMSQ'
for i in range(k):
    print i, [(clus[i]**j).sum(axis=0) for j in range(3)]
    for j in range(2):
        t=clus[i][:,j]
        print np.var(t), np.std(t)

#density based scan
print cluster.dbscan(samples, eps=3,min_samples=2)
Example #52
def dbscan(
        threshold,
        matrix,
        taxa,
        revert=False,
        min_samples=1):
    """
    Compute DBSCAN cluster analysis.

    Parameters
    ----------
    threshold : float
        The threshold for clustering you want to use.
    matrix : list
        The two-dimensional matrix passed as list or array.
    taxa : list
        The list of taxon names. If set to "False" a fake list of taxon names
        will be created, giving a positive numerical ID in increasing order for
        each column in the matrix.
    revert : bool
        If set to "False", don't return taxon names but simply the language
        identifiers and their labels as a dictionary. Otherwise returns a
        dictionary with labels as keys and list of taxon names as values.
    min_samples : int (default=1)
        The minimal samples parameter of the DBSCAN method from the scikit-learn
        package.

    Returns
    -------
    clusters : dict
        Either a dictionary of taxon identifiers and labels, or a dictionary of
        labels and taxon names.

    Notes
    -----
    This method does not work as expected, probably because it normally
    requires distances between points as input. We list it here only for
    completeness, and urge caution when using it; check our implementation in
    the source code carefully.

    Requires the scikit-learn package, downloadable from http://scikit-learn.org/.
    """
    if not cluster:
        raise ValueError("The package sklearn is needed to run this analysis.")

    if not taxa:
        taxa = list(range(1, len(matrix) + 1))

    core_samples, labels = cluster.dbscan(
        matrix, eps=threshold, min_samples=min_samples, metric='precomputed')

    # change to our internal cluster style
    idx = max(labels) + 1
    if idx == 0:
        idx += 1
    for i, c in enumerate(labels):
        if c == -1:
            labels[i] = idx
            idx += 1

    # check for revert
    if revert:
        return dict(zip(range(len(taxa)), labels))

    clr = defaultdict(list)
    for i, t in enumerate(taxa):
        clr[labels[i]] += [t]
    return clr
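A short usage sketch for the wrapper above, with a hand-made symmetric distance matrix and invented taxon names (it assumes the module's own imports, i.e. sklearn's cluster module and collections.defaultdict, are available):

matrix = [
    [0.0, 0.1, 0.9],
    [0.1, 0.0, 0.8],
    [0.9, 0.8, 0.0],
]
taxa = ['German', 'Dutch', 'Hungarian']

clusters = dbscan(0.3, matrix, taxa)
print(clusters)  # defaultdict with {0: ['German', 'Dutch'], 1: ['Hungarian']}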