Example #1
def test_radius_neighbors_boundary_handling():
    X = [[0.999, 0.001], [0.5, 0.5], [0, 1.], [-1., 0.001]]
    n_points = len(X)

    # Build an exact nearest neighbors model as a reference to ensure
    # consistency between the exact and approximate methods
    nnbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)

    # Build a LSHForest model with hyperparameter values that always guarantee
    # exact results on this toy dataset.
    lsfh = ignore_warnings(LSHForest, category=DeprecationWarning)(
        min_hash_match=0, n_candidates=n_points, random_state=42).fit(X)

    # define a query aligned with the first axis
    query = [[1., 0.]]

    # Compute the exact cosine distances of the query to the four points of
    # the dataset
    dists = pairwise_distances(query, X, metric='cosine').ravel()

    # The first point is almost aligned with the query (very small angle),
    # the cosine distance should therefore be almost null:
    assert_almost_equal(dists[0], 0, decimal=5)

    # The second point forms an angle of 45 degrees with the query vector
    assert_almost_equal(dists[1], 1 - np.cos(np.pi / 4))

    # The third point is orthogonal to the query vector, hence at a distance
    # of exactly one:
    assert_almost_equal(dists[2], 1)

    # The last point is almost collinear with the query but with opposite
    # sign, therefore its cosine 'distance' is very close to the maximum
    # possible value of 2.
    assert_almost_equal(dists[3], 2, decimal=5)

    # If we query with a radius of one, all the samples except the last sample
    # should be included in the results. This means that the third sample
    # is lying on the boundary of the radius query:
    exact_dists, exact_idx = nnbrs.radius_neighbors(query, radius=1)
    approx_dists, approx_idx = lsfh.radius_neighbors(query, radius=1)

    assert_array_equal(np.sort(exact_idx[0]), [0, 1, 2])
    assert_array_equal(np.sort(approx_idx[0]), [0, 1, 2])
    assert_array_almost_equal(np.sort(exact_dists[0]), dists[:-1])
    assert_array_almost_equal(np.sort(approx_dists[0]), dists[:-1])

    # If we perform the same query with a slightly lower radius, the third
    # point of the dataset, which lies on the boundary of the previous query,
    # is now rejected:
    eps = np.finfo(np.float64).eps
    exact_dists, exact_idx = nnbrs.radius_neighbors(query, radius=1 - eps)
    approx_dists, approx_idx = lsfh.radius_neighbors(query, radius=1 - eps)

    assert_array_equal(np.sort(exact_idx[0]), [0, 1])
    assert_array_equal(np.sort(approx_idx[0]), [0, 1])
    assert_array_almost_equal(np.sort(exact_dists[0]), dists[:-2])
    assert_array_almost_equal(np.sort(approx_dists[0]), dists[:-2])
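A standalone sketch (not part of the original test suite) of the boundary behaviour the test above relies on: radius_neighbors treats the radius as an inclusive bound, so a point at exactly the query radius is returned, and shrinking the radius by one machine epsilon drops it.

import numpy as np
from sklearn.neighbors import NearestNeighbors

pts = np.array([[0.0], [1.0], [3.0]])
nn = NearestNeighbors(metric='euclidean').fit(pts)

# The point at distance exactly 1.0 is included when radius=1.0 ...
print(nn.radius_neighbors([[0.0]], radius=1.0, return_distance=False)[0])   # e.g. [0 1]

# ... and excluded once the radius is reduced by one machine epsilon.
eps = np.finfo(np.float64).eps
print(nn.radius_neighbors([[0.0]], radius=1.0 - eps, return_distance=False)[0])   # e.g. [0]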
from collections import defaultdict

import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.tree import DecisionTreeRegressor


class NNLR:

    def __init__(self, k=5, rad=2, mode='k', feat_names=None):
        self.mode = mode
        self.k = k
        self.NN = NearestNeighbors(n_neighbors=k, radius=rad)
        
    def fit(self, X, Y):
        self.X = X
        self.Y = Y
        self.NN.fit(X)
        self.active = defaultdict(int)

    def nn_lin(self, testX, neighbors):
        # Fit a local regressor on the neighbors and predict the query point.
        # return np.mean(self.Y[neighbors])  # simpler fallback: mean of the neighbors' targets
        l = DecisionTreeRegressor()
        l.fit(self.X[neighbors], self.Y[neighbors])
        # for idx in np.where(l.coef_)[0]:
        #     self.active[idx] += 1
        return l.predict([testX])[0]

    def predict(self, X):
        if self.mode == 'k':
            neighbors = self.NN.kneighbors(X)[1]
        elif self.mode == 'rad':
            neighbors = self.NN.radius_neighbors(X)[1]
        return np.array([self.nn_lin(Xtst, nbr) for (Xtst, nbr) in zip(X, neighbors)])
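A hypothetical usage sketch for the NNLR helper above (random data, assuming the imports shown with the class):

rng = np.random.RandomState(0)
X_train = rng.rand(100, 3)
y_train = X_train @ np.array([1.0, -2.0, 0.5]) + 0.1 * rng.randn(100)

model = NNLR(k=5, rad=0.5, mode='k')
model.fit(X_train, y_train)
preds = model.predict(rng.rand(10, 3))   # one local tree per query point
print(preds.shape)   # (10,)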
Example #3
def _wpca_analysis(L, C, intensities):
    """
    Determine the eccentricity of each cluster using weighted PCA (see
    Jolliffe 2002, 14.2.1). The smallest normalized explained variance
    is small for flat or filiform objects.

    - L is a numpy matrix (one point on each row)
    - C is the list of cluster centers (objects with x, y, z attributes)
    - intensities are the gray levels of each point

    No cluster assignment is used here: a ball of radius 10 around each
    center is used to find the cloud of points.
    """
    np.set_printoptions(threshold=50000)
    n_points, n_features = L.shape
    tee.log('WPCA - Fitting NearestNeighbors on', n_points, 'points')
    nbrs = NearestNeighbors(radius=10.0).fit(L)
    for i, c in enumerate(C):
        array_c = np.array([c.x, c.y, c.z])
        i_nbrs = nbrs.radius_neighbors([array_c], 10.0, return_distance=False)[0]
        points_within = L[i_nbrs]
        if len(points_within) < 64:  # too small set, there is no point in running PCA
            c.EVR = [0.499, 0.499, 0.002]
            c.last_variance = c.EVR[2]
        else:
            w = np.sqrt(intensities[i_nbrs]/255.0)
            wX = np.dot(np.diag(w), points_within)
            pca = sklearn.decomposition.PCA(n_components=3)
            X_r = pca.fit(wX).transform(wX)
            c.EVR = pca.explained_variance_ratio_
            c.last_variance = c.EVR[2]
        print('WPCA done on', i, '/', len(C), 'name=', c.name, 'EVR=', c.EVR)
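A self-contained sketch (synthetic data, not from the original project) of the weighting trick used above: rows are scaled by the square root of their weights before a plain PCA, so the explained-variance ratios reflect a weighted covariance and the last ratio stays small for flat clouds.

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
points = rng.randn(500, 3) * np.array([5.0, 5.0, 0.1])   # nearly flat slab
weights = rng.rand(500) * 255.0                           # fake gray levels

w = np.sqrt(weights / 255.0)
wX = points * w[:, None]          # same effect as np.dot(np.diag(w), points)

pca = PCA(n_components=3).fit(wX)
print(pca.explained_variance_ratio_)   # last component close to 0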
def test_radius_neighbors():
    # Checks whether the returned distances are less than `radius`.
    # At least one point should be returned when `radius` is set to the
    # mean distance from the query point to the other points in the
    # database.
    # Moreover, this test compares the radius neighbors of LSHForest
    # with those of `sklearn.neighbors.NearestNeighbors`.
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = ignore_warnings(LSHForest, category=DeprecationWarning)()
    # Test unfitted estimator
    assert_raises(ValueError, lshf.radius_neighbors, X[0])

    ignore_warnings(lshf.fit)(X)

    for i in range(n_iter):
        # Select a random point in the dataset as the query
        query = X[rng.randint(0, n_samples)].reshape(1, -1)

        # At least one neighbor should be returned when the radius is the
        # mean distance from the query to the points of the dataset.
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        neighbors = lshf.radius_neighbors(query, radius=mean_dist,
                                          return_distance=False)

        assert_equal(neighbors.shape, (1,))
        assert_equal(neighbors.dtype, object)
        assert_greater(neighbors[0].shape[0], 0)
        # All distances to points in the results of the radius query should
        # be less than mean_dist
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_array_less(distances[0], mean_dist)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.radius_neighbors(queries,
                                                 return_distance=True)

    # dists and inds should not be 1D arrays or arrays of variable lengths
    # hence the use of the object dtype.
    assert_equal(distances.shape, (n_queries,))
    assert_equal(distances.dtype, object)
    assert_equal(neighbors.shape, (n_queries,))
    assert_equal(neighbors.dtype, object)

    # Compare with exact neighbor search
    query = X[rng.randint(0, n_samples)].reshape(1, -1)
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)

    distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)
    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)

    # Radius-based queries do not sort the result points, so sort before
    # comparing. Exact distances should be less than or equal to the
    # approximate ones, as the approximate query may miss closer neighbors.
    assert_true(np.all(np.less_equal(np.sort(distances_exact[0]),
                                     np.sort(distances_approx[0]))))
Example #5
def mean_shift(X, bandwidth, n_seeds, kernel_function='gaussian', max_iterations=100, proximity_thresh=5):
    '''
    ---Parameters---
    X : data in form (samples, dims)
    bandwidth : radius used for the nearest-neighbor queries
    n_seeds : number of random points drawn from X to seed the mean-shift iterations
    kernel_function : "gaussian", "flat", or a custom kernel callable
    max_iterations : maximum number of shifts per seed
    proximity_thresh : minimum distance (in pixels) a new cluster must be away from previous ones

    ---Returns---
    cluster_centers : converged kernel locations kept as cluster centers
    cluster_counts : how many pixels are within the neighborhood of each cluster
    '''

    import numpy as np
    from sklearn.neighbors import BallTree, NearestNeighbors
    from sklearn.utils import extmath
    from sklearn.metrics.pairwise import euclidean_distances
    from collections import defaultdict 

    if kernel_function == 'gaussian':
        kernel_update_function = gaussian_kernel
    elif kernel_function == 'flat':
        kernel_update_function = flat_kernel
    else:
        kernel_update_function = kernel_function


    n_points, n_features = X.shape
    stop_thresh = 1e-2 * bandwidth # when mean has converged                                                                                                               
    cluster_centers = []
    cluster_counts = [] 
    # ball_tree = BallTree(X)# to efficiently look up nearby points
    neighbors = NearestNeighbors(radius=bandwidth).fit(X)

    seeds = X[(np.random.uniform(0, X.shape[0], n_seeds)).astype(int)]
 
    # For each seed, climb gradient until convergence or max_iterations                                                                                                     
    for weighted_mean in seeds:
         completed_iterations = 0
         while True:
             points_within = X[neighbors.radius_neighbors([weighted_mean], bandwidth, return_distance=False)[0]]
             old_mean = weighted_mean  # save the old mean                                                                                                                  
             weighted_mean = kernel_update_function(old_mean, points_within, bandwidth)
             converged = extmath.norm(weighted_mean - old_mean) < stop_thresh
             if converged or completed_iterations == max_iterations:
                # Only add cluster if it's different enough from other centers
                if len(cluster_centers) > 0:
                    diff_from_prev = [np.linalg.norm(weighted_mean-cluster_centers[i], 2) for i in range(len(cluster_centers))]
                    if np.min(diff_from_prev) > proximity_thresh:
                        cluster_centers.append(weighted_mean)
                        cluster_counts.append(points_within.shape[0])
                else:
                    cluster_centers.append(weighted_mean)
                    cluster_counts.append(points_within.shape[0])
                break
             completed_iterations += 1
 
    return cluster_centers, cluster_counts
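A hypothetical call to the mean_shift helper above. Passing a custom kernel callable avoids the gaussian_kernel/flat_kernel helpers that are not defined in this snippet; note the function also relies on sklearn.utils.extmath.norm, so it assumes an older scikit-learn release that still provides it.

import numpy as np

def my_flat_kernel(old_mean, points_within, bandwidth):
    # Flat (uniform) kernel: the update is just the mean of the points
    # inside the bandwidth ball.
    return np.mean(points_within, axis=0)

rng = np.random.RandomState(0)
data = np.vstack([rng.randn(200, 2), rng.randn(200, 2) + 10.0])

centers, counts = mean_shift(data, bandwidth=2.0, n_seeds=20,
                             kernel_function=my_flat_kernel,
                             proximity_thresh=1.0)
print(len(centers), counts)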
def test_radius_neighbors():
    """Checks whether Returned distances are less than `radius`

    At least one point should be returned when the `radius` is set
    to mean distance from the considering point to other points in
    the database.
    Moreover, this test compares the radius neighbors of LSHForest
    with the `sklearn.neighbors.NearestNeighbors`.
    """
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest()
    # Test unfitted estimator
    assert_raises(ValueError, lshf.radius_neighbors, X[0])

    lshf.fit(X)

    for i in range(n_iter):
        query = X[rng.randint(0, n_samples)]
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        neighbors = lshf.radius_neighbors(query, radius=mean_dist,
                                          return_distance=False)
        # At least one neighbor should be returned.
        assert_greater(neighbors.shape[0], 0)
        # All distances should be less than mean_dist
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_array_less(distances[0], mean_dist)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.radius_neighbors(queries,
                                                 return_distance=True)
    assert_equal(neighbors.shape[0], n_queries)
    assert_equal(distances.shape[0], n_queries)
    # dists and inds should not be 2D arrays
    assert_equal(distances.ndim, 1)
    assert_equal(neighbors.ndim, 1)

    # Compare with exact neighbor search
    query = X[rng.randint(0, n_samples)]
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine')
    nbrs.fit(X)

    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)
    distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)
    # Distances of exact neighbors are less than or equal to the approximate ones
    assert_true(np.all(np.less_equal(np.sort(distances_exact[0]),
                                     np.sort(distances_approx[0]))))
Example #7
 def compute(self):
     nn = NearestNeighbors(radius=self.eps, algorithm='auto', metric=self.metric).fit(self.x)
     self.distances, self.indices = nn.radius_neighbors(self.x, self.eps)
     print(self.distances.shape, self.indices.shape)
     for i in range(self.n):
         if not self.processed[i]:
             self.expand_cluster_order(i)
     assert self.ordered_file_index == self.n
     # print self.ordered_file
     self.draw_reachability_plot()
     return self.ordered_file
    def build(self,tweets,minimalTermPerTweet=5, remove_noise_with_poisson_Law=False) :
        """
        Return an upper sparse triangular matrix of similarity j>i
        """
        timeThreshold=float(self.timeThreshold)
        distanceThreshold=float(self.distanceThreshold)
        useOnlyHashtags=self.useOnlyHashtags
        numberOfTweets=len(tweets)

        M=dok_matrix((numberOfTweets, numberOfTweets),dtype=float)
        print("      Calculating TF-IDF vectors ...")
        TFIDFVectors,TweetPerTermMap=getTweetsTFIDFVectorAndNorm(tweets, minimalTermPerTweet=minimalTermPerTweet, remove_noise_with_poisson_Law=remove_noise_with_poisson_Law,useOnlyHashtags=useOnlyHashtags)
        print("      Constructing similarity matrix ...")

        distanceThresholdInDegree=distanceThreshold/DEG_LATITUDE_IN_METER
        spatialIndex=NearestNeighbors(radius=distanceThresholdInDegree, algorithm='auto')
        spatialIndex.fit(np.array([(tweet.position.latitude,tweet.position.longitude) for tweet in tweets]))

        SHOW_RATE=100

        for i in range(numberOfTweets) :
            if (i%SHOW_RATE==0) : print("\t", i, ";")
            
            tweetI,TFIDFVectorI=tweets[i],TFIDFVectors[i]
            neighboors=set()
            
            #Retrieve neighbors by terms (tweets sharing at least one term in common)
            TFIDFVectorIKeySet=set(TFIDFVectorI)
            for term in TFIDFVectorIKeySet : neighboors|=TweetPerTermMap[term]

            #Retrieve spatial neighbors (tweets within the self.distanceThreshold neighborhood)
            position=np.array([tweetI.position.latitude,tweetI.position.longitude]).reshape(-1,2)
            neighboors&=set(spatialIndex.radius_neighbors(position)[1][0])

            for j in neighboors :
                tweetJ=tweets[j]

                """
                Ignorer les tweets qui ne sont pas apres le tweetI
                Ignorer les tweets qui ne sont pas dans le voisinage temporelle du tweetI
                """
                if (j<=i or tweetJ.delay(tweetI)>self.timeThreshold) : continue
                
                TFIDFVectorJ=TFIDFVectors[j]
                TFIDFVectorJKeySet=set(TFIDFVectorJ)
                keysIntersection=TFIDFVectorIKeySet & TFIDFVectorJKeySet
                similarity=0
                for term in keysIntersection : similarity+=TFIDFVectorI[term]*TFIDFVectorJ[term]
                M[i,j]=similarity

        return coo_matrix(M)
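A standalone sketch (synthetic coordinates, assumed value for the project constant DEG_LATITUDE_IN_METER) of the unit conversion used above: the metric distance threshold is converted to a latitude-degree radius before the radius query, since the index is built directly on (latitude, longitude) pairs and, like the original, treats degrees as Euclidean coordinates.

import numpy as np
from sklearn.neighbors import NearestNeighbors

DEG_LATITUDE_IN_METER = 111000.0   # rough metres per degree of latitude (assumed)

positions = np.array([[48.8566, 2.3522],    # Paris
                      [48.8570, 2.3530],    # a few hundred metres away
                      [45.7640, 4.8357]])   # Lyon, far outside the radius
index = NearestNeighbors(radius=500.0 / DEG_LATITUDE_IN_METER).fit(positions)

query = positions[0].reshape(1, -1)
print(index.radius_neighbors(query, return_distance=False)[0])   # e.g. [0 1]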
Example #9
def getSim_dense(day, centroids, dataset, thred_radius_dist, vdw, rel_dw):
    print "## Begin calculating centroid dataset sim.", len(centroids), dataset.shape
    dataset_vdw = dataset[range(vdw[0], vdw[1]),:]

    if 1:  # radius query via NearestNeighbors (enabled path)
        nnModel = NearestNeighbors(radius=thred_radius_dist, algorithm='brute', metric='minkowski', p=2, n_jobs=1)
        num_centroids = len(centroids)
        #allData = np.append(centroids, dataset, axis=0)
        nnModel.fit(dataset)
        ngIdxArray = nnModel.radius_neighbors(centroids, thred_radius_dist, return_distance=False)
    if 0:  # manual brute-force alternative (disabled)
        ngIdxArray = []
        for vecId, vec in enumerate(centroids):#.reshape(1, -1).tolist()
            distArr = euclidean_distances(np.array([vec]), dataset_vdw)
            nn_keys = [i+vdw[0] for i, eu in enumerate(distArr[0]) if eu <= thred_radius_dist]
            ngIdxArray.append(np.asarray(nn_keys, dtype=np.int32))
        ngIdxArray = np.asarray(ngIdxArray)

    print "## nn cal completed", time.asctime()
    return ngIdxArray
Example #10
def _finalize_masses(X, C, intensities):
    """
    Regardless of the parameters of the algorithm, place a ball of
    radius 10 around each center and compute the mass in the ball.
    Rationale: the threshold for discriminating between centers and
    non-centers should not depend on the parameters used to seek the centers.
    This mass will later be used for the recall-precision curve.
    Hopefully this reduces wild variations of performance across
    different substacks.
    """
    n_points, n_features = X.shape
    tee.log('Finalizing masses - Fitting NearestNeighbors on', n_points, 'points')
    nbrs = NearestNeighbors(radius=10.0).fit(X)

    for c in C:
        array_c = np.array([c.x, c.y, c.z])
        i_nbrs = nbrs.radius_neighbors([array_c], 10.0, return_distance=False)[0]
        points_within = X[i_nbrs]
        if len(points_within) == 0:
            break
        c.mass = sum(intensities[i_nbrs])
Example #11
File: mi.py Project: bacalfa/mifs
def _mi_dc(x, y, k):
    """
    Calculates the mutual information between a continuous vector x and a
    discrete class vector y.

    This implementation can calculate the MI between the joint distribution of
    one or more continuous variables (X[:, 1:3]) with a discrete variable (y).

    Thanks to Adam Pocock, the author of the FEAST package for the idea.

    Brian C. Ross, 2014, PLOS ONE
    Mutual Information between Discrete and Continuous Data Sets
    """

    y = y.flatten()
    n = x.shape[0]
    classes = np.unique(y)
    knn = NearestNeighbors(n_neighbors=k)
    # distance to kth in-class neighbour
    d2k = np.empty(n)
    # number of points within each point's class
    Nx = []
    for yi in y:
        Nx.append(np.sum(y == yi))

    # find the distance of the kth in-class point
    for c in classes:
        mask = np.where(y == c)[0]
        knn.fit(x[mask, :])
        d2k[mask] = knn.kneighbors()[0][:, -1]

    # find the number of points within the distance of the kth in-class point
    knn.fit(x)
    m = knn.radius_neighbors(radius=d2k, return_distance=False)
    m = [i.shape[0] for i in m]

    # calculate MI based on Equation 2 in Ross 2014
    MI = psi(n) - np.mean(psi(Nx)) + psi(k) - np.mean(psi(m))
    return MI
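A hypothetical call to _mi_dc on synthetic data (it assumes numpy, scipy.special.psi and sklearn.neighbors.NearestNeighbors are imported as in the surrounding module, and a scikit-learn version whose radius_neighbors accepts the array-valued radius used internally): a feature shifted by the class label should give a clearly positive MI estimate, while pure noise should give a value near zero.

import numpy as np

rng = np.random.RandomState(0)
n = 300
y = rng.randint(0, 2, size=n)                 # discrete class labels
x = rng.randn(n, 1) + 3.0 * y[:, None]        # continuous feature shifted by class

print(_mi_dc(x, y, k=3))                      # noticeably > 0 (nats)
print(_mi_dc(rng.randn(n, 1), y, k=3))        # close to 0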
Example #12
class NNR(object):
    def __init__(self, r=1.0, k=10, def_mean=0, def_sd=float('inf'), off=0.1):
        self.reg = KRadiusNeighborsRegressor(
            n_neighbors=k, radius=r, defval=def_mean, weights=self.comp_weights)
        # Neighbor index over X; squared errors of its neighbors are
        # averaged to estimate the local variance in predict().
        self.err_reg = NearestNeighbors(n_neighbors=k, radius=r)
        self.off = off
        self.def_sd = float(def_sd)

    def comp_weights(self, dists):
        return 1.0 / (dists + self.off)

    def fit(self, X, y):
        self.reg.fit(X, y)
        errs = y - self.reg.predict(X)
        self.sse = errs * errs
        self.err_reg.fit(X)

    def predict(self, X, return_std=False):
        if not return_std:
            return self.reg.predict(X)
        else:
            mean_val = self.reg.predict(X)

            sds = []
            dists, inds = self.err_reg.radius_neighbors(
                X, return_distance=True)
            for d, i in zip(dists, inds):
                if len(d) < 2:
                    sds.append(self.def_sd)
                    continue
                else:
                    errs = self.sse[i]
                    weights = self.comp_weights(d)
                    sd = np.average(errs, axis=0, weights=weights) / len(d)
                    sds.append(sd)

            return mean_val, np.array(sds)
Example #13
def dbscan(X, eps, min_samples, mode, visualize,metric='minkowski',
           algorithm='auto', leaf_size=30, p=2, random_state=None):
    """Perform DBSCAN clustering from vector array or distance matrix.

Parameters
----------
X: array [n_samples, n_samples] or [n_samples, n_features]
Array of distances between samples, or a feature array.
The array is treated as a feature array unless the metric is given as
'precomputed'.

eps: float, optional
The maximum distance between two samples for them to be considered
as in the same neighborhood.

min_samples: int, optional
The number of samples in a neighborhood for a point to be considered
as a core point.

metric: string, or callable
The metric to use when calculating distance between instances in a
feature array. If metric is a string or callable, it must be one of
the options allowed by metrics.pairwise.calculate_distance for its
metric parameter.
If metric is "precomputed", X is assumed to be a distance matrix and
must be square.

algorithm: {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
The algorithm to be used by the NearestNeighbors module
to compute pointwise distances and find nearest neighbors.
See NearestNeighbors module documentation for details.

leaf_size: int, optional (default = 30)
Leaf size passed to BallTree or cKDTree. This can affect the speed
of the construction and query, as well as the memory required
to store the tree. The optimal value depends
on the nature of the problem.

p: float, optional
The power of the Minkowski metric to be used to calculate distance
between points.

random_state: numpy.RandomState, optional
The generator used to initialize the centers. Defaults to numpy.random.

Returns
-------
core_samples: array [n_core_samples]
Indices of core samples.

labels : array [n_samples]
Cluster labels for each point. Noisy samples are given the label -1.

Notes
-----
See examples/cluster/plot_dbscan.py for an example.

References
----------
Ester, M., H. P. Kriegel, J. Sander, and X. Xu, “A Density-Based
Algorithm for Discovering Clusters in Large Spatial Databases with Noise”.
In: Proceedings of the 2nd International Conference on Knowledge Discovery
and Data Mining, Portland, OR, AAAI Press, pp. 226–231. 1996
"""
    if not eps > 0.0:
        raise ValueError("eps must be positive.")

    X = np.asarray(X)
    
    n = X.shape[0]

    # If index order not given, create random order.
    random_state = check_random_state(random_state)
    index_order = np.arange(n)
    random_state.shuffle(index_order)

    # check for known metric powers
    distance_matrix = True
    if metric == 'precomputed':
        D = pairwise_distances(X, metric=metric)
    else:
        distance_matrix = False
        neighbors_model = NearestNeighbors(radius=eps, algorithm=algorithm,
                                           leaf_size=leaf_size,
                                           metric=metric, p=p)
        neighbors_model.fit(X)

    # Calculate the neighborhood for all samples. This leaves the original
    # point in, which needs to be considered later (i.e. point i is in the
    # neighborhood of point i; while true, it is useless information).
    neighborhoods = []
    if distance_matrix:
        neighborhoods = [np.where(x <= eps)[0] for x in D]

    # Initially, all samples are noise.
    labels = -np.ones(n)

    # A list of all core samples found.
    core_samples = []

    # label_num is the label given to the new cluster
    label_num = 0

    # Look at all samples and determine if they are core.
    # If they are then build a new cluster from them.
    for index in index_order:
        # Already classified
        if labels[index] != -1:
            continue

        # get neighbors from neighborhoods or ballTree
        index_neighborhood = []
        if distance_matrix:
            index_neighborhood = neighborhoods[index]
        else:
            index_neighborhood = neighbors_model.radius_neighbors(
                X[index], eps, return_distance=False)[0]

        # Too few samples to be core
        if len(index_neighborhood) < min_samples:
            continue

        core_samples.append(index)
        labels[index] = label_num
        # candidates for new core samples in the cluster.
        candidates = [index]

        while len(candidates) > 0:
            new_candidates = []
            # A candidate is a core point in the current cluster that has
            # not yet been used to expand the current cluster.
            for c in candidates:
                c_neighborhood = []
                if distance_matrix:
                    c_neighborhood = neighborhoods[c]
                else:
                    c_neighborhood = neighbors_model.radius_neighbors(
                        X[c], eps, return_distance=False)[0]
                noise = np.where(labels[c_neighborhood] == -1)[0]
                noise = c_neighborhood[noise]
                labels[noise] = label_num
                for neighbor in noise:
                    n_neighborhood = []
                    if distance_matrix:
                        n_neighborhood = neighborhoods[neighbor]
                    else:
                        n_neighborhood = neighbors_model.radius_neighbors(
                            X[neighbor], eps, return_distance=False)[0]
                    # check if its a core point as well
                    if len(n_neighborhood) >= min_samples:
                        # is new core point
                        new_candidates.append(neighbor)
                        core_samples.append(neighbor)
            # Update candidates for next round of cluster expansion.
            candidates = new_candidates
        # Current cluster finished.
        # Next core point found will start a new cluster.
        label_num += 1

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    print('Estimated number of clusters: %d' % n_clusters_)
   # print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels))

##############################################################################
# Plot result
# Black removed and is used for noise instead.
    human = np.zeros((1, len(labels)),int)    #declare all points 0
    unique_labels = set(labels)
    surface=[]
    #colors = pl.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
    #ccnames=['grey', 'black', 'violet', 'blue', 'cyan', 'rosy', 'orange', 'red', 'green', 'brown', 'yellow', 'gold']
    ccnames =['blue','green','red','cyan','magenta','yellow','black','white','grey']
    cc = ['b','g','r','c','m','y','k','w','0.75' ]
    
    [xi,yi,zi] = [X[:,0] , X[:,1] , X[:,2]]
    
    [xmin, xmax] = [min(xi), max(xi)]
    [ymin, ymax] = [min(yi), max(yi)]
    [zmin, zmax] = [min(zi), max(zi)]
    [xnodes, ynodes, znodes] = [np.linspace(xmin, xmax, 20, endpoint=True), np.linspace(ymin, ymax, 20, endpoint=True), np.linspace(zmin, zmax, 20, endpoint=True)]
    #cc = [190 190 190, 0 0 0, 138 43 226, 0 0 255, 0 255 255, 255 193 193,   255 127 0, 255 0 0,0 255 0, 139 69 19, 255 255 0, 139    117    0]./255;
    for k, col in zip(unique_labels, cc):
        if k == -1:
            # Black used for noise.
            col = 'k'
            markersize = 6
        class_members = [index[0] for index in np.argwhere(labels == k)]
        
        cluster_core_samples = [index for index in core_samples
                                if labels[index] == k]
        
        for index in class_members:
            x = X[index]
            if index in core_samples and k != -1:
                markersize = 10
            else:
                markersize = 6  
             
            
            pl.plot(x[1], x[2], 'o', markerfacecolor=col,markeredgecolor='k', markersize=markersize)
            
    pl.title('Estimated number of clusters: %d' % n_clusters_)
    pl.show() #plot figure
    #MANUALLY ANNOTATE DATA
    
    obj = 0
    for obj in range(0,n_clusters_):
        filter=np.where(labels[:]==obj)[0]

        if mode==0:
            rospy.loginfo('Is %s human (1 for yes, 0 for no): ', ccnames[obj])
            temp=input()
            for i in filter:
                human[0,i] = temp   

        surface= my_griddata.griddata(yi[filter], zi[filter], xi[filter], ynodes, znodes,'nn')
        
        #surface=surface-min(min(surface))
        rospy.loginfo('extract surface for cluster %d', obj)
             
        obj = obj + 1
    pl.close()
    return core_samples, labels, n_clusters_, human, surface
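For reference, the core/noise labelling computed by the loop above can also be obtained from scikit-learn's built-in DBSCAN; a minimal sketch on random 3-D points (parameters chosen arbitrarily):

import numpy as np
from sklearn.cluster import DBSCAN

rng = np.random.RandomState(0)
points = np.vstack([rng.randn(100, 3), rng.randn(100, 3) + 8.0])

db = DBSCAN(eps=1.0, min_samples=5).fit(points)
labels = db.labels_                      # -1 marks noise
core_samples = db.core_sample_indices_
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated number of clusters: %d' % n_clusters)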
def test_radius_neighbors():
    # Checks whether the returned distances are less than `radius`.
    # At least one point should be returned when `radius` is set to the
    # mean distance from the query point to the other points in the
    # database.
    # Moreover, this test compares the radius neighbors of LSHForest
    # with those of `sklearn.neighbors.NearestNeighbors`.
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest()
    # Test unfitted estimator
    assert_raises(ValueError, lshf.radius_neighbors, X[0])

    lshf.fit(X)

    for i in range(n_iter):
        # Select a random point in the dataset as the query
        query = X[rng.randint(0, n_samples)].reshape(1, -1)

        # At least one neighbor should be returned when the radius is the
        # mean distance from the query to the points of the dataset.
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        neighbors = lshf.radius_neighbors(query,
                                          radius=mean_dist,
                                          return_distance=False)

        assert_equal(neighbors.shape, (1, ))
        assert_equal(neighbors.dtype, object)
        assert_greater(neighbors[0].shape[0], 0)
        # All distances to points in the results of the radius query should
        # be less than mean_dist
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_array_less(distances[0], mean_dist)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.radius_neighbors(queries, return_distance=True)

    # dists and inds should not be 1D arrays or arrays of variable lengths
    # hence the use of the object dtype.
    assert_equal(distances.shape, (n_queries, ))
    assert_equal(distances.dtype, object)
    assert_equal(neighbors.shape, (n_queries, ))
    assert_equal(neighbors.dtype, object)

    # Compare with exact neighbor search
    query = X[rng.randint(0, n_samples)].reshape(1, -1)
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)

    distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)
    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)

    # Radius-based queries do not sort the result points and the order
    # depends on the method, the random_state and the dataset order. Therefore
    # we need to sort the results ourselves before performing any comparison.
    sorted_dists_exact = np.sort(distances_exact[0])
    sorted_dists_approx = np.sort(distances_approx[0])

    # Distances to exact neighbors are less than or equal to approximate
    # counterparts as the approximate radius query might have missed some
    # closer neighbors.
    assert_true(np.all(np.less_equal(sorted_dists_exact, sorted_dists_approx)))
Example #15
def optimal_solution():
    min_cost_flow = pywrapgraph.SimpleMinCostFlow()
    all_items = generate_all()
    drivers, driver_dict = all_items["drivers"]
    companies, company_dict = all_items["companies"]
    restaurants, restaurant_dict = all_items["restaurants"]
    orders, order_dict = all_items['orders']
    drivers_path = {}
    order_len = len(orders)

    # generate arcs from the source node (0) to the order nodes
    # sets the cost to 0 and the capacity to 1
    user_increment = 1
    for x in range(order_len):
        min_cost_flow.AddArcWithCapacityAndUnitCost(0, user_increment, 1, 0)
        user_increment += 1

    # generate arcs from the driver nodes to the sink node
    # sets the cost to 0 and the capacity to 1
    # the loop also builds the array of driver locations for later use
    driver_locations = []
    sink_index = len(companies) + len(drivers) + 1
    for driver in drivers:
        driver_locations.append(driver.location_in_rad())
        min_cost_flow.AddArcWithCapacityAndUnitCost(user_increment, sink_index,
                                                    1, 0)
        user_increment += 1

    # goes through every order and generates the shortest path between that
    # order's restaurants. Starting from each restaurant's shortest path, a
    # radius neighbor search determines which drivers are in range. After
    # finding all drivers in range, only an edge for the shortest total
    # distance is added for each driver.
    neigh = NearestNeighbors(metric="haversine", algorithm="ball_tree")
    neigh.fit(driver_locations)

    for order in orders:
        driver_info = [None] * len(drivers)
        order_index = orders.index(order) + 1
        drivers_path[order_index] = {}
        shortest_paths_info = get_shortest_restaurant_paths(
            order.restaurants, order.company)

        for sp_info in shortest_paths_info:
            order_range = order.deadline - sp_info['distance']
            if order_range < 0.0: continue
            order_range_rad = deg2rad(order_range * 0.008)
            start_location = sp_info['order'][0].location_in_rad()
            rng = neigh.radius_neighbors([start_location],
                                         radius=order_range_rad)
            indices = np.asarray(rng[1][0])
            distances = np.asarray(rng[0][0])

            for index in range(len(indices)):
                real_index = indices[index]
                driver_index = index + order_len + 1
                distance = int(
                    (distances[index] * EARTH_RADIUS + sp_info['distance']) *
                    1000)
                if driver_info[real_index] is None or driver_info[real_index][
                        0] > distance:
                    driver_info[real_index] = (distance, driver_index, sp_info)

        for d_info in driver_info:
            if d_info is None: continue
            drivers_path[order_index][d_info[1]] = d_info[2]
            min_cost_flow.AddArcWithCapacityAndUnitCost(
                order_index, d_info[1], 1, d_info[0])

    # get the size of the limiting side, drivers or orders. Set that as the supplies
    node_supplies = min(len(drivers), order_len)
    supplies = [node_supplies
                ] + ([0] * (order_len + len(drivers))) + [(-1) * node_supplies]
    for i in range(len(supplies)):
        min_cost_flow.SetNodeSupply(i, supplies[i])

    # calculate the max flow with min cost and save information about matches
    final_matches = [None] * order_len
    successful_matches = 0.0
    if min_cost_flow.SolveMaxFlowWithMinCost() == min_cost_flow.OPTIMAL:
        for arc in range(min_cost_flow.NumArcs()):
            if min_cost_flow.Tail(arc) != 0 and min_cost_flow.Head(
                    arc) != sink_index:
                if min_cost_flow.Flow(arc) > 0:
                    successful_matches += 1.0
                    tail = min_cost_flow.Tail(arc)
                    head = min_cost_flow.Head(arc)
                    orders[tail - 1].set_shortest_path(
                        drivers_path[tail][head])
                    final_matches[min_cost_flow.Tail(arc) -
                                  1] = (head - order_len,
                                        min_cost_flow.UnitCost(arc) / 1000.0)

    # print results
    driver_distance = int(min_cost_flow.OptimalCost())
    percent_matched = 0 if order_len == 0 else (successful_matches /
                                                order_len * 100)
    # print_data(final_matches, orders, percent_matched, driver_distance)
    # print(int(successful_matches), 0, int(successful_matches), 0)
    # return (int(successful_matches), 0, int(successful_matches), 0)
    return final_matches
Example #16
def mean_shift_cosine(X,
                      bandwidth=None,
                      seeds=None,
                      bin_seeding=False,
                      min_bin_freq=1,
                      cluster_all=True,
                      max_iter=300,
                      n_jobs=None):
    """Perform mean shift clustering of data using a flat kernel.

    Read more in the :ref:`User Guide <mean_shift>`.

    Parameters
    ----------

    X : array-like, shape=[n_samples, n_features]
        Input data.

    bandwidth : float, optional
        Kernel bandwidth.

        If bandwidth is not given, it is determined using a heuristic based on
        the median of all pairwise distances. This will take quadratic time in
        the number of samples. The sklearn.cluster.estimate_bandwidth function
        can be used to do this more efficiently.

    seeds : array-like, shape=[n_seeds, n_features] or None
        Points used as initial kernel locations. If None and bin_seeding=False,
        each data point is used as a seed. If None and bin_seeding=True,
        see bin_seeding.

    bin_seeding : boolean, default=False
        If true, initial kernel locations are not locations of all
        points, but rather the location of the discretized version of
        points, where points are binned onto a grid whose coarseness
        corresponds to the bandwidth. Setting this option to True will speed
        up the algorithm because fewer seeds will be initialized.
        Ignored if seeds argument is not None.

    min_bin_freq : int, default=1
       To speed up the algorithm, accept only those bins with at least
       min_bin_freq points as seeds.

    cluster_all : boolean, default True
        If true, then all points are clustered, even those orphans that are
        not within any kernel. Orphans are assigned to the nearest kernel.
        If false, then orphans are given cluster label -1.

    max_iter : int, default 300
        Maximum number of iterations per seed point before the clustering
        operation terminates (for that seed point), if it has not converged yet.

    n_jobs : int or None, optional (default=None)
        The number of jobs to use for the computation. This works by computing
        each of the n_init runs in parallel.

        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

        .. versionadded:: 0.17
           Parallel Execution using *n_jobs*.

    Returns
    -------

    cluster_centers : array, shape=[n_clusters, n_features]
        Coordinates of cluster centers.

    labels : array, shape=[n_samples]
        Cluster labels for each point.

    Notes
    -----
    For an example, see :ref:`examples/cluster/plot_mean_shift.py
    <sphx_glr_auto_examples_cluster_plot_mean_shift.py>`.

    """

    if bandwidth is None:
        bandwidth = estimate_bandwidth(X, n_jobs=n_jobs)
    elif bandwidth <= 0:
        raise ValueError("bandwidth needs to be greater than zero or None,\
            got %f" % bandwidth)
    if seeds is None:
        if bin_seeding:
            seeds = get_bin_seeds(X, bandwidth, min_bin_freq)
        else:
            seeds = X
    n_samples, n_features = X.shape
    center_intensity_dict = {}
    nbrs = NearestNeighbors(radius=bandwidth, n_jobs=n_jobs,
                            metric='cosine').fit(X)

    # execute iterations on all seeds in parallel
    all_res = Parallel(n_jobs=n_jobs)(
        delayed(_mean_shift_cosine_single_seed)(seed, X, nbrs, max_iter)
        for seed in seeds)
    # copy results in a dictionary
    for i in range(len(seeds)):
        if all_res[i] is not None:
            center_intensity_dict[all_res[i][0]] = all_res[i][1]

    if not center_intensity_dict:
        # nothing near seeds
        raise ValueError("No point was within bandwidth=%f of any seed."
                         " Try a different seeding strategy \
                         or increase the bandwidth." % bandwidth)

    # POST PROCESSING: remove near duplicate points
    # If the distance between two kernels is less than the bandwidth,
    # then we have to remove one because it is a duplicate. Remove the
    # one with fewer points.

    sorted_by_intensity = sorted(center_intensity_dict.items(),
                                 key=lambda tup: (tup[1], tup[0]),
                                 reverse=True)
    sorted_centers = np.array([tup[0] for tup in sorted_by_intensity])
    unique = np.ones(len(sorted_centers), dtype=bool)
    nbrs = NearestNeighbors(radius=bandwidth, n_jobs=n_jobs,
                            metric='cosine').fit(sorted_centers)
    for i, center in enumerate(sorted_centers):
        if unique[i]:
            neighbor_idxs = nbrs.radius_neighbors([center],
                                                  return_distance=False)[0]
            unique[neighbor_idxs] = 0
            unique[i] = 1  # leave the current point as unique
    cluster_centers = sorted_centers[unique]

    # ASSIGN LABELS: a point belongs to the cluster that it is closest to
    nbrs = NearestNeighbors(n_neighbors=1, n_jobs=n_jobs,
                            metric='cosine').fit(cluster_centers)
    labels = np.zeros(n_samples, dtype=int)
    distances, idxs = nbrs.kneighbors(X)
    if cluster_all:
        labels = idxs.flatten()
    else:
        labels.fill(-1)
        bool_selector = distances.flatten() <= bandwidth
        labels[bool_selector] = idxs.flatten()[bool_selector]
    return cluster_centers, labels
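The de-duplication step above can be illustrated in isolation; a small sketch (synthetic centers, already sorted by decreasing intensity) showing how a cosine radius query suppresses any center that falls within one bandwidth of a stronger one:

import numpy as np
from sklearn.neighbors import NearestNeighbors

bandwidth = 0.05
sorted_centers = np.array([[1.0, 0.0],
                           [0.999, 0.02],    # nearly colinear with the first
                           [0.0, 1.0]])

unique = np.ones(len(sorted_centers), dtype=bool)
nbrs = NearestNeighbors(radius=bandwidth, metric='cosine').fit(sorted_centers)
for i, center in enumerate(sorted_centers):
    if unique[i]:
        neighbor_idxs = nbrs.radius_neighbors([center], return_distance=False)[0]
        unique[neighbor_idxs] = 0
        unique[i] = 1          # keep the stronger center itself
print(sorted_centers[unique])  # the near-duplicate second center is dropped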
    def getPoi(self):
        self.poi = []

        # ---- Step 1 : Get events near to each stop point ---------------------------------------------------#
        associatedVisits = []
        neighboorsServerAll = NearestNeighbors(radius=self.distanceThres, algorithm="auto", leaf_size=10)
        neighboorsServerAll.fit(np.array([event.position for event in self.events]))
        for i in range(len(self.stops)):
            associatedVisits.append(set(neighboorsServerAll.radius_neighbors(self.stops[i].position)[1][0]))
        # ----------------------------------------------------------------------------------------------------#

        # ---- Step 2 : Merge stop points into POIs and merge their visits -----------------------------------#
        aggregatedPOI = np.array([-1] * len(self.stops))
        numberOfPOI = 0
        for i in range(len(self.stops)):
            inIntersection = []
            for j in range(len(self.stops)):
                if associatedVisits[i] & associatedVisits[j]:
                    inIntersection.append(j)
            poiId = min(aggregatedPOI[inIntersection])
            if poiId == -1:
                numberOfPOI += 1
                poiId = numberOfPOI
            aggregatedPOI[inIntersection] = poiId
        listPoiandNeighboorsIndex = []
        for poiId in set(aggregatedPOI):
            members = self.stops[aggregatedPOI == poiId]
            indices = np.array(range(len(self.stops)))[aggregatedPOI == poiId]
            neighboors = set()
            for i in indices:
                neighboors |= associatedVisits[i]
            poi = Poi(
                poiId,
                sum([p.longitude() for p in members]) / len(members),
                sum([p.latitude() for p in members]) / len(members),
            )
            listPoiandNeighboorsIndex.append((poi, sorted(neighboors)))
        # ----------------------------------------------------------------------------------------------------#

        # ---- Step 3 : Creating the list of Poi -------------------------------------------------------------#
        infrequentVisits = []
        for item in listPoiandNeighboorsIndex:
            poi, neighborsIndex = item
            j = neighborsIndex[0]
            for i in range(1, len(neighborsIndex)):
                if neighborsIndex[i] > neighborsIndex[i - 1] + 1:
                    visit = Visit(-1, self.events[j].time, self.events[neighborsIndex[i - 1] + 1].time)
                    j = neighborsIndex[i]
                    if visit.duration() >= self.stayTimeThres:
                        poi.addVisit(visit)
            k = neighborsIndex[-1] + 1 if (neighborsIndex[-1] + 1 < len(self.events)) else neighborsIndex[-1]
            visit = Visit(-1, self.events[j].time, self.events[k].time)
            if visit.duration() >= self.stayTimeThres:
                poi.addVisit(visit)

            if poi.visits:
                if len(poi.visits) >= self.freqThres:
                    self.poi.append(poi)
                else:
                    infrequentVisits.extend(poi.visits)

        if infrequentVisits:
            self.poi.append(
                Poi("I", float("nan"), float("nan"), sorted(infrequentVisits, key=lambda visit: visit.arrival))
            )
        # ----------------------------------------------------------------------------------------------------#
        self.finalize(self.poi, mergeVisits=False)
        return self.poi
    from sklearn.neighbors import NearestNeighbors
    import warnings
    warnings.filterwarnings("ignore")
    train_calc_polarity = []
    neigh = NearestNeighbors(n_neighbors=30, radius=0.5, metric='cosine', algorithm='brute')
    neigh.fit(Embedding_reduced_train)  
    #lshf = LSHForest(random_state=42,n_neighbors=30, radius=0.5)
    #lshf.fit(X_train)  

    for i in range(len(probability)):
        
        nbrs =  neigh.radius_neighbors(Embedding_reduced_train[i], return_distance=False)
        #nbrs = lshf.kneighbors(Embedding_reduced_train[i], return_distance=False)
        tempdd = 0
        for j in range(len(nbrs[0])):
            tempdd = tempdd+probability[nbrs[0][j]]
        tempdd = tempdd/len(nbrs[0])
        probability[i] = tempdd

        inde = probability[i].argsort()[-2:][::-1]
        if((probability[i][inde[0]]-probability[i][inde[1]])<0):
            train_calc_polarity.append(0)
        else:
            if(inde[0]==0):
                train_calc_polarity.append(1)
            elif(inde[0]==1):
                train_calc_polarity.append(-1)
Example #19
def mean_shift(X,
               intensities=None,
               bandwidth=None,
               seeds=None,
               cluster_all=True,
               max_iterations=300,
               verbose=False,
               use_scipy=True):
    """mean_shift(X, intensities=None, bandwidth=None, seeds=None,
                  cluster_all=True, max_iterations=300, verbose=False, use_scipy=True)

    Mean shift algorithm

    Implementation taken from scikit-learn with two minor variants:

        - Use (by default) scipy KD-trees, which are faster in our case
        - weighted version of mean-shift using `intensities` as
          weights (i.e., we compute centers of mass rather than means)

    Parameters
    ----------

    X : array-like, shape=[n_samples, n_features]
        Input data.

    intensities : array-like, shape=[n_samples]
        Voxel intensities, used to weight the mean

    bandwidth : float
        Kernel bandwidth.

    seeds : array-like, shape=[n_seeds, n_features]
        Point used as initial kernel locations.

    use_scipy : bool
        If true use cKDTree from scipy.spatial, otherwise
        use NearestNeighbors from sklearn.neighbors

    Returns
    -------

    cluster_centers : array, shape=[n_clusters, n_features]
        Coordinates of cluster centers.

    labels : array, shape=[n_samples]
        Cluster labels for each point.

    volumes : array, shape=[n_clusters]
        Volume of each cluster (# of points in the cluster)

    masses : array, shape=[n_clusters]
        Mass of each cluster (sum of intensities of points in the cluster).

    trajectories : list
        MS trajectories for debugging purposes.
    """
    if seeds is None:
        seeds = X
    n_points, n_features = X.shape
    stop_thresh = 1e-3 * bandwidth  # when mean has converged
    center_volume_dict = {}
    center_mass_dict = {}
    # tee.log('Fitting NearestNeighbors on', n_points, 'points')
    if use_scipy:
        kdtree = cKDTree(X)
    else:
        nbrs = NearestNeighbors(radius=bandwidth).fit(X)

    # For each seed, climb gradient until convergence or max_iterations
    trajectories = {}  # for each seed, a list of points
    tee.log('Moving kernels for', len(seeds), 'seeds')
    pbar = pb.ProgressBar(
        widgets=['Moving %d seeds: ' % len(seeds),
                 pb.Percentage()],
        maxval=len(seeds)).start()
    for seed_no, my_mean in enumerate(seeds):
        completed_iterations = 0
        seed = my_mean
        trajectories[seed_no] = []
        while True:
            # Find mean of points within bandwidth
            if use_scipy:
                i_nbrs = kdtree.query_ball_point(my_mean, r=bandwidth)
            else:
                i_nbrs = nbrs.radius_neighbors([my_mean],
                                               bandwidth,
                                               return_distance=False)[0]
            points_within = X[i_nbrs]
            if len(points_within) == 0:
                break  # Depending on seeding strategy this condition may occur
            my_old_mean = my_mean  # save the old mean
            if intensities is None:
                my_mean = np.mean(points_within, axis=0)
            else:
                my_mean = np.average(points_within,
                                     axis=0,
                                     weights=intensities[i_nbrs])
            # If converged or at max_iterations, add the cluster
            if extmath.norm(
                    my_mean - my_old_mean
            ) < stop_thresh or completed_iterations == max_iterations:
                center_volume_dict[tuple(my_mean)] = len(points_within)
                center_mass_dict[tuple(my_mean)] = sum(intensities[i_nbrs])
                break
            completed_iterations += 1
            trajectories[seed_no].append(my_mean)
        if verbose:
            print('seed', seed, '-->', my_mean,
                  center_volume_dict[tuple(my_mean)],
                  center_mass_dict[tuple(my_mean)], completed_iterations)

        pbar.update(seed_no + 1)
    pbar.finish()
    # POST PROCESSING: remove near duplicate points
    # If the distance between two kernels is less than the bandwidth,
    # then we have to remove one because it is a duplicate. Remove the
    # one with fewer points.
    sorted_by_intensity = sorted(center_mass_dict.items(),
                                 key=lambda tup: tup[1],
                                 reverse=True)
    sorted_centers = np.array([tup[0] for tup in sorted_by_intensity])
    unique = np.ones(len(sorted_centers), dtype=bool)
    print('started from', len(seeds), 'seeds, now |unique|=', len(unique))
    # print('|center_mass_dict|=', len(center_mass_dict))
    if len(center_mass_dict) == 0:
        tee.log('No valid seeds. Giving up')
        return None, None, None, None, None

    nbrs = NearestNeighbors(radius=bandwidth).fit(sorted_centers)
    for i, center in enumerate(sorted_centers):
        if unique[i]:
            neighbor_idxs = nbrs.radius_neighbors([center],
                                                  return_distance=False)[0]
            unique[neighbor_idxs] = 0
            unique[i] = 1  # leave the current point as unique
    cluster_centers = sorted_centers[unique]
    print('|cluster_centers|=', len(cluster_centers))
    volumes = [0] * len(cluster_centers)
    masses = [0] * len(cluster_centers)
    for i, c in enumerate(cluster_centers):
        volumes[i] = center_volume_dict[tuple(c)]
        masses[i] = center_mass_dict[tuple(c)]
    # ASSIGN LABELS: a point belongs to the cluster that it is closest to
    nbrs = NearestNeighbors(n_neighbors=1).fit(cluster_centers)
    labels = np.zeros(n_points, dtype=int)
    distances, idxs = nbrs.kneighbors(X)
    if cluster_all:
        labels = idxs.flatten()
    else:
        labels[:] = -1
        bool_selector = distances.flatten() <= bandwidth
        labels[bool_selector] = idxs.flatten()[bool_selector]
    return cluster_centers, labels, volumes, masses, trajectories
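The final labelling stage shared by the mean-shift variants above can be shown on its own; a sketch of assigning each point to its nearest cluster center and marking orphans with -1 when cluster_all is disabled:

import numpy as np
from sklearn.neighbors import NearestNeighbors

bandwidth = 1.5
cluster_centers = np.array([[0.0, 0.0], [10.0, 10.0]])
points = np.array([[0.2, -0.1], [9.8, 10.3], [50.0, 50.0]])   # last point is an orphan

nbrs = NearestNeighbors(n_neighbors=1).fit(cluster_centers)
distances, idxs = nbrs.kneighbors(points)

labels = np.full(len(points), -1, dtype=int)
within = distances.flatten() <= bandwidth
labels[within] = idxs.flatten()[within]
print(labels)   # e.g. [ 0  1 -1]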
    def find_recommendations(self, tweets=[], top=10, quality=.1, min_examples=1):

        working_list = []
        result_list = []
        try:
            config.LOGGER.info('Generating content recommendations for user %s',
                               self.account['profile']['preferredUsername'])
            if self.svd is not None:
                if len(tweets) < top:
                    config.LOGGER.debug("Too few tweets passed for recommendation")
                    return []

                #tokenized_tweets = [' '.join(doc['newKeys']) for doc in tweets]
                #tweetText = [tw['text'] for tw in tweets]
                tweetText = [' '.join(tw['keywords']) for tw in tweets]
                Y = self.vectorizer.transform(tweetText)
                svdY = self.svd.transform(Y)
                svdY = self.normalizer.transform(svdY)
                y_transform = self.k_means.transform(svdY)
                # terms = self.vectorizer.get_feature_names()

                selected_updates = []
                y_predict = self.k_means.predict(svdY)

                for i in range(self.cluster_count):
                    cluster_distance = []
                    for j in range(len(y_predict)):
                        if y_predict[j] == i and sum(svdY[j]) != 0.0:
                            cluster_distance.append(
                                {'index': j, 'cluster': i, 'dist': np.sqrt(sum([y * y for y in y_transform[j]]))})
                    newlist = sorted(cluster_distance, key=operator.itemgetter('dist'), reverse=False)
                    selected_updates.append(newlist)

                temp = [entry for entry in it.izip_longest(*selected_updates)]
                clean_list = filter(lambda x: x is not None, [entry for tuple in temp for entry in tuple])[0:top]
                clean_list_svdY = [svdY[entry['index']] for entry in clean_list]
                config.LOGGER.debug("Found %i possible matches in topic clusters " % len(clean_list_svdY))

                neigh = NearestNeighbors()
                neigh.fit(self.svdX)
                if len(clean_list_svdY) > 0:
                    distances, svd_neighbors = neigh.radius_neighbors(X=clean_list_svdY, radius=quality)
                else:
                    svd_neighbors =[]

                examples=[]
                for idx, entry in enumerate(svd_neighbors):
                    if len(entry) >= min_examples:
                        config.LOGGER.debug("Suggested tweet has %d examples" % len(entry))
                        original = tweets[clean_list[idx]['index']]['text']
                        for jdx, neighbor in enumerate(entry):
                            examples.append({'text':self.training_docs[neighbor]['text'], 'dist':distances[idx][jdx]})
                        sorted_examples = sorted(examples, key=operator.itemgetter('dist'), reverse=False)
                        example_texts = [item['text'] for item in sorted_examples][:min_examples]
                        t1 = self.training_docs[self.all_cluster_dist[clean_list[idx]['cluster']][0]['index']]['text']
                        t2 = self.training_docs[self.all_cluster_dist[clean_list[idx]['cluster']][1]['index']]['text']
                        working_list.append({"dist": sorted_examples[0]['dist'], "text": original,
                                                     "id": str(tweets[clean_list[idx]['index']]['_id']),
                                                     "sender": str(tweets[clean_list[idx]['index']]['sender']),
                                                     'samples_svd': example_texts, 'samples_cluster':[t1,t2]})

                result_list = sorted(working_list, key=operator.itemgetter('dist'), reverse=False)
            return result_list[:top]

        except Exception as ex:
            config.LOGGER.error("Error %s computing recommendations for mission %s", ex, self.missionId)
            return []
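A minimal, self-contained sketch (toy data, not taken from the example above) of why the empty-query guard before radius_neighbors matters: the method expects a non-empty 2-D query matrix and returns one ragged array of distances and one of indices per query row.

import numpy as np
from sklearn.neighbors import NearestNeighbors

train = np.array([[0.0, 0.0], [1.0, 0.0], [5.0, 5.0]])
queries = np.array([[0.1, 0.0], [4.9, 5.0]])

neigh = NearestNeighbors().fit(train)
if len(queries) > 0:
    distances, indices = neigh.radius_neighbors(queries, radius=1.0)
else:
    distances, indices = [], []

for q, (d, idx) in enumerate(zip(distances, indices)):
    # each query row gets its own (possibly empty) arrays of distances and indices
    print("query", q, "-> neighbors", idx.tolist(), "distances", d.tolist())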
def mean_shift(X, bandwidth, max_iter):

    (m,n) = X.shape
    print(m, n)
    graph = tf.Graph()
    with graph.as_default():

        with tf.name_scope("input") as scope:
            data = tf.constant(X, name="data_points")
            b = tf.constant(bandwidth,dtype=tf.float32, name="bandwidth")
            m = tf.constant(max_iter, name="maximum_iteration")
            # n_samples = tf.constant(m, name="no_of_samples")
            # n_features = tf.constant(n, name="no_of_features")

        # with tf.name_scope("seeding") as scope:
        #     seed = tf.placeholder(tf.float32, [5], name="seed")

        with tf.name_scope("mean_shifting") as scope:
            old_mean = tf.placeholder(tf.float32, [n], name="old_mean")
            neighbors = tf.placeholder(tf.float32, [None,n], name="neighbors")
            new_mean = tf.reduce_mean(neighbors,0)

            euclid_dist = tf.sqrt(tf.reduce_sum(tf.pow(tf.subtract(old_mean, new_mean), 2)), name="mean_distance")


        center_intensity_dict = {}
        nbrs = NearestNeighbors(radius=bandwidth).fit(X)

        sess = tf.Session()
        init = tf.global_variables_initializer()
        sess.run(init)
        writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)

        bin_sizes = defaultdict(int)

        data_point = tf.placeholder(tf.float32, [n], name="data_point")
        binned_point = tf.floordiv(data_point,b)

        for point in X:
            feed={data_point:point}
            bp = sess.run(binned_point,feed_dict=feed)
            bin_sizes[tuple(bp)] +=1

        bin_seeds = np.array([point for point, freq in six.iteritems(bin_sizes) if freq >= 1], dtype=np.float32)

        bin_seeds = bin_seeds*bandwidth

        print(len(bin_seeds))


        j=0

        for x in bin_seeds:
            print "Seed ",j,": ",x
            i = 0
            o_mean=x

            while True:
                i_nbrs = nbrs.radius_neighbors([o_mean], bandwidth, return_distance=False)[0]
                points_within = X[i_nbrs]

                feed = {neighbors: points_within}
                n_mean = sess.run(new_mean, feed_dict=feed)

                feed = {new_mean: n_mean, old_mean: o_mean}
                dist = sess.run(euclid_dist, feed_dict=feed)

                if dist < 1e-3*bandwidth or i==max_iter:
                    center_intensity_dict[tuple(n_mean)] = len(i_nbrs)
                    break
                else:
                    o_mean = n_mean

                print "\t",i,dist,len(i_nbrs)

                i+=1

            # if j>10:
            #     break

            j+=1

        print(center_intensity_dict)

        sorted_by_intensity = sorted(center_intensity_dict.items(),key=lambda tup: tup[1], reverse=True)
        sorted_centers = np.array([tup[0] for tup in sorted_by_intensity])
        unique = np.ones(len(sorted_centers), dtype=bool)
        nbrs = NearestNeighbors(radius=bandwidth).fit(sorted_centers)
        for i, center in enumerate(sorted_centers):
            if unique[i]:
                neighbor_idxs = nbrs.radius_neighbors([center],return_distance=False)[0]
                unique[neighbor_idxs] = 0
                unique[i] = 1  # leave the current point as unique
        cluster_centers = sorted_centers[unique]

        nbrs = NearestNeighbors(n_neighbors=1).fit(cluster_centers)
        distances, idxs = nbrs.kneighbors(X)
        labels = idxs.flatten()
        return cluster_centers, labels
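For comparison, a plain-NumPy sketch of the same flat-kernel update the TensorFlow graph above evaluates (the helper name mean_shift_single_seed_np is hypothetical): the current mean is repeatedly replaced by the mean of all points within the bandwidth until the shift becomes negligible or max_iter is reached.

import numpy as np
from sklearn.neighbors import NearestNeighbors

def mean_shift_single_seed_np(seed, X, bandwidth, max_iter=300):
    nbrs = NearestNeighbors(radius=bandwidth).fit(X)
    mean = np.asarray(seed, dtype=float)
    n_within = 0
    for _ in range(max_iter):
        idx = nbrs.radius_neighbors([mean], bandwidth, return_distance=False)[0]
        if len(idx) == 0:
            break
        n_within = len(idx)
        new_mean = X[idx].mean(axis=0)   # flat-kernel (uniform) mean of the neighbours
        if np.linalg.norm(new_mean - mean) < 1e-3 * bandwidth:
            mean = new_mean
            break
        mean = new_mean
    return mean, n_within

X = np.random.RandomState(0).normal(size=(100, 2))
print(mean_shift_single_seed_np(X[0], X, bandwidth=1.0))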
Example #22
0
def k_nearest_neighbors(coordinates,
                        neighbor_cutoff,
                        max_num_neighbors=None,
                        p_distance=2,
                        self_loops=False):
    """Find k nearest neighbors for each atom

    We do not guarantee that the edges are sorted according to the distance
    between atoms.

    Parameters
    ----------
    coordinates : numpy.ndarray of shape (N, D)
        The coordinates of atoms in the molecule. N for the number of atoms
        and D for the dimensions of the coordinates.
    neighbor_cutoff : float
        If the distance between a pair of nodes is larger than neighbor_cutoff,
        they will not be considered as neighboring nodes.
    max_num_neighbors : int or None.
        If not None, then this specifies the maximum number of neighbors
        allowed for each atom. Default to None.
    p_distance : int
        We compute the distance between neighbors using Minkowski (:math:`l_p`)
        distance. When ``p_distance = 1``, Minkowski distance is equivalent to
        Manhattan distance. When ``p_distance = 2``, Minkowski distance is
        equivalent to the standard Euclidean distance. Default to 2.
    self_loops : bool
        Whether to allow a node to be its own neighbor. Default to False.

    Returns
    -------
    srcs : list of int
        Source nodes.
    dsts : list of int
        Destination nodes, corresponding to ``srcs``.
    distances : list of float
        Distances between the end nodes, corresponding to ``srcs`` and ``dsts``.

    Examples
    --------
    >>> from dgllife.utils import get_mol_3d_coordinates, k_nearest_neighbors
    >>> from rdkit import Chem
    >>> from rdkit.Chem import AllChem

    >>> mol = Chem.MolFromSmiles('CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C')
    >>> AllChem.EmbedMolecule(mol)
    >>> AllChem.MMFFOptimizeMolecule(mol)
    >>> coords = get_mol_3d_coordinates(mol)
    >>> srcs, dsts, dists = k_nearest_neighbors(coords, neighbor_cutoff=1.25)
    >>> print(srcs)
    [8, 7, 11, 10, 20, 19]
    >>> print(dsts)
    [7, 8, 10, 11, 19, 20]
    >>> print(dists)
    [1.2084666104583117, 1.2084666104583117, 1.226457824344217,
     1.226457824344217, 1.2230522248065987, 1.2230522248065987]

    See Also
    --------
    get_mol_3d_coordinates
    mol_to_nearest_neighbor_graph
    smiles_to_nearest_neighbor_graph
    """
    num_atoms = coordinates.shape[0]
    model = NearestNeighbors(radius=neighbor_cutoff, p=p_distance)
    model.fit(coordinates)
    dists_, nbrs = model.radius_neighbors(coordinates)
    srcs, dsts, dists = [], [], []
    for i in range(num_atoms):
        dists_i = dists_[i].tolist()
        nbrs_i = nbrs[i].tolist()
        if not self_loops:
            dists_i.remove(0)
            nbrs_i.remove(i)
        if max_num_neighbors is not None and len(nbrs_i) > max_num_neighbors:
            packed_nbrs = list(zip(dists_i, nbrs_i))
            # Sort neighbors based on distance from smallest to largest
            packed_nbrs.sort(key=lambda tup: tup[0])
            dists_i, nbrs_i = map(list, zip(*packed_nbrs))
            dsts.extend([i for _ in range(max_num_neighbors)])
            srcs.extend(nbrs_i[:max_num_neighbors])
            dists.extend(dists_i[:max_num_neighbors])
        else:
            dsts.extend([i for _ in range(len(nbrs_i))])
            srcs.extend(nbrs_i)
            dists.extend(dists_i)

    return srcs, dsts, dists
Example #23
0
def k_nearest_neighbors(coordinates,
                        neighbor_cutoff,
                        max_num_neighbors=None,
                        p_distance=2,
                        self_loops=False):
    """Find k nearest neighbors for each atom

    We do not guarantee that the edges are sorted according to the distance
    between atoms.

    Parameters
    ----------
    coordinates : numpy.ndarray of shape (N, D)
        The coordinates of atoms in the molecule. N for the number of atoms
        and D for the dimensions of the coordinates.
    neighbor_cutoff : float
        If the distance between a pair of nodes is larger than neighbor_cutoff,
        they will not be considered as neighboring nodes.
    max_num_neighbors : int or None.
        If not None, then this specifies the maximum number of neighbors
        allowed for each atom. Default to None.
    p_distance : int
        We compute the distance between neighbors using Minkowski (:math:`l_p`)
        distance. When ``p_distance = 1``, Minkowski distance is equivalent to
        Manhattan distance. When ``p_distance = 2``, Minkowski distance is
        equivalent to the standard Euclidean distance. Default to 2.
    self_loops : bool
        Whether to allow a node to be its own neighbor. Default to False.

    Returns
    -------
    srcs : list of int
        Source nodes.
    dsts : list of int
        Destination nodes, corresponding to ``srcs``.
    distances : list of float
        Distances between the end nodes, corresponding to ``srcs`` and ``dsts``.
    """
    num_atoms = coordinates.shape[0]
    model = NearestNeighbors(radius=neighbor_cutoff, p=p_distance)
    model.fit(coordinates)
    dists_, nbrs = model.radius_neighbors(coordinates)
    srcs, dsts, dists = [], [], []
    for i in range(num_atoms):
        dists_i = dists_[i].tolist()
        nbrs_i = nbrs[i].tolist()
        if not self_loops:
            dists_i.remove(0)
            nbrs_i.remove(i)
        if max_num_neighbors is not None and len(nbrs_i) > max_num_neighbors:
            packed_nbrs = list(zip(dists_i, nbrs_i))
            # Sort neighbors based on distance from smallest to largest
            packed_nbrs.sort(key=lambda tup: tup[0])
            dists_i, nbrs_i = map(list, zip(*packed_nbrs))
            dsts.extend([i for _ in range(max_num_neighbors)])
            srcs.extend(nbrs_i[:max_num_neighbors])
            dists.extend(dists_i[:max_num_neighbors])
        else:
            dsts.extend([i for _ in range(len(nbrs_i))])
            srcs.extend(nbrs_i)
            dists.extend(dists_i)

    return srcs, dsts, dists
Example #24
0
def mean_shift_clustering(data, bandwidth=0.7, min_bin_freq=5, max_iter=300):
    """pipline of mean shift clustering.

    Args:
        data (np.ndarray)           : Input data with shape (n_samples, n_features)
        bandwidth (float)           : Bandwidth parameter for mean shift algorithm.
        min_bin_freq(int)           : Parameter for the get_bin_seeds function:
                                      the minimum number of points a bin must contain to produce a seed.
        max_iter (int)              : Max iteration for mean shift.

    Returns:
        labels (np.ndarray)         : Input/output integer array that stores the cluster indices for every sample.
                                      The shape is (n_samples, 1)
        centers (np.ndarray)        : Output matrix of the cluster centers, one row per cluster center.
                                      The shape is (k, n_features)
    """
    start = time()
    n_jobs = None
    seeds = get_bin_seeds(data, bandwidth, min_bin_freq)
    n_samples, n_features = data.shape
    center_intensity_dict = {}

    # We use n_jobs=1 here because this estimator is used in nested calls under
    # the parallel calls to mean_shift_single_seed, so there is no need for
    # further parallelism.
    nbrs = NearestNeighbors(radius=bandwidth, n_jobs=1).fit(data)
    # execute iterations on all seeds in parallel
    all_res = Parallel(n_jobs=n_jobs)(
        delayed(mean_shift_single_seed)
        (seed, data, nbrs, max_iter) for seed in seeds)

    # copy results in a dictionary
    for i in range(len(seeds)):
        if all_res[i] is not None:
            center_intensity_dict[all_res[i][0]] = all_res[i][1]

    if not center_intensity_dict:
        # nothing near seeds
        raise ValueError("No point was within bandwidth=%f of any seed."
                         " Try a different seeding strategy"
                         " or increase the bandwidth." % bandwidth)
    


    """ YOUR CODE STARTS HERE """
    # get all peaks of windows
    peaks = np.array(list(center_intensity_dict.keys()))
    # construct class for peaks
    nbrs_peaks = NearestNeighbors(radius=bandwidth, n_jobs=1).fit(peaks)
    
    centers = set()
    for p in peaks:
        # find peaks within bandwidth
        nb = nbrs_peaks.radius_neighbors(np.expand_dims(p, axis=0))
        indices = nb[1][0]
        # if more than 1 peak
        if len(indices) > 1:
            keys = peaks[indices]
            duplicate_peaks = { tuple(k): center_intensity_dict[tuple(k)] for k in keys }
            max_key = tuple(max(duplicate_peaks, key=duplicate_peaks.get))
        else: 
            max_key = tuple(p)
        centers.add(max_key)

    centers = np.array(list(centers))
    
    # assign points to nearest cluster peak        
    nbrs_centers = NearestNeighbors(n_neighbors=1, n_jobs=1).fit(centers)
    d, labels = nbrs_centers.kneighbors(data)

    """ YOUR CODE ENDS HERE """
    end = time()
    runtime = end - start
    print("mean shift running time: %.3fs." % runtime)
    return labels, centers
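A self-contained sketch (toy peaks and samples, not the example's data) of the peak-merging step above: peaks closer than the bandwidth are collapsed onto the most intense one, and every sample is then labelled with its nearest surviving center.

import numpy as np
from sklearn.neighbors import NearestNeighbors

bandwidth = 0.7
center_intensity_dict = {(0.0, 0.0): 40, (0.3, 0.1): 25, (5.0, 5.0): 30}
data = np.array([[0.1, 0.0], [0.2, 0.2], [5.1, 4.9]])

peaks = np.array(list(center_intensity_dict.keys()))
nbrs_peaks = NearestNeighbors(radius=bandwidth).fit(peaks)

centers = set()
for p in peaks:
    indices = nbrs_peaks.radius_neighbors([p], return_distance=False)[0]
    group = {tuple(peaks[i]): center_intensity_dict[tuple(peaks[i])] for i in indices}
    centers.add(max(group, key=group.get))   # keep the most intense peak of each group
centers = np.array(sorted(centers))

_, labels = NearestNeighbors(n_neighbors=1).fit(centers).kneighbors(data)
print(centers)          # two surviving centers
print(labels.ravel())   # [0 0 1]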
Example #25
0
class AnomalyModel:
	def __init__(self, trainingSet, anomalyMethod = "KNN", h = None ):
		self.method = anomalyMethod
		
		if self.method == "online":
			self.h = h
		
		if self.method == "centroid":
			self.h = Util.centroid( trainingSet )
		
		if self.method == "medoid":
			self.h = Util.medoid( trainingSet )
		
		if self.method == "IGNG":
			self.h = IGNG( radius = PARAMS["R"] ) # IGNG.estimate_radius( trainingSet )
			self.h.train( trainingSet )
			# print len( self.h.get_nodes_positions() ), len(trainingSet)
			
		if self.method == "GNG":
			self.h = GNG(period = 50)
			self.h.train( trainingSet )
			
		if self.method == "KNN":
			self.h = NearestNeighbors(algorithm='ball_tree', metric='euclidean').fit(trainingSet)
			
		if self.method == "RNN":
			self.h = NearestNeighbors(algorithm='ball_tree', metric='euclidean').fit(trainingSet)
			
		if self.method == "SVM":
			self.h = svm.OneClassSVM(nu=PARAMS["NU"], kernel="rbf", gamma=PARAMS["GAMMA"]).fit(trainingSet)
			
	def getAnomalyScore(self, x, inversed = False):
		if self.method == "online":
			alpha_m = self.h.getNearestDist(x) # alpha_m = self.h.getNearestDistToMature(x)
			if inversed == True: alpha_m = 1. / alpha_m
			
		if self.method == "centroid":
			alpha_m = Util.dist(x, self.h)
			if inversed == True: alpha_m = 1. / alpha_m
			
		if self.method == "medoid":
			alpha_m = Util.dist(x, self.h)
			if inversed == True: alpha_m = 1. / alpha_m
			
		if self.method == "IGNG":
			alpha_m = self.h.getNearestDist(x)
			if inversed == True: alpha_m = 1. / alpha_m
			
		if self.method == "GNG":
			alpha_m = self.h.getNearestDist(x)
			if inversed == True: alpha_m = 1. / alpha_m
			
		if self.method == "KNN":
			distances, indices = self.h.kneighbors( x, n_neighbors = PARAMS["K"] )
			alpha_m = sum( distances[0] )
			if inversed == True: alpha_m = 1. / alpha_m
			
		if self.method == "RNN":
			distances, indices = self.h.radius_neighbors(x, radius = PARAMS["R"])
			alpha_m = 1. / ( 1. + sum( [ 1./di for di in distances[0] if di != 0 ] ) )
			if inversed == True: alpha_m = 1. / alpha_m
		
		if self.method == "SVM":
			alpha_m = -1. * self.h.decision_function(x)[0][0]
			if inversed == True: alpha_m = -1. * alpha_m
		
		return alpha_m
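A minimal, self-contained sketch of the radius-based "RNN" score used above, with the undefined PARAMS["R"] replaced by a hard-coded radius (an assumption): the score is 1 / (1 + sum of inverse neighbour distances within the radius), so a point in a dense region scores near 0 and an isolated point scores near 1.

import numpy as np
from sklearn.neighbors import NearestNeighbors

training_set = np.random.RandomState(0).normal(size=(200, 2))
model = NearestNeighbors(algorithm='ball_tree', metric='euclidean').fit(training_set)

def rnn_score(x, radius=0.5):
    # radius value is illustrative, standing in for PARAMS["R"]
    distances, _ = model.radius_neighbors(np.atleast_2d(x), radius=radius)
    return 1. / (1. + sum(1. / d for d in distances[0] if d != 0))

print(rnn_score([0.0, 0.0]))   # dense region -> score close to 0
print(rnn_score([6.0, 6.0]))   # far from the data -> score close to 1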
Example #26
0
    def __init__(self,
                 root_dir,
                 cities='',
                 nNeg=5,
                 transform=None,
                 mode='train',
                 task='im2im',
                 subtask='all',
                 seq_length=1,
                 posDistThr=10,
                 negDistThr=25,
                 cached_queries=1000,
                 cached_negatives=1000,
                 positive_sampling=True,
                 bs=24,
                 threads=8,
                 margin=0.1,
                 exclude_panos=True):

        # initializing
        assert mode in ('train', 'val', 'test')
        assert task in ('im2im', 'im2seq', 'seq2im', 'seq2seq')
        assert subtask in ('all', 's2w', 'w2s', 'o2n', 'n2o', 'd2n', 'n2d')
        assert seq_length % 2 == 1
        assert (task == 'im2im' and seq_length == 1) or (task != 'im2im'
                                                         and seq_length > 1)

        if cities in default_cities:
            self.cities = default_cities[cities]
        elif cities == '':
            self.cities = default_cities[mode]
        else:
            self.cities = cities.split(',')

        self.qIdx = []
        self.qImages = []
        self.pIdx = []
        self.nonNegIdx = []
        self.dbImages = []
        self.sideways = []
        self.night = []

        self.all_pos_indices = []

        # hyper-parameters
        self.nNeg = nNeg
        self.margin = margin
        self.posDistThr = posDistThr
        self.negDistThr = negDistThr
        self.cached_queries = cached_queries
        self.cached_negatives = cached_negatives

        # flags
        self.cache = None
        self.exclude_panos = exclude_panos
        self.mode = mode
        self.subtask = subtask
        print('Exclude panoramas:', self.exclude_panos)

        # other
        self.transform = transform

        # define sequence length based on task
        if task == 'im2im':
            seq_length_q, seq_length_db = 1, 1
        elif task == 'seq2seq':
            seq_length_q, seq_length_db = seq_length, seq_length
        elif task == 'seq2im':
            seq_length_q, seq_length_db = seq_length, 1
        else:  # im2seq
            seq_length_q, seq_length_db = 1, seq_length

        # load data
        for city in self.cities:
            print("=====> {}".format(city))

            subdir = 'test' if city in default_cities['test'] else 'train_val'

            # get len of images from cities so far for indexing
            _lenQ = len(self.qImages)
            _lenDb = len(self.dbImages)

            # when GPS / UTM is available
            if self.mode in ['train', 'val']:
                # load query data
                qData = pd.read_csv(join(root_dir, subdir, city, 'query',
                                         'postprocessed.csv'),
                                    index_col=0)
                qDataRaw = pd.read_csv(join(root_dir, subdir, city, 'query',
                                            'raw.csv'),
                                       index_col=0)

                # load database data
                dbData = pd.read_csv(join(root_dir, subdir, city, 'database',
                                          'postprocessed.csv'),
                                     index_col=0)
                dbDataRaw = pd.read_csv(join(root_dir, subdir, city,
                                             'database', 'raw.csv'),
                                        index_col=0)

                # arrange based on task
                qSeqKeys, qSeqIdxs = self.arange_as_seq(
                    qData, join(root_dir, subdir, city, 'query'), seq_length_q)
                dbSeqKeys, dbSeqIdxs = self.arange_as_seq(
                    dbData, join(root_dir, subdir, city, 'database'),
                    seq_length_db)

                # filter based on subtasks
                if self.mode in ['val']:
                    qIdx = pd.read_csv(join(root_dir, subdir, city, 'query',
                                            'subtask_index.csv'),
                                       index_col=0)
                    dbIdx = pd.read_csv(join(root_dir, subdir, city,
                                             'database', 'subtask_index.csv'),
                                        index_col=0)

                    # find all the sequences whose center frame belongs to the subtask
                    val_frames = np.where(qIdx[self.subtask])[0]
                    qSeqKeys, qSeqIdxs = self.filter(qSeqKeys, qSeqIdxs,
                                                     val_frames)

                    val_frames = np.where(dbIdx[self.subtask])[0]
                    dbSeqKeys, dbSeqIdxs = self.filter(dbSeqKeys, dbSeqIdxs,
                                                       val_frames)

                # filter based on panorama data
                if self.exclude_panos:
                    panos_frames = np.where(
                        (qDataRaw['pano'] == False).values)[0]
                    qSeqKeys, qSeqIdxs = self.filter(qSeqKeys, qSeqIdxs,
                                                     panos_frames)

                    panos_frames = np.where(
                        (dbDataRaw['pano'] == False).values)[0]
                    dbSeqKeys, dbSeqIdxs = self.filter(dbSeqKeys, dbSeqIdxs,
                                                       panos_frames)

                unique_qSeqIdx = np.unique(qSeqIdxs)
                unique_dbSeqIdx = np.unique(dbSeqIdxs)

                # if a combination of city, task and subtask is chosen for which there are no query/database images,
                # then continue to the next city
                if len(unique_qSeqIdx) == 0 or len(unique_dbSeqIdx) == 0:
                    continue

                self.qImages.extend(qSeqKeys)
                self.dbImages.extend(dbSeqKeys)

                qData = qData.loc[unique_qSeqIdx]
                dbData = dbData.loc[unique_dbSeqIdx]

                # useful indexing functions
                seqIdx2frameIdx = lambda seqIdx, seqIdxs: seqIdxs[seqIdx]
                # frameIdx2seqIdx = lambda frameIdx, seqIdxs: np.where(seqIdxs == frameIdx)[0][1]
                frameIdx2uniqFrameIdx = lambda frameIdx, uniqFrameIdx: np.where(
                    np.in1d(uniqFrameIdx, frameIdx))[0]
                uniqFrameIdx2seqIdx = lambda frameIdxs, seqIdxs: \
                    np.where(np.in1d(seqIdxs, frameIdxs).reshape(seqIdxs.shape))[0]

                # utm coordinates
                utmQ = qData[['easting', 'northing']].values.reshape(-1, 2)
                utmDb = dbData[['easting', 'northing']].values.reshape(-1, 2)

                night, sideways, index = qData['night'].values, (
                    qData['view_direction'] == 'Sideways').values, qData.index

                # find positive images for training
                neigh = NearestNeighbors(algorithm='brute')
                neigh.fit(utmDb)
                pos_distances, pos_indices = neigh.radius_neighbors(
                    utmQ, self.posDistThr)
                self.all_pos_indices.extend(pos_indices)

                if self.mode == 'train':
                    nD, nI = neigh.radius_neighbors(utmQ, self.negDistThr)

                for q_seq_idx in range(len(qSeqKeys)):

                    q_frame_idxs = seqIdx2frameIdx(q_seq_idx, qSeqIdxs)
                    q_uniq_frame_idx = frameIdx2uniqFrameIdx(
                        q_frame_idxs, unique_qSeqIdx)

                    p_uniq_frame_idxs = np.unique([
                        p for pos in pos_indices[q_uniq_frame_idx] for p in pos
                    ])

                    # the query image has at least one positive
                    if len(p_uniq_frame_idxs) > 0:
                        p_seq_idx = np.unique(
                            uniqFrameIdx2seqIdx(
                                unique_dbSeqIdx[p_uniq_frame_idxs], dbSeqIdxs))

                        self.pIdx.append(p_seq_idx + _lenDb)
                        self.qIdx.append(q_seq_idx + _lenQ)

                        # in training we have two thresholds, one for finding positives and one for finding images
                        # that we are certain are negatives.
                        if self.mode == 'train':

                            n_uniq_frame_idxs = np.unique([
                                n for nonNeg in nI[q_uniq_frame_idx]
                                for n in nonNeg
                            ])
                            n_seq_idx = np.unique(
                                uniqFrameIdx2seqIdx(
                                    unique_dbSeqIdx[n_uniq_frame_idxs],
                                    dbSeqIdxs))

                            self.nonNegIdx.append(n_seq_idx + _lenDb)

                            # gather meta which is useful for positive sampling
                            if sum(night[np.in1d(index, q_frame_idxs)]) > 0:
                                self.night.append(len(self.qIdx) - 1)
                            if sum(sideways[np.in1d(index, q_frame_idxs)]) > 0:
                                self.sideways.append(len(self.qIdx) - 1)

            # when GPS / UTM / pano info is not available
            elif self.mode in ['test']:

                # load images for subtask
                qIdx = pd.read_csv(join(root_dir, subdir, city, 'query',
                                        'subtask_index.csv'),
                                   index_col=0)
                dbIdx = pd.read_csv(join(root_dir, subdir, city, 'database',
                                         'subtask_index.csv'),
                                    index_col=0)

                # arrange in sequences
                qSeqKeys, qSeqIdxs = self.arange_as_seq(
                    qIdx, join(root_dir, subdir, city, 'query'), seq_length_q)
                dbSeqKeys, dbSeqIdxs = self.arange_as_seq(
                    dbIdx, join(root_dir, subdir, city, 'database'),
                    seq_length_db)

                # filter query based on subtask
                val_frames = np.where(qIdx[self.subtask])[0]
                qSeqKeys, qSeqIdxs = self.filter(qSeqKeys, qSeqIdxs,
                                                 val_frames)

                # filter database based on subtask
                val_frames = np.where(dbIdx[self.subtask])[0]
                dbSeqKeys, dbSeqIdxs = self.filter(dbSeqKeys, dbSeqIdxs,
                                                   val_frames)

                self.qImages.extend(qSeqKeys)
                self.dbImages.extend(dbSeqKeys)

                # add query index
                self.qIdx.extend(list(range(_lenQ, len(qSeqKeys) + _lenQ)))

                # if a combination of cities, task and subtask is chosen, where there are no query/database images,
                # then exit
        if len(self.qImages) == 0 or len(self.dbImages) == 0:
            print("Exiting...")
            print(
                "A combination of cities, task and subtask have been chosen, where there are no query/database images."
            )
            print("Try choosing a different subtask or more cities")
            sys.exit()

        # cast to np.arrays for indexing during training
        self.qIdx = np.asarray(self.qIdx)
        self.qImages = np.asarray(self.qImages)
        self.pIdx = np.asarray(self.pIdx)
        self.nonNegIdx = np.asarray(self.nonNegIdx)
        self.dbImages = np.asarray(self.dbImages)
        self.sideways = np.asarray(self.sideways)
        self.night = np.asarray(self.night)

        # decide device type ( important for triplet mining )
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.threads = threads
        self.bs = bs

        if mode == 'train':

            # for now always 1-1 lookup.
            self.negCache = np.asarray([np.empty(
                (0, ), dtype=int)] * len(self.qIdx))

            # calculate weights for positive sampling
            if positive_sampling:
                self.__calcSamplingWeights__()
            else:
                self.weights = np.ones(len(self.qIdx)) / float(len(self.qIdx))
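A stripped-down sketch of the positive / non-negative mining above, using toy UTM coordinates (the thresholds mirror posDistThr and negDistThr): database points within posDistThr of a query are candidate positives, and anything within negDistThr is excluded from the negative pool.

import numpy as np
from sklearn.neighbors import NearestNeighbors

utmDb = np.array([[0.0, 0.0], [8.0, 0.0], [40.0, 0.0]])
utmQ = np.array([[1.0, 0.0], [39.0, 0.0]])
posDistThr, negDistThr = 10, 25

neigh = NearestNeighbors(algorithm='brute').fit(utmDb)
_, pos_indices = neigh.radius_neighbors(utmQ, posDistThr)      # potential positives
_, nonneg_indices = neigh.radius_neighbors(utmQ, negDistThr)   # too close to serve as negatives

for q, (pos, nonneg) in enumerate(zip(pos_indices, nonneg_indices)):
    print("query", q, "positives", pos.tolist(), "non-negatives", nonneg.tolist())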
Example #27
0
def pms(substack, args):
    """Find cells using mean shift.

    In this version, the substack is split into eight patches.

    Parameters
    ----------
    substack : object
        :class:`bcfind.volume.SubStack` object representing the substack to be analyzed.
    args : object
        :py:class:`argparse.Namespace` object containing the
        arguments passed to the find_cells script, in particular
        - args.outdir: directory where results are saved
        - args.hi_local_max_radius: radius of the sphere used to decide whether a local maximum should be a seed
        - args.mean_shift_bandwidth: bandwidth for the mean shift algorithm
        - args.floating_point: bool, whether cell coordinates should be rounded before saving
    """
    D = substack.info['Depth']
    W = substack.info['Width']
    H = substack.info['Height']
    M = 20
    patch = np.zeros((W, H, D))
    for z in range(D):
        patch[:, :, z] = np.array(substack.imgs[z]).T
    slicesx = [slice(0, W // 2 + M), slice(W // 2 - M, W)]
    slicesy = [slice(0, H // 2 + M), slice(H // 2 - M, H)]
    slicesz = [slice(0, D // 2 + M), slice(D // 2 - M, D)]
    cluster_centers = np.zeros((0, 3))
    cluster_masses = np.zeros(0)
    L = np.zeros((0, 3))
    labels = np.zeros(0)
    seeds = []
    counter = 0
    for sx in slicesx:
        for sy in slicesy:
            for sz in slicesz:
                counter += 1
                tee.log('%d/8:' % counter, 'Analyzing minisubstack', sx, sy,
                        sz)
                rval = _patch_ms(patch[sx, sy, sz], args)
                origin = [sx.start, sy.start, sz.start]
                if rval is not None:
                    cluster_centers = np.concatenate(
                        (cluster_centers, rval.cluster_centers + origin))
                    cluster_masses = np.concatenate(
                        (cluster_masses, rval.masses))
                    labels = np.concatenate(
                        (labels, rval.labels + len(rval.cluster_centers)))
                    L = np.concatenate((L, rval.L + origin))
                    for c in rval.seeds:
                        c.x += origin[0]
                        c.y += origin[1]
                        c.z += origin[2]
                    seeds.extend(rval.seeds)
    if len(cluster_centers) > 0:
        # remove near duplicate points (because of overlapping margins)
        indices = np.argsort(cluster_masses)
        sorted_centers = cluster_centers[indices]
        sorted_masses = cluster_masses[indices]
        # sorted_volumes = volumes[indices]
        unique = np.ones(len(sorted_centers), dtype=bool)
        # FIXME - make it a parameter
        nbrs = NearestNeighbors(radius=5.5).fit(sorted_centers)
        for i, center in enumerate(sorted_centers):
            if unique[i]:
                neighbor_idxs = nbrs.radius_neighbors([center],
                                                      return_distance=False)[0]
                unique[neighbor_idxs] = 0
                unique[i] = 1  # leave the current point as unique
        cluster_centers = sorted_centers[unique]
        masses = sorted_masses[unique]
        masses_mean = np.mean(masses)
        masses_std = np.std(masses)
        # volumes = sorted_volumes[unique]
    C = []
    for i, cc in enumerate(cluster_centers):
        c = volume.Center(cc[0], cc[1], cc[2])
        c.name = 'MS_center %d' % i
        c.volume = (masses[i] - masses_mean) / masses_std  # volumes[i]
        c.mass = masses[i]
        tee.log(i, cc, c)
        C.append(c)

    filename = args.outdir + '/ms.marker'
    substack.save_markers(filename, C, floating_point=args.floating_point)
    tee.log('Markers saved to', filename)
    filename = args.outdir + '/seeds.marker'
    substack.save_markers(filename, seeds)
    tee.log(len(seeds), 'seeds saved to', filename)

    up_outdir = dirname(abspath(args.outdir))
    if args.save_image:
        image_saver = volume.ImageSaver(up_outdir, substack, C)
        Lx = [int(x) for x in L[:, 0]]
        Ly = [int(y) for y in L[:, 1]]
        Lz = [int(z) for z in L[:, 2]]
        image_saver.save_above_threshold(Lx, Ly, Lz)

        tee.log('Debugging images saved in', up_outdir)
    else:
        tee.log('Debugging images not saved')
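A self-contained sketch of the duplicate-removal pattern used above: centers are visited in order of decreasing mass (heaviest first, as in scikit-learn's MeanShift, which is an assumption about the intended ordering), and any center within the radius of an already accepted one is suppressed. Toy data; the radius here is illustrative, not the example's hard-coded 5.5.

import numpy as np
from sklearn.neighbors import NearestNeighbors

centers = np.array([[0.0, 0.0, 0.0], [0.5, 0.0, 0.0], [10.0, 0.0, 0.0]])
masses = np.array([3.0, 9.0, 5.0])
radius = 1.0

order = np.argsort(masses)[::-1]          # heaviest first
sorted_centers = centers[order]
unique = np.ones(len(sorted_centers), dtype=bool)
nbrs = NearestNeighbors(radius=radius).fit(sorted_centers)
for i, center in enumerate(sorted_centers):
    if unique[i]:
        neighbor_idxs = nbrs.radius_neighbors([center], return_distance=False)[0]
        unique[neighbor_idxs] = 0
        unique[i] = 1                     # keep the current center itself

print(sorted_centers[unique])             # [[0.5, 0, 0], [10, 0, 0]]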
    def posq_rrtstar(self, sample_fn,bias = 0.06):
        """
        RRT* Algorithm
        """
        print "KINODYNAMIC PLANNING"
        marker_points = MarkerArray()

        vol_freecells = len(self._freecells)*self._navmap.info.resolution**2
        print "FREE CELL VOLUME", vol_freecells
        gamma_rrg = 2*sqrt(1.5*vol_freecells/pi)
        probot = np.array([self._robot_pose.pose.position.x,self._robot_pose.pose.position.y,2*np.arccos(self._robot_pose.pose.orientation.w)])

        # V is a list of vertices. E is a dictionary where each key is connected to its values.
        # parents is a dictionary mapping each node (key) to its parent (value); since each node is a key, it has exactly one parent.
        V = [probot]
        p4dist = np.zeros(4)
        p4dist[:2] = probot[:2];p4dist[2] = np.cos(probot[2]);p4dist[3] = np.sin(probot[2])
        V_xy = [p4dist] # V_xy is the data structure for the nearest neighbours
        #V_xy = [probot[:2]]
        E = {}
        parents = {}
        Dist = [0.0]
        # C stores the cost at vertex idx, which is the sum of the edges going to it.
        goal_xy = np.array([self._goal.pose.position.x,self._goal.pose.position.y,2*np.arccos(self._goal.pose.orientation.w)])
        c_init = self.cost_manager.get_cost(probot[:2],goal_xy[:2])
        edge_C = {}
        planning_time=self.planning_time
        nbrs = NearestNeighbors(n_neighbors=1,algorithm="kd_tree",leaf_size = 30)
        lowest_cost_idx = None
        nbrs.fit(V_xy)
        t1 = time.time()
        planning_done = False
        rrt_iter = 0
        pub_path = []

        while not planning_done:
            t2 = time.time()
            #bias*=1.001 # gradually reduce the bias for the target
            """
            Sampling new point
            """
            reached = False
            samp_count = 0
            alternative = True
            while reached ==False:
                    
                prand,g_s = sample_fn(goal_xy,bias = bias)
                p4dist = np.zeros(4)
                p4dist[:2] = prand[:2];p4dist[2] = np.cos(prand[2]);p4dist[3] = np.sin(prand[2])
                #(dist, idx) = nbrs.kneighbors(prand.reshape(1, -1))
                (dist, idx) = nbrs.kneighbors(p4dist.reshape(1, -1))
                pnearest_idx = idx.ravel()[0]
                pnearest = V[pnearest_idx]

                """
                Turning new point into reachable point
                """
                stp = 50
                path_new,reached,stp = posq.simulate(pnearest,prand,steps = stp,return_steps=True)
                pnew = path_new[-1]
                if alternative == True:
                    d = prand[:2] - pnearest[:2]
                    ang = np.arctan2(d[1],d[0])

                    add = np.array([self._rrt_eta*np.cos(ang),self._rrt_eta*np.sin(ang),ang])
                    pnew =np.zeros(3)
                    pnew[:2] = pnearest[:2]+add[:2]
                    pnew[2] = ang                
                #self.publish_local_path(pub_path)
                #pnew = [pnearest[0]+ self._rrt_eta*np.cos(pnearest[2]),pnearest[1]+ self._rrt_eta*np.sin(pnearest[2]),pnearest[2]]
                if reached == True:
                    pnew = prand
                elif reached == False:
                    stp = 400
                    path_new,reached,stp = posq.simulate(pnearest,pnew,steps = stp,return_steps = True,eps=0.1)
                    pnew = path_new[-1]
                """
                Checking if segment is valid and updating graph
                """
                #stp = 40
                
                #stp = 30

            if self.path_safe(path_new):
                r = np.min([gamma_rrg*sqrt(log(len(V))/float(len(V))),self._rrt_eta])
                p4dist = np.zeros(4)
                p4dist[:2] = pnew[:2];p4dist[2] = np.cos(pnew[2]);p4dist[3] = np.sin(pnew[2])
                Pnear_idx = nbrs.radius_neighbors(p4dist.reshape(1, -1), r, return_distance = False)
                #Pnear_idx = nbrs.radius_neighbors(p4dist.reshape(1, -1), r, return_distance = False)
                Pnear_idx = Pnear_idx[0]
                pmin_idx = pnearest_idx
                min_edge_c = self.cost_manager.path_cost(path_new,goal_xy[:2])
                cum_c = self.integrate_costs(edge_C,parents,pnearest_idx)
                cmin = cum_c +min_edge_c
                #if len(Pnear_idx)>5:
                #   Pnear_idx = Pnear_idx[:5]
                cumulative_costs = []
                for p_idx in Pnear_idx:
                    p = V_xy[p_idx]
                    p_xyz = V[p_idx]

                    cum_cost = self.integrate_costs(edge_C,parents,p_idx)
                    cumulative_costs.append(cum_cost)
                    # WATCH OUT. You might get a nearest neighbour problem if the steps are not good enough.
                    # Perhaps we can have a distance simulation so that the nearest neighbour calculation remains consistent.
                    p_idx_path,reached = posq.simulate(p_xyz,pnew,steps = int(stp),eps = 0.1)
                    #reached = False
                    safe = self.path_safe(p_idx_path)
                    if reached == True and safe == True:
                        path_c = self.cost_manager.path_cost(p_idx_path,goal_xy[:2])
                    else:
                        path_c = 0
                    #reached = False
                    c = cum_cost + path_c
                    if (safe is True and
                        reached is True and c < cmin):
                        cmin = c
                        min_edge_c = path_c
                        pmin_idx = p_idx      

                if pmin_idx in E:
                    E[pmin_idx].add(len(V))
                else:
                    E[pmin_idx] = set([len(V)])   
                edge_C[pmin_idx,len(V)] = min_edge_c  
                cumulative_last = cmin     
                pnew_idx = len(V)
                V.append(pnew)
                #V_xy.append(pnew[:2])
                V_xy.append(p4dist)
                parents[pnew_idx] = pmin_idx
                """
                Re-wire the tree
                """
                for en,p_idx in enumerate(Pnear_idx):
                    # so if the near nodes, have children
                    #parent
                    if p_idx in parents:
                        p = V_xy[p_idx]
                        p_xyz = V[p_idx]
                        rewire_path,rewire_reached = posq.simulate(pnew,p_xyz,steps = int(stp),eps = 0.1)
                        #rewire_reached = False
                        rewire_safe = self.path_safe(rewire_path)
                        if rewire_reached == True and rewire_safe == True:
                            rewire_path_c = self.cost_manager.path_cost(rewire_path,goal_xy[:2])
                        else:
                            rewire_path_c = 0
                        c = cumulative_last + rewire_path_c

                        if (rewire_safe is True and c < cumulative_costs[en] and rewire_reached is True):
                            E[parents[p_idx]].remove(p_idx)
                            edge_C.pop((parents[p_idx], p_idx), None)
                            edge_C[pnew_idx,p_idx] = rewire_path_c
                            parents[p_idx] = pnew_idx
                            if pnew_idx in E:
                                E[pnew_idx].add(p_idx)
                            else:
                                E[pnew_idx] = set([p_idx])
                nbrs.fit(V_xy)

            rrt_iter +=1

            if time.time()-t1>self.max_planning_time:
                p4dist = np.zeros(4)
                p4dist[:2] = goal_xy[:2]; p4dist[2] = np.cos(goal_xy[2]); p4dist[3] = np.sin(goal_xy[2])
                # grow the search radius around the goal until at least one vertex is found
                points_near_goal = []
                add = 0
                while len(points_near_goal) == 0:
                    dist, points_near_goal = nbrs.radius_neighbors(p4dist.reshape(1, -1), self.goal_tolerance + add, return_distance=True)
                    points_near_goal = points_near_goal[0]
                    add += 0.1
                print("Could not find a solution within the maximum planning time, using the vertex closest to the goal.")
                planning_done = True
            elif time.time()-t1>planning_time:
                p4dist = np.zeros(4)
                p4dist[:2] = goal_xy[:2]; p4dist[2] = np.cos(goal_xy[2]); p4dist[3] = np.sin(goal_xy[2])
                dist, points_near_goal = nbrs.radius_neighbors(p4dist.reshape(1, -1), self.goal_tolerance + 0.2, return_distance=True)
                #dist, points_near_goal = nbrs.radius_neighbors(goal_xy, self.goal_tolerance, return_distance=True)
                dist, point = nbrs.kneighbors(p4dist.reshape(1, -1))
                print("DISTANCE FROM CLOSEST", dist)
                points_near_goal = points_near_goal[0]
                if len(points_near_goal)==0:
                    planning_done = False
                    planning_time+=5.
                    if bias < 0.5:
                        bias =0.9
                else:
                    planning_done = True
            #self.publish_rrt(V,E)     

        #self.samp_point_pub.publish(marker_points)
        """
        Find best path:
        """
        min_cost = float('inf')
        for i in points_near_goal:
            c_path = self.integrate_costs(edge_C,parents,i)
            if c_path < min_cost:
                m = i
                min_cost = c_path
        print(len(V))
        self.publish_rrt(V,E)
        print("MINIMUM PATH COST RRT", min_cost)
        path = self.get_path(parents,V,m)
        pt = path_to_pose(path)            
        print('total time: ', time.time() - t1)
        self._path_pub.publish(pt)
        return pt,path
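A small, self-contained sketch of the 4-D pose embedding used above for the kd-tree queries: a planar pose (x, y, theta) is mapped to (x, y, cos(theta), sin(theta)) so that the Euclidean metric treats orientation continuously (theta = 0 and theta = 2*pi land on the same point). The helper name pose_to_4d is hypothetical.

import numpy as np
from sklearn.neighbors import NearestNeighbors

def pose_to_4d(pose):
    x, y, theta = pose
    return np.array([x, y, np.cos(theta), np.sin(theta)])

poses = [np.array([0.0, 0.0, 0.0]),
         np.array([1.0, 0.0, np.pi]),
         np.array([0.0, 0.0, 2 * np.pi])]   # same physical pose as the first entry

nbrs = NearestNeighbors(n_neighbors=1, algorithm="kd_tree").fit([pose_to_4d(p) for p in poses])
dist, idx = nbrs.kneighbors(pose_to_4d(np.array([0.0, 0.0, 0.05])).reshape(1, -1))
print(idx.ravel()[0], dist.ravel()[0])      # matches pose 0 (or 2) with a tiny distance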
Example #29
0
class NeighborSimilarity(ISimilarity):
    """The neighborhood similarity model.

    The neighbor similarity model determines similarity between the data
    in the indexing structure and the query data by using the nearest
    neighbor algorithm :class:`sklearn.neighbors.NearestNeighbors`.

    Both a k-neighbors classifier and a radius-neighbor-classifier are implemented.
    To choose between the classifiers either `n_neighbors` or `radius` must be
    specified.

    Parameters
    ----------
    n_neighbors : int
        The number of data points considered to be closest neighbors.
    radius : int
        The radius around the query data point, within which the data points
        are considered closest neighbors.
    algorithm : str
        The internal indexing structure of the training data. Defaults to
        `kd-tree`.
    metric : str
        The metric used to compute the distances between pairs of points.
        Refer to :class:`sklearn.neighbors.DistanceMetric` for valid
        identifiers. Default is `euclidean`.
    metric_params : dict
        Parameters relevant to the specified metric.

    Raises
    ------
    UserWarning :
        If either both or none of `n_neighbors` and `radius` are given.

    See Also
    --------
    :class:`sklearn.neighbors.KNeighborsClassifier`, :class:`sklearn.neighbors.RadiusNeighborsClassifier`

    """
    def __init__(self, n_neighbors=None, radius=None, algorithm=None, metric=None, metric_params=None):
        super(NeighborSimilarity, self).__init__()

        if (n_neighbors is not None and radius is not None) or (n_neighbors is None and radius is None):
            raise UserWarning("Exactly one of n_neighbors or radius must be initialized.")

        self._n_neighbors = n_neighbors
        self._radius = radius

        if algorithm is not None:
            if algorithm not in ["ball_tree", "kd_tree", "brute", "auto"]:
                raise ValueError("%s is not a valid retrieval algorithm" % algorithm)
            self._algorithm = algorithm
        else:
            self._algorithm = "kd_tree"

        if metric is not None:
            if metric not in METRIC_MAPPING:
                raise ValueError("%s is not a valid retrieval metric" % metric)
            self._metric = metric
        else:
            self._metric = "euclidean"

        self._metric_params = metric_params if metric_params is not None else 2

    def build_indexing_structure(self, data, id_map):
        """Build the indexing structure.

        Build the indexing structure by fitting the data according to the
        specified algorithm.

        Parameters
        ----------
        data : ndarray[ndarray[float]]
            The raw data points to be indexed.
        id_map : dict[int, int]
            The mapping from the data points to their case ids.
        """
        self._id_map = id_map

        if self._n_neighbors is not None:
            self._indexing_structure = NearestNeighbors(n_neighbors=self._n_neighbors, algorithm=self._algorithm,
                                                        metric=self._metric, p=self._metric_params).fit(data)
        else:
            self._indexing_structure = NearestNeighbors(radius=self._radius, algorithm=self._algorithm,
                                                        metric=self._metric, p=self._metric_params).fit(data)

    def compute_similarity(self, data_point):
        """Computes the similarity.

        Computes the similarity between the data point and the data in
        the indexing structure using the :class:`sklearn.neighbors.NearestNeighbors`
        algorithm. The results are returned in a collection of similarity statistics
        (:class:`Stat`).

        Parameters
        ----------
        data_point : list[float]
            The raw data point to compare against the data points stored in the
            indexing structure.

        Returns
        -------
        list[Stat] :
            A collection of similarity statistics.

        """
        if self._n_neighbors is not None:
            # noinspection PyProtectedMember
            raw_data = self._indexing_structure._fit_X
            if len(raw_data) < self._n_neighbors:
                result = []
                for i, feat in enumerate(raw_data):
                    dist = np.linalg.norm(np.asarray(data_point) - np.asarray(feat))
                    result.append(Stat(self._id_map[i], dist))

                # noinspection PyShadowingNames
                result = sorted(result, key=lambda x: x.similarity)
            else:
                d, key_lists = self._indexing_structure.kneighbors(data_point)
                result = [Stat(self._id_map[x], d[0][i]) for i, x in enumerate(key_lists[0])]

        else:
            d, key_lists = self._indexing_structure.radius_neighbors(data_point)
            result = [Stat(self._id_map[x], d[0][i]) for i, x in enumerate(key_lists[0])]
        return result
    def make_cached_rrt(self,sample_fn,points_to_cache = 4500,bias=0.02):
        """
        CAching the RRT
        """
        print "NOW CACHING RRT ---"
        marker_points = MarkerArray()
        vol_freecells = len(self._freecells)*self._navmap.info.resolution**2
        gamma_rrg = 2*sqrt(1.5*vol_freecells/pi)
        probot = np.array([self._robot_pose.pose.position.x,self._robot_pose.pose.position.y,2*np.arccos(self._robot_pose.pose.orientation.w)])

        # V is a list of vertices. E is a dictionary where each key is connected to its values.
        # parents is a dictionary mapping each node (key) to its parent (value); since each node is a key, it has exactly one parent.
        V = [probot]
        p4dist = np.zeros(4)
        p4dist[:2] = probot[:2];p4dist[2] = np.cos(probot[2]);p4dist[3] = np.sin(probot[2])
        V_xy = [p4dist]

        sampled_points = []
        Dist = [0.0]
        # C stores the cost at vertex idx, which is the sum of the edges going to it.
        goal_xy = np.array([self._goal.pose.position.x,self._goal.pose.position.y,2*np.arccos(self._goal.pose.orientation.w)])
        edge_C = {}
        planning_time=self.planning_time
        nbrs = NearestNeighbors(n_neighbors=1,algorithm="kd_tree",leaf_size = 30)
        lowest_cost_idx = None
        nbrs.fit(V_xy)
        t1 = time.time()
        planning_done = False
        rrt_iter = 0
        bias = 0.02
        stp = 8
        while not planning_done:
            cached_nbrs ={}
            t2 = time.time()
            #bias/=1.001 # gradually reduce the bias for the target
            """
            Sampling new point
            """
            reached = False
            samp_count = 0
            alternative = True
            while reached ==False:
                    
                prand,g_s = sample_fn(goal_xy,bias = bias)
                p4dist = np.zeros(4)
                p4dist[:2] = prand[:2];p4dist[2] = np.cos(prand[2]);p4dist[3] = np.sin(prand[2])
                #(dist, idx) = nbrs.kneighbors(prand.reshape(1, -1))
                (dist, idx) = nbrs.kneighbors(p4dist.reshape(1, -1))
                pnearest_idx = idx.ravel()[0]
                pnearest = V[pnearest_idx]

                """
                Turning new point into reachable point
                """


                stp = 50
                path_new,reached,stp = posq.simulate(pnearest,prand,steps = stp,return_steps=True)
                pnew = path_new[-1]
                if alternative == True:
                    d = prand[:2] - pnearest[:2]
                    ang = np.arctan2(d[1],d[0])

                    add = np.array([self._rrt_eta*np.cos(ang),self._rrt_eta*np.sin(ang),ang])
                    pnew =np.zeros(3)
                    pnew[:2] = pnearest[:2]+add[:2]
                    pnew[2] = ang
                
                #self.publish_local_path(pub_path)
                #pnew = [pnearest[0]+ self._rrt_eta*np.cos(pnearest[2]),pnearest[1]+ self._rrt_eta*np.sin(pnearest[2]),pnearest[2]]
                if reached == True:
                    pnew = prand
                elif reached == False:
                    stp = 400
                    path_new,reached,stp = posq.simulate(pnearest,pnew,steps = stp,return_steps = True,eps=0.1)
                    pnew = path_new[-1]
                """
                Checking if segment is valid and updating graph
                """
                #stp = 40
                
                #stp = 30

            """
            Checking if segment is valid and updating graph
            """
            if self.path_safe(path_new) is True:

                r = np.min([gamma_rrg*sqrt(log(len(V))/float(len(V))),self._rrt_eta])
                p4dist = np.zeros(4)
                p4dist[:2] = pnew[:2];p4dist[2] = np.cos(pnew[2]);p4dist[3] = np.sin(pnew[2])
                #Pnear_idx = nbrs.radius_neighbors(pnew.reshape(1, -1)[:,:2], r, return_distance = False)
                Pnear_idx = nbrs.radius_neighbors(p4dist.reshape(1, -1), r, return_distance = False)
                Pnear_idx = Pnear_idx[0]
                cached_nbrs["prand"] = prand
                cached_nbrs["pnearest_idx"] = pnearest_idx
                cached_nbrs["path_new"] = path_new
                cached_nbrs["pnew"] = pnew
                cached_nbrs["path_new"] = path_new
                cached_nbrs["Pnear_idx"] = Pnear_idx
                cached_nbrs["Pnear_forward"] = []
                cached_nbrs["Pnear_backward"] = []
                cached_nbrs["pnear_pnew"] = []
                cached_nbrs["pnew_pnear"] = []
                for p_idx in Pnear_idx:
                    p = V_xy[p_idx]
                    p_xyz = V[p_idx]
                    #path_forward,reached_forward = posq.simulate(p_xyz,pnew,steps = int(stp))
                    path,reached = posq.simulate(p_xyz,pnew,steps = int(stp))
                    if reached == True:
                        safe = self.path_safe(path)
                        if safe is True:
                            path_info = ({"path":path,"reached":reached,"safe":safe})
                            cached_nbrs["pnear_pnew"].append(path_info)
                            cached_nbrs["Pnear_forward"].append(p_idx)
                    # else: 
                    #     path_forward_safe = None

                    path,reached = posq.simulate(pnew,p_xyz,steps = int(stp)) 
                    if reached == True:
                        safe = self.path_safe(path)
                        if safe == True:
                            path_info = ({"path":path,"reached":reached,"safe":safe})
                            cached_nbrs["pnew_pnear"].append(path_info)
                            cached_nbrs["Pnear_backward"].append(p_idx)
                    # else: 
                    #     path_backward_safe = None
                    #path_info = ({"forward":path_forward,"reached_forward":reached_forward,"safe_forward":path_forward_safe,
                    #"backward":path_backward,"reached_backward":reached_backward,"safe_backward":path_backward_safe})
                    #cached_nbrs["pnear_pnew"].append(path_info)
                V.append(pnew)
                V_xy.append(p4dist)
                nbrs.fit(V_xy)
                #mark = self.make_sample_marker(pnew)
                #marker_points.markers.append(mark)
                sampled_points.append(cached_nbrs)
            rrt_iter +=1
            if len(V) == points_to_cache-50:
                bias = 0.9
            if len(V) == points_to_cache:
                planning_done=True

                print "Number of cached points:",len(V)
                print "time taken",time.time()-t1
        return sampled_points
Example #31
0
    for i in range(nb_items):
        items[i, 0] = np.random.randint(0, 100)
        items[i, 1] = np.random.randint(0, 100)
        items[i, 2] = np.random.randint(0, 100)
        items[i, 3] = np.random.randint(0, 100)

    metrics = ['euclidean', 'hamming', 'jaccard']

    for metric in metrics:
        print('Metric: %r' % metric)

        # Fit k-nearest neighbors
        nn = NearestNeighbors(n_neighbors=10, radius=5.0, metric=metric)
        nn.fit(items)

        # Create a test product
        test_product = np.array([15, 60, 28, 73])

        # Determine the neighbors with different radii
        d, suggestions = nn.radius_neighbors(test_product.reshape(1, -1),
                                             radius=20)

        print('Suggestions (radius=20):')
        print(suggestions)

        d, suggestions = nn.radius_neighbors(test_product.reshape(1, -1),
                                             radius=30)

        print('Suggestions (radius=30):')
        print(suggestions)
Example #32
0
    plt.imsave('../images/baboon_mean_shift.png', babbon_img)

    fig = plt.figure(figsize=(30, 30))
    fig.add_subplot(1, 3, 1)
    plt.imshow(smooth_baboon[::2, ::2])
    plt.title("Smooth Baboon Image")
    plt.colorbar()
    fig.add_subplot(1, 3, 2)
    plt.imshow(babbon_img)
    plt.title("Mean Shift Image without clustering")
    plt.colorbar()

    # after computing the segmented image, cluster pixels whose intensity values fall within the given radius of each other

    nbrs = NearestNeighbors(radius=0.007, algorithm='auto').fit(data_c[:, :3])
    dis, ind = nbrs.radius_neighbors(data_c[:, :3])

    new_data = np.copy(data_c)
    for i in range(len(ind)):
        nn = data_c[list(ind[i])]
        nn[:, :3] = new_data[i, :3]  # propagate this pixel's value to its neighbours
        new_data[list(ind[i]), :3] = nn[:, :3]

    seg_img = np.zeros_like(baboon_array[::2, ::2])

    for d in new_data:
        seg_img[int(d[3]), int(d[4])] = d[:3]

    plt.imsave('../images/baboon_clustered.png', seg_img)

    fig.add_subplot(1, 3, 3)
Example #33
0
class NearestNeighborsRatioEstimator(object):
    """ Nearest neighbor ratio estimator
    """
    def __init__(self, n_neighbors=2):
        """ Instantiates the learner.
            n_neighbors: number of neighbors for KNN estimator
        """

        self.n_neighbors = n_neighbors

    def get_params(self, deep=True):
        """ Get parameters (for scikit-learn)
        """
        return {"n_neighbors": self.n_neighbors}

    def set_params(self, **parameters):
        """ Set parameters (for scikit-learn)
        """
        for parameter, value in parameters.items():
            setattr(self, parameter, value)

    def fit(self, X_tr, X_te):
        """ Fit the model
            X_tr: training sample
            X_te: test sample
        """
        self.n_tr = X_tr.shape[0]
        self.n_te = X_te.shape[0]

        # build kd-trees for both domains
        self.nbrs_tr = NearestNeighbors(n_neighbors=self.n_neighbors,
                                        algorithm='kd_tree').fit(X_tr)
        self.nbrs_te = NearestNeighbors(n_neighbors=self.n_neighbors,
                                        algorithm='kd_tree').fit(X_te)

    def fit_cv(self,
               X_tr,
               X_te,
               K_list,
               n_cv=5,
               n_jobs=0,
               shuffle=False,
               random_state=42):
        """ Fit the model using cross-validation to find an optimal number of K neighbors
            X_tr: training sample
            X_te: test sample
            K_list: list of K values to consider for cross-validation
            n_cv: number of folds for cross-validation
            n_jobs: number of jobs to use when running cross-validation in parallel,
                    If 0 or 1, run jobs sequentially.
                    If -1, take len(K_list) jobs.
        """
        if n_jobs == -1:
            n_jobs = len(K_list)

        kf_tr = KFold(n_splits=n_cv,
                      shuffle=shuffle,
                      random_state=random_state).split(X_tr)
        kf_te = KFold(n_splits=n_cv,
                      shuffle=shuffle,
                      random_state=random_state).split(X_te)
        kf_tr, kf_te = list(kf_tr), list(kf_te)
        if n_jobs == 0 or n_jobs == 1:
            self.losses = []
            for K in K_list:
                self.losses.append(cv_loss(X_tr, X_te, kf_tr, kf_te, K, n_cv))
        else:
            self.losses = Parallel(n_jobs=n_jobs)(
                delayed(cv_loss)(X_tr, X_te, kf_tr, kf_te, K, n_cv)
                for K in K_list)

        self.n_neighbors = K_list[np.argmin(self.losses)]
        print(f"Optimal K neighbors: {self.n_neighbors}")
        self.fit(X_tr, X_te)

    def compute_weights(self, X_ev):
        """ Predicts weights for a set of (evaluation) patterns.
            X_ev: sample of data points at which we evaluate the density ratio weights

            returns: weights for each data point in the given sample
        """
        # get K nearest neighbors (and radii) from training domain
        distances, ind = self.nbrs_tr.kneighbors(X_ev)
        radii = distances[:, -1]

        # compute weights
        weights = np.zeros(X_ev.shape[0])
        for i in range(X_ev.shape[0]):
            # count the number of numerator (test) samples within the current radius of the query sample
            weights[i] = len(
                self.nbrs_te.radius_neighbors(X_ev[i, :].reshape(1, -1),
                                              radius=radii[i],
                                              return_distance=False)[0])
        # divide by the K denominator (training) neighbours and rescale by the ratio of training to test sample sizes
        weights *= float(self.n_tr) / float(self.n_neighbors * self.n_te)

        return weights
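# A hedged usage sketch for NearestNeighborsRatioEstimator above, assuming
# numpy and sklearn's NearestNeighbors are imported as in the original module.
# Two Gaussian samples stand in for the training and test domains; the weights
# approximate the density ratio p_test(x) / p_train(x) at the evaluated points.
import numpy as np

rng = np.random.RandomState(0)
X_tr_demo = rng.normal(0.0, 1.0, size=(500, 2))    # assumed training sample
X_te_demo = rng.normal(0.5, 1.0, size=(400, 2))    # assumed (shifted) test sample

ratio_est = NearestNeighborsRatioEstimator(n_neighbors=10)
ratio_est.fit(X_tr_demo, X_te_demo)
weights_demo = ratio_est.compute_weights(X_tr_demo[:5])
print('density-ratio weights for five training points:', weights_demo)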
Example #34
0
class Search:
    """
    This class is used to apply neighbourhood search algorithms on point clouds.
    K-nearest neighbours and radius search are implemented.
    """
    def __init__(self, inputCloud):
        """
        Inputs: inputCloud: np.array of row vectors shape (<numberPoints>, 3)
        """
        self.inputCloud = inputCloud

        self.initializer = 0

    def nearestKSearch(self, point, k):
        """
        Search for k-nearest neighbors for the given query point.

        Inputs: point   query point as vector with dimension (1,3) or (3,1)
                k       number of nearest neighbors that shall be included in
                            output set

        Outputs: outputPointSet     indices (into inputCloud) of the k points
                        that are closest to the query point. Shape: (len,)
                        with 0 <= len <= k. The corresponding distances are
                        available as result[0][0] inside this method.

        Based on method in:
        docs.pointclouds.org/1.9.1/classpcl_1_1search_1_1_search.html
        and
        https://scikit-learn.org/stable/modules/neighbors.html
        """

        'Column to row vector as required by method kneighbors'
        if point.shape == (3, 1) or point.shape == (3, ):
            point = np.reshape(point, (1, 3))

        'avoid error due to less points given than k'
        if self.inputCloud.shape[0] < k:
            k = self.inputCloud.shape[0]

        if self.initializer == 0:
            'Initialisation of nearest neighbour class'
            self.neigh = NearestNeighbors(n_neighbors=k)

            self.neigh.fit(self.inputCloud)

            self.initializer = 1

        'Calculate k nearest neighbours for given point'
        result = self.neigh.kneighbors(point)

        outputPointSet = result[1][0]
        'Note: May use result[0][0] to get distance for each pair'

        return outputPointSet

    def radiusSearch(self, point, radius, max_nn=float("inf")):
        '''
        Search for all the nearest neighbors of the query point in
        a given radius.

        Inputs: point   query point as vector with dimension (1,3) or (3,1)
                radius  radius of sphere to determine included points
                max_nn  maximum number of included points. Defaults to infinity.

        Outputs:    outputPointSet  indices (into inputCloud) of the points that
                        fall inside the sphere of the given radius around the
                        query point; points lying exactly on the sphere are
                        included. Shape: (len,). The corresponding distances
                        are available as result[0][0] inside this method.

        Based on method in
        http://docs.pointclouds.org/1.9.1/classpcl_1_1search_1_1_search.html
        and
        https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.RadiusNeighborsClassifier.html
        '''

        'Column to row vector as required by method radius_neighbors'
        if point.shape == (3, 1) or point.shape == (3, ):
            point = np.reshape(point, (1, 3))

        if self.initializer == 0:
            'Initialisation of nearest neighbour class'
            self.neigh = NearestNeighbors(radius=radius)
            self.neigh.fit(self.inputCloud)

            self.initializer = 1

        'Calculate nearest neighbours \
        Uses Minkowski distance with p=2 as default which is equal to \
        Euclidean distance'

        result = self.neigh.radius_neighbors(point)

        outputPointSet = result[1][0]
        'Note: May use result[0][0] to get distance for each point in \
        the sphere'

        return outputPointSet
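# A minimal usage sketch for the Search class above, on an assumed random cloud.
import numpy as np

rng = np.random.RandomState(0)
cloud = rng.rand(100, 3)                           # hypothetical point cloud, shape (100, 3)
query_pt = np.array([0.5, 0.5, 0.5])

searcher = Search(cloud)
knn_idx = searcher.nearestKSearch(query_pt, k=5)   # indices of the 5 closest points
print('k-nearest indices:', knn_idx)

# The class caches the fitted NearestNeighbors after the first call, so a fresh
# instance is used here to make sure the requested radius is honoured.
radius_idx = Search(cloud).radiusSearch(query_pt, radius=0.2)
print('indices within radius 0.2:', radius_idx)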
Example #35
0
def path_freq(arr, voxdim, freq_threshold):

    vox = np.array(arr / voxdim).astype(int)

    voxels = defaultdict(list)
    for i, v in enumerate(vox):
        voxels[tuple(v)].append(i)

    vox_uniques = remove_duplicates(vox)
    vox_uniques = vox_uniques * voxdim
    #    vox_uids = np.arange(vox_uniques.shape[0])

    center = base_center(arr, base_length=0.3)[0]

    G = create_graph_iter(vox_uniques,
                          n_neighbors=5,
                          nn_step=2,
                          dist_threshold=np.inf,
                          maxiter=20)

    print('Graph created')
    nbrs = NearestNeighbors(leaf_size=15, n_jobs=-1).fit(vox_uniques)
    base_id = nbrs.kneighbors(center.reshape(1, -1), 1,
                              return_distance=False)[0][0]

    mask = np.zeros(G.number_of_nodes())

    # Calculating the shortest path
    shortpath = nx.single_source_dijkstra_path_length(G, base_id)
    # Obtaining the node coordinates and their respective distance from
    # the base point.
    nodes_ids = list(shortpath.keys())
    dist = list(shortpath.values())
    # Obtaining path list for every node.
    path = nx.single_source_dijkstra_path(G, base_id)
    # Obtaining nodes coordinates.
    nodes = vox_uniques[nodes_ids]

    dist = np.array(dist)

    path = path.values()
    path_nodes = [i for j in path for i in j]

    # Obtaining all unique values in the central nodes path and their
    # respective frequency.
    path_nodes, freq = np.unique(path_nodes, return_counts=True)

    # Log transforming the frequency values.
    freq_log = np.log(freq)

    # Filtering the central nodes based on the frequency of paths
    # that contains each node.
    freq_mask = (freq_log >= (np.max(freq_log) * freq_threshold)).astype(bool)
    p = nodes[freq_mask]
    pdist = dist[freq_mask]

    nbrs = NearestNeighbors(leaf_size=15, n_jobs=-1).fit(nodes)
    nbrs_ids = nbrs.radius_neighbors(p,
                                     radius=voxdim * 3,
                                     return_distance=False)

    mask[freq_mask.astype(bool)] = 1
    for p_id, idx in enumerate(nbrs_ids):
        for id_ in idx:
            if dist[id_] <= pdist[p_id]:
                mask[id_] = 1

    mask = mask.astype(bool)

    e = np.inf
    threshold = 1
    dist_threshold = voxdim * 4

    print('Starting region growing')
    while e > threshold:
        nbrs = NearestNeighbors(leaf_size=15,
                                n_jobs=-1).fit(vox_uniques[~mask])
        e1 = np.sum(mask)

        nbrs_dist, nbrs_ids = nbrs.kneighbors(vox_uniques[mask], 1)

        for i, nbr_i in enumerate(nbrs_ids[nbrs_dist <= dist_threshold]):
            if dist[nbr_i] <= dist[mask][i]:
                mask[nbr_i] = True

        e2 = np.sum(mask)
        e = e2 - e1
        #        e = nbrs_ids.shape[0]
        print(e)

    vids = []
    new_voxels = (vox_uniques[mask] / voxdim).astype(int)
    for i in new_voxels:
        vids.append(voxels[tuple(i)])
    voxels_ids = np.unique([i for j in vids for i in j])

    return arr[voxels_ids]
Example #36
0
    def rrtstar(self, sample_fn):
        """
        RRT* Algorithm
        """
        vol_freecells = len(self._freecells)*self._navmap.info.resolution**2
        gamma_rrg = 2*sqrt(1.5*vol_freecells/pi)
        max_range = self.utility_function.getMaximumSensorRange()
        print('max range is ', max_range)
        
        probot = np.array([self._robot_pose.pose.position.x,self._robot_pose.pose.position.y])
        V = [probot]
        E = {}
        parents = {}
        W = [self.current_weights]
        
        w = ap_utility.VectorOfDoubles()
        w_post = ap_utility.VectorOfDoubles()
        w.extend(self.current_weights)
        Ent = [self.utility_function.computeExpEntropy(probot[0], probot[1], 0.0, w, w_post)]
        Dist = [0.0]
        C = [float('Inf')]
        nbrs = NearestNeighbors(n_neighbors=1)
        nbrs.fit(V)
        cmin = 0
        t1 = time.time()
        
        informative_point_found = False
        planning_done = False
        rrt_iter = 0
        
        while not planning_done:
            t2 = time.time()
            """
            Sampling new point
            """
            prand = sample_fn()   
            (dist, idx) = nbrs.kneighbors(prand)
            pnearest_idx = idx.flatten()[0]
            pnearest = V[pnearest_idx]

            """
            Turning new point into reachable point
            """
            if dist < self._rrt_eta:
                pnew = prand
            else:
                pnew = self.steer(pnearest, prand)
            """
            Checking if segment is valid and updating graph
            """
            if self.segment_safe(V[pnearest_idx],pnew) is True:
		
                r = np.min([gamma_rrg*sqrt(log(len(V))/float(len(V))),self._rrt_eta])
                Pnear_idx = nbrs.radius_neighbors(pnew, r, return_distance = False)
                Pnear_idx = Pnear_idx[0]
                pmin_idx = pnearest_idx
                w = ap_utility.VectorOfDoubles()
                w_post = ap_utility.VectorOfDoubles()
                w.extend(W[pnearest_idx])

                (dist_nearest_particle, idx) = self._particle_nbrs.kneighbors(pnew)

                if dist_nearest_particle < max_range: #if at least one particle is visible
                    entropy = self.utility_function.computeExpEntropy(pnew[0], pnew[1], 0.0, w, w_post)
                
                if dist_nearest_particle >= max_range or entropy == 0: # utility function failed
                    entropy = Ent[pmin_idx]
                    w_post = w     
                
                dist = np.linalg.norm(pnearest-pnew)
                cmin = (self._rrt_near_bias*dist_nearest_particle +
                        self._rrt_dist_bias * (Dist[pnearest_idx] + dist) +
                        self._rrt_entropy_bias * entropy)

                for p_idx in Pnear_idx:
                    p = V[p_idx]
                    w = ap_utility.VectorOfDoubles()
                    w_near = ap_utility.VectorOfDoubles()
                    w.extend(W[p_idx])
                    
                    if np.abs(Ent[p_idx] - entropy) < 1e-6: # if there is anything to gain in terms of information
                        entropy_near = self.utility_function.computeExpEntropy(pnew[0], pnew[1], 0.0, w, w_near)
                    else:
                        entropy_near = Ent[p_idx]
                    
                    (dist_nearest_particle, idx) = self._particle_nbrs.kneighbors(p)
                    
                    c = (self._rrt_near_bias*dist_nearest_particle + 
                         self._rrt_dist_bias * (Dist[p_idx] + np.linalg.norm(p-pnew)) + 
                         self._rrt_entropy_bias * entropy_near)
                    if (self.segment_safe(p,pnew) is True and 
                        c < cmin):
                        cmin = c
                        pmin_idx = p_idx
                
                if pmin_idx in E:
                    E[pmin_idx].add(len(V))
                else:
                    E[pmin_idx] = set([len(V)])
                
                pnew_idx = len(V)
                V.append(pnew)
                C.append(cmin)
                W.append(w_post)
                Ent.append(entropy)
                Dist.append(Dist[pmin_idx] + dist)
                parents[pnew_idx] = pmin_idx
                """
                Re-wire the tree
                """
                for p_idx in Pnear_idx:
                    if p_idx in parents:
                        p = V[p_idx]
                        w = ap_utility.VectorOfDoubles()
                        w_near = ap_utility.VectorOfDoubles()
                        w.extend(W[-1]) # pnew
                        if np.abs(Ent[p_idx] - entropy) <= 0: #1e-6: # if there is anything to gain in terms of information
                            entropy_near = self.utility_function.computeExpEntropy(pnew[0], pnew[1], 0.0, w, w_near)
                        else:
                            entropy_near = Ent[p_idx]
                            w_near = w
                        dist = np.linalg.norm(p-pnew)
                        (dist_nearest_particle, idx) = self._particle_nbrs.kneighbors(p)
                         
                        c = (self._rrt_near_bias*dist_nearest_particle + 
                             self._rrt_dist_bias * (Dist[-1] + dist) + 
                             self._rrt_entropy_bias * entropy_near)
                        if (self.segment_safe(p,pnew) is True and 
                            c < C[p_idx]):
                            E[parents[p_idx]].remove(p_idx)
                            parents[p_idx] = pnew_idx
                            print('rewired ', p_idx, 'to', pnew_idx)
                            if pnew_idx in E:
                                E[pnew_idx].add(p_idx)
                            else:
                                E[pnew_idx] = set([p_idx])
                            C[p_idx] = c
                            W[p_idx] = w_near
                            Ent[p_idx] = entropy_near
                            Dist[p_idx] = Dist[-1] + dist
                nbrs.fit(V)
            # print 'iteration done. time: ', time.time()-t2
            # print 'min entropy:', np.min(I)
            if np.max(Ent) - np.min(Ent) >= 1e-6: # just to compensate arithmetic noise
                informative_point_found = True

            """
            Find best path:
            """
            
            path = self.get_best_path(parents, V, C)

            if informative_point_found and len(path.poses) > self._max_path_size:
                planning_done = True

            rrt_iter += 1
            
            if rrt_iter > self._max_rrt_iterations:
                planning_done = True
                if not informative_point_found:
                    rospy.logwarn("Could not find an informative goal point in %d iterations! Aborting.", self._max_rrt_iterations)
                    
        print('total time: ', time.time()-t1)
        
        self.publish_rrt(V,E)
        
        self._path_pub.publish(path)
        
        self.publish_entropy_info(V, Ent)
        return path
Example #37
0
def mean_shift(X,
               bandwidth,
               n_seeds,
               kernel_function='gaussian',
               max_iterations=100,
               proximity_thresh=5):
    '''
    ---Parameters---
    X : data in form (samples, dims)
    bandwidth : radius of nearest neighbors
    n_seeds : 
    kernel_update_function : can be "gaussian" or "flat" or your own kernel
    proximity_thresh : minimum distance (in pixels) a new cluster must be away from previous ones

    ---Returns---
    cluster_centers : 
    cluster_counts : how many pixels are with the neighborhood of each cluster
    '''

    import numpy as np
    from sklearn.neighbors import BallTree, NearestNeighbors
    from sklearn.utils import extmath
    from sklearn.metrics.pairwise import euclidean_distances
    from collections import defaultdict

    if kernel_function == 'gaussian':
        kernel_update_function = gaussian_kernel
    elif kernel_function == 'flat':
        kernel_update_function = flat_kernel
    else:
        kernel_update_function = kernel_function

    n_points, n_features = X.shape
    stop_thresh = 1e-2 * bandwidth  # when mean has converged
    cluster_centers = []
    cluster_counts = []
    # ball_tree = BallTree(X)# to efficiently look up nearby points
    neighbors = NearestNeighbors(radius=bandwidth).fit(X)

    seeds = X[(np.random.uniform(0, X.shape[0], n_seeds)).astype(int)]

    # For each seed, climb gradient until convergence or max_iterations
    for weighted_mean in seeds:
        completed_iterations = 0
        while True:
            points_within = X[neighbors.radius_neighbors(
                [weighted_mean], bandwidth, return_distance=False)[0]]
            old_mean = weighted_mean  # save the old mean
            weighted_mean = kernel_update_function(old_mean, points_within,
                                                   bandwidth)
            converged = np.linalg.norm(weighted_mean - old_mean) < stop_thresh
            if converged or completed_iterations == max_iterations:
                # Only add cluster if it's different enough from other centers
                if len(cluster_centers) > 0:
                    diff_from_prev = [
                        np.linalg.norm(weighted_mean - cluster_centers[i], 2)
                        for i in range(len(cluster_centers))
                    ]
                    if np.min(diff_from_prev) > proximity_thresh:
                        cluster_centers.append(weighted_mean)
                        cluster_counts.append(points_within.shape[0])
                else:
                    cluster_centers.append(weighted_mean)
                    cluster_counts.append(points_within.shape[0])
                break
            completed_iterations += 1

    return cluster_centers, cluster_counts
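# A hedged usage sketch for mean_shift above. gaussian_kernel / flat_kernel are
# not defined in this snippet, so an assumed flat-kernel callable (unweighted
# mean of the points inside the bandwidth) is passed instead, on blob data.
import numpy as np
from sklearn.datasets import make_blobs

def demo_flat_kernel(old_mean, points_within, bandwidth):
    # Flat (uniform) kernel update: the new mean is the centroid of the window.
    return points_within.mean(axis=0)

X_demo, _ = make_blobs(n_samples=300, centers=3, cluster_std=0.5, random_state=0)
centers_demo, counts_demo = mean_shift(X_demo,
                                       bandwidth=1.0,
                                       n_seeds=20,
                                       kernel_function=demo_flat_kernel,
                                       proximity_thresh=1.0)
print('found %d cluster centres' % len(centers_demo))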
Example #38
0
    def __init__(self,
                 nNegSample=1000,
                 nNeg=10,
                 margin=0.1,
                 input_transform=None):
        super().__init__()

        self.img_list = join(root, yeouido_img_list_txt)
        self.images = [e.strip() for e in open(self.img_list)]
        self.input_transform = input_transform()

        self.nNegSample = nNegSample  # number of negatives to randomly sample
        self.nNeg = nNeg  # number of negatives used for training
        self.margin = margin

        self.position = np.load(yeouido_position_npy)
        self.positive_thres = 5
        self.negative_thres = 20

        All_idx = np.arange(0, len(self.images), step=1)
        self.Qidx = np.arange(0, len(self.images), step=4)
        self.DBidx = np.setdiff1d(All_idx, self.Qidx)
        #self.DBidx = np.arange(int(len(self.images)/2)) *2
        #self.Qidx = self.DBidx +1

        np.random.shuffle(self.DBidx)
        np.random.shuffle(self.Qidx)

        # potential positives are those within nontrivial threshold range
        #fit NN to find them, search by radius
        #knn = kNN_GPU(d=len(get_multiple_elements(self.position,self.DBidx)[0]), GPU = True, GPU_Number=torch.cuda.current_device())
        #knn.train(np.array(get_multiple_elements(self.position,self.DBidx)).astype("float32"))
        knn_cpu = NearestNeighbors(n_jobs=-1, metric='euclidean')
        knn_cpu.fit(get_multiple_elements(self.position, self.DBidx))

        # find the potential positives by a radius search around each query position
        self.potential_positives = knn_cpu.radius_neighbors(
            get_multiple_elements(self.position, self.Qidx),
            radius=self.positive_thres,
            return_distance=False)

        #self.potential_positives = knn.predict(np.asarray(get_multiple_elements(self.position,self.Qidx)).astype("float32"), 10)

        # sort indices of potential positives
        for i, positive_indices in enumerate(self.potential_positives):
            self.potential_positives[i] = np.sort(positive_indices)

        # it's possible some queries don't have any non trivial potential positives
        self.queries = np.where(
            np.array([len(x) for x in self.potential_positives]) > 0)[0]

        # for potential negatives
        potential_unnegatives = knn_cpu.radius_neighbors(
            get_multiple_elements(self.position, self.Qidx),
            radius=self.negative_thres,
            return_distance=False)
        #potential_unnegatives = knn.predict(np.asarray(get_multiple_elements(self.position,self.Qidx)).astype("float32"), 20)

        # potential negatives: DBidx indices farther than negative_thres from the query
        self.potential_negatives = []
        for pos in potential_unnegatives:
            self.potential_negatives.append(
                np.setdiff1d(np.arange(self.DBidx.shape[0]),
                             pos,
                             assume_unique=True))

        self.cache = None  # filepath of HDF5 containing feature vectors for images
        self.negCache = [np.empty((0, )) for _ in range(self.Qidx.shape[0])]
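# A standalone toy sketch (assumed 2-D positions, not the Yeouido data) of the
# positive / negative mining above: positives lie within positive_thres of a
# query position and potential negatives are everything outside negative_thres.
import numpy as np
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(0)
db_positions = rng.rand(100, 2) * 50               # hypothetical database positions (m)
q_positions = rng.rand(10, 2) * 50                 # hypothetical query positions (m)
positive_thres, negative_thres = 5, 20

knn_demo = NearestNeighbors(metric='euclidean').fit(db_positions)
pot_positives = knn_demo.radius_neighbors(q_positions, radius=positive_thres,
                                          return_distance=False)
pot_unnegatives = knn_demo.radius_neighbors(q_positions, radius=negative_thres,
                                            return_distance=False)
pot_negatives = [np.setdiff1d(np.arange(db_positions.shape[0]), unneg,
                              assume_unique=True)
                 for unneg in pot_unnegatives]
print('positives / negatives for query 0:',
      len(pot_positives[0]), len(pot_negatives[0]))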
Example #39
0
    def dbscan(self,
               X,
               eps=0.5,
               min_samples=5,
               metric='minkowski',
               metric_params=None,
               algorithm='auto',
               leaf_size=30,
               p=2,
               sample_weight=None,
               n_jobs=None):
        """Perform DBSCAN clustering from vector array or distance matrix.
        Read more in the :ref:`User Guide <dbscan>`.
        Parameters
        ----------
        X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
                array of shape (n_samples, n_samples)
            A feature array, or array of distances between samples if
            ``metric='precomputed'``.
        eps : float, optional
            The maximum distance between two samples for one to be considered
            as in the neighborhood of the other. This is not a maximum bound
            on the distances of points within a cluster. This is the most
            important DBSCAN parameter to choose appropriately for your data set
            and distance function.
        min_samples : int, optional
            The number of samples (or total weight) in a neighborhood for a point
            to be considered as a core point. This includes the point itself.
        metric : string, or callable
            The metric to use when calculating distance between instances in a
            feature array. If metric is a string or callable, it must be one of
            the options allowed by :func:`sklearn.metrics.pairwise_distances` for
            its metric parameter.
            If metric is "precomputed", X is assumed to be a distance matrix and
            must be square. X may be a sparse matrix, in which case only "nonzero"
            elements may be considered neighbors for DBSCAN.
        metric_params : dict, optional
            Additional keyword arguments for the metric function.
            .. versionadded:: 0.19
        algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
            The algorithm to be used by the NearestNeighbors module
            to compute pointwise distances and find nearest neighbors.
            See NearestNeighbors module documentation for details.
        leaf_size : int, optional (default = 30)
            Leaf size passed to BallTree or cKDTree. This can affect the speed
            of the construction and query, as well as the memory required
            to store the tree. The optimal value depends
            on the nature of the problem.
        p : float, optional
            The power of the Minkowski metric to be used to calculate distance
            between points.
        sample_weight : array, shape (n_samples,), optional
            Weight of each sample, such that a sample with a weight of at least
            ``min_samples`` is by itself a core sample; a sample with negative
            weight may inhibit its eps-neighbor from being core.
            Note that weights are absolute, and default to 1.
        n_jobs : int or None, optional (default=None)
            The number of parallel jobs to run for neighbors search.
            ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
            ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
            for more details.
        Returns
        -------
        core_samples : array [n_core_samples]
            Indices of core samples.
        labels : array [n_samples]
            Cluster labels for each point. 
        """
        if not eps > 0.0:
            raise ValueError("eps must be positive.")

        X = check_array(X, accept_sparse='csr')
        if sample_weight is not None:
            sample_weight = np.asarray(sample_weight)
            check_consistent_length(X, sample_weight)

        # Calculate the neighborhood for all samples. This leaves the original
        # point in, which needs to be considered later (i.e. point i is in the
        # neighborhood of point i; while true, this is useless information).
        if metric == 'precomputed' and sparse.issparse(X):
            neighborhoods = np.empty(X.shape[0], dtype=object)
            X.sum_duplicates()  # XXX: modifies X's internals in-place

            # set the diagonal to explicit values, as a point is its own neighbor
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', sparse.SparseEfficiencyWarning)
                X.setdiag(X.diagonal())  # XXX: modifies X's internals in-place

            X_mask = X.data <= eps
            masked_indices = X.indices.astype(np.intp, copy=False)[X_mask]
            masked_indptr = np.concatenate(([0], np.cumsum(X_mask)))
            masked_indptr = masked_indptr[X.indptr[1:-1]]

            # split into rows
            neighborhoods[:] = np.split(masked_indices, masked_indptr)
        else:
            neighbors_model = NearestNeighbors(radius=eps,
                                               algorithm=algorithm,
                                               leaf_size=leaf_size,
                                               metric=metric,
                                               metric_params=metric_params,
                                               p=p,
                                               n_jobs=n_jobs)
            neighbors_model.fit(X)
            # This has worst case O(n^2) memory complexity
            neighborhoods = neighbors_model.radius_neighbors(
                X, eps, return_distance=False)

        if sample_weight is None:
            n_neighbors = np.array(
                [len(neighbors) for neighbors in neighborhoods])
        else:
            n_neighbors = np.array([
                np.sum(sample_weight[neighbors]) for neighbors in neighborhoods
            ])

        # Initially, all samples are noise.
        labels = np.full(X.shape[0], -1, dtype=np.intp)

        # A list of all core samples found.
        core_samples = np.asarray(n_neighbors >= min_samples, dtype=np.uint8)
        dbscan_inner(core_samples, neighborhoods, labels)
        return np.where(core_samples)[0], labels
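# A minimal sketch (assumed blob data) of the non-precomputed branch above:
# eps-neighbourhoods via radius_neighbors, then core samples are the points
# with at least min_samples neighbours (each point counts itself).
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.neighbors import NearestNeighbors

X_demo, _ = make_blobs(n_samples=200, centers=3, cluster_std=0.4, random_state=0)
eps_demo, min_samples_demo = 0.5, 5

neighbors_model = NearestNeighbors(radius=eps_demo).fit(X_demo)
neighborhoods = neighbors_model.radius_neighbors(X_demo, return_distance=False)

n_in_eps = np.array([len(hood) for hood in neighborhoods])
core_mask = n_in_eps >= min_samples_demo
print('%d of %d samples are core samples' % (core_mask.sum(), len(X_demo)))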
Example #40
0
class GlobalKMeans(BaseEstimator, ClusterMixin, TransformerMixin):
    """Global K-means Algorithm

    Parameters:

    n_clusters: int
        maximum number of clusters to obtain
    algorithm: string
        'classical' the classical algorithm
        'bagirov' the Bagirov 2006 variant

    """
    def __init__(self, n_clusters, algorithm='classical'):
        self.n_clusters = n_clusters
        self.cluster_centers_ = None
        self.labels_ = None
        self.cluster_sizes_ = None
        self.inertia_ = None
        self.algorithm = algorithm

    def fit(self, X):
        """
        Clusters the examples
        :param X:
        :return:
        """

        if self.algorithm == 'classical':
            self.cluster_centers_, self.labels_, self.inertia_ = self._fit_process(
                X)
        elif self.algorithm == 'bagirov':
            self.cluster_centers_, self.labels_, self.inertia_ = self._fit_process_bagirov(
                X)

        return self

    def predict(self, X):
        """
        Returns the nearest cluster for a data matrix

        @param X:
        @return:
        """
        clasif = []
        for i in range(X.shape[0]):
            ncl, mdist = self._find_nearest_cluster(X[i].reshape(1, -1),
                                                    self.cluster_centers_)
            if mdist <= self.radius:
                clasif.append(ncl)
            else:
                clasif.append(-1)
        return clasif

    def _fit_process(self, X):
        """
        Classical global k-means algorithm

        :param X:
        :return:
        """

        # Compute the centroid of the dataset
        centroids = sum(X) / X.shape[0]
        centroids.shape = (1, X.shape[1])

        for i in range(2, self.n_clusters + 1):
            mininertia = np.inf
            for j in range(X.shape[0]):
                newcentroids = np.vstack((centroids, X[j]))
                #print newcentroids.shape
                km = KMeans(n_clusters=i, init=newcentroids, n_init=1)
                km.fit(X)
                if mininertia > km.inertia_:
                    mininertia = km.inertia_
                    bestkm = km
            centroids = bestkm.cluster_centers_

        return bestkm.cluster_centers_, bestkm.labels_, bestkm.inertia_

    def _fit_process_bagirov(self, X):
        """
        Clusters using the global K-means algorithm Bagirov variation
        :param X:
        :return:
        """

        # Create a KNN structure for fast search
        self._neighbors = NearestNeighbors()
        self._neighbors.fit(X)

        # Compute the centroid of the dataset
        centroids = sum(X) / X.shape[0]
        assignments = [0 for i in range(X.shape[0])]

        centroids.shape = (1, X.shape[1])

        # compute the distance of the examples to the centroids
        mindist = np.zeros(X.shape[0])
        for i in range(X.shape[0]):
            mindist[i] = euclidean_distances(X[i].reshape(1, -1),
                                             centroids[assignments[i]].reshape(
                                                 1, -1),
                                             squared=True)[0]

        for k in range(2, self.n_clusters + 1):
            newCentroid = self._compute_next_centroid(X, centroids,
                                                      assignments, mindist)
            centroids = np.vstack((centroids, newCentroid))
            km = KMeans(n_clusters=k, init=centroids, n_init=1)
            km.fit(X)
            assignments = km.labels_
            for i in range(X.shape[0]):
                mindist[i] = euclidean_distances(
                    X[i].reshape(1, -1),
                    centroids[assignments[i]].reshape(1, -1),
                    squared=True)[0]

        return km.cluster_centers_, km.labels_, km.inertia_

    def _compute_next_centroid(self, X, centroids, assignments, mindist):
        """
        Computes the candidate for the next centroid

        :param X:
        :param centroids:
        :return:
        """
        minsum = np.inf
        candCentroid = None

        # Compute the first candidate to new centroid
        for i in range(X.shape[0]):
            distance = euclidean_distances(
                X[i].reshape(1, -1), centroids[assignments[i]].reshape(1,
                                                                       -1))[0]
            S2 = self._neighbors.radius_neighbors(X[i].reshape(1, -1),
                                                  radius=distance,
                                                  return_distance=False)[0]
            S2centroid = np.sum(X[S2], axis=0) / len(S2)
            S2centroid.shape = (1, X.shape[1])
            cost = self._compute_fk(X, mindist, S2centroid)

            if cost < minsum:
                minsum = cost
                candCentroid = S2centroid

        # Compute examples for the new centroid
        S2 = []
        newDist = euclidean_distances(X,
                                      candCentroid.reshape(1, -1),
                                      squared=True)
        for i in range(X.shape[0]):
            if newDist[i] < mindist[i]:
                S2.append(i)

        newCentroid = sum(X[S2]) / len(S2)
        newCentroid.shape = (1, X.shape[1])

        while not (candCentroid == newCentroid).all():
            candCentroid = newCentroid
            S2 = []
            newDist = euclidean_distances(X,
                                          candCentroid.reshape(1, -1),
                                          squared=True)
            for i in range(X.shape[0]):
                if newDist[i] < mindist[i]:
                    S2.append(i)

            newCentroid = np.sum(X[S2], axis=0) / len(S2)
            newCentroid.shape = (1, X.shape[1])

        return candCentroid

    def _compute_fk(self, X, mindist, ccentroid):
        """
        Computes the cost function

        :param X:
        :param mindist:
        :param ccentroid:
        :return:
        """

        # Distances among the examples and the candidate centroid
        centdist = euclidean_distances(X,
                                       ccentroid.reshape(1, -1),
                                       squared=True)

        fk = 0
        for i in range(X.shape[0]):
            fk = fk + min(mindist[i], centdist[i][0])

        return fk

    @staticmethod
    def _find_nearest_cluster(examp, centers):
        """
        Finds the nearest cluster for an example
        :param examp:
        :param centers:
        :return:
        """

        dist = euclidean_distances(centers, examp.reshape(1, -1))

        pmin = np.argmin(dist)
        vmin = np.min(dist)

        return pmin, vmin
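# A hedged usage sketch for GlobalKMeans above, assuming the imports the class
# relies on (the sklearn base mixins, KMeans, NearestNeighbors,
# euclidean_distances and numpy) are available as in the original module, and
# using assumed blob data.
from sklearn.datasets import make_blobs

X_demo, _ = make_blobs(n_samples=60, centers=4, cluster_std=0.6, random_state=0)

gkm = GlobalKMeans(n_clusters=4, algorithm='classical')
gkm.fit(X_demo)
print('inertia:', gkm.inertia_)
print('first ten labels:', gkm.labels_[:10])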
Example #41
0
def test_radius_neighbors():
    # Checks whether Returned distances are less than `radius`
    # At least one point should be returned when the `radius` is set
    # to mean distance from the considering point to other points in
    # the database.
    # Moreover, this test compares the radius neighbors of LSHForest
    # with the `sklearn.neighbors.NearestNeighbors`.
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest()
    # Test unfitted estimator
    assert_raises(ValueError, lshf.radius_neighbors, X[0])

    ignore_warnings(lshf.fit)(X)

    for i in range(n_iter):
        # Select a random point in the dataset as the query
        query = X[rng.randint(0, n_samples)].reshape(1, -1)

        # At least one neighbor should be returned when the radius is the
        # mean distance from the query to the points of the dataset.
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        neighbors = lshf.radius_neighbors(query, radius=mean_dist,
                                          return_distance=False)

        assert_equal(neighbors.shape, (1,))
        assert_equal(neighbors.dtype, object)
        assert_greater(neighbors[0].shape[0], 0)
        # All distances to points in the results of the radius query should
        # be less than mean_dist
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_array_less(distances[0], mean_dist)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.radius_neighbors(queries,
                                                 return_distance=True)

    # dists and inds should not be 1D arrays or arrays of variable lengths
    # hence the use of the object dtype.
    assert_equal(distances.shape, (n_queries,))
    assert_equal(distances.dtype, object)
    assert_equal(neighbors.shape, (n_queries,))
    assert_equal(neighbors.dtype, object)

    # Compare with exact neighbor search
    query = X[rng.randint(0, n_samples)].reshape(1, -1)
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)

    distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)
    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)

    # Radius-based queries do not sort the result points and the order
    # depends on the method, the random_state and the dataset order. Therefore
    # we need to sort the results ourselves before performing any comparison.
    sorted_dists_exact = np.sort(distances_exact[0])
    sorted_dists_approx = np.sort(distances_approx[0])

    # Distances to exact neighbors are less than or equal to approximate
    # counterparts as the approximate radius query might have missed some
    # closer neighbors.
    assert_true(np.all(np.less_equal(sorted_dists_exact,
                                     sorted_dists_approx)))
Example #42
0
                                         algorithm='auto',
                                         n_jobs=-1)
NearestNeighborsModel.fit(X_train)
#----------------------------------------------------
#Calculating Details
print('The distance metric to use is : ',
      NearestNeighborsModel.effective_metric_)
print('Additional keyword arguments for the metric function is : ',
      NearestNeighborsModel.effective_metric_params_)
print('Number of samples in the fitted data is : ',
      NearestNeighborsModel.n_samples_fit_)
print("=" * 10)
print('NearestNeighborsModel Train kneighbors are : ',
      NearestNeighborsModel.kneighbors(X_train[:1]))
print('NearestNeighborsModel Train radius neighbors are : ',
      NearestNeighborsModel.radius_neighbors(X_train[:1]))
print("=" * 10)
print('NearestNeighborsModel Test kneighbors are : ',
      NearestNeighborsModel.kneighbors(X_test[:1]))
print('NearestNeighborsModel Test radius neighbors are : ',
      NearestNeighborsModel.radius_neighbors(X_test[:1]))
print("=" * 25)
#----------------------------------------------------
plt.figure("data")
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y, alpha=1, palette=['r', 'b', 'k'])
sns.scatterplot(x=centers[:, 0],
                y=centers[:, 1],
                s=75,
                color="yellow",
                label="Centers")
plt.show(block=False)
Example #43
0
	id,x1,y1,m,x2,y2 = np.genfromtxt('%s.dat'%(i+1),unpack=True,usecols=(0,3,4,5,9,10))
	mask			 = (m<15)*(m>11)
	id,x1,y1,m,x2,y2   = np.transpose([id,x1,y1,m,x2,y2])[mask].T

	epinbu = np.in1d(id,bid)
	buinep = np.in1d(bid,id)

	print((id[epinbu]==bid[buinep]).sum(), id.size)

	epxy = np.transpose([x2,y2])[epinbu]
	epm  = m[epinbu]

	nbrs = NN(n_neighbors=vecinos, algorithm='auto').fit(epxy)

	if radio:
		dist, idx = nbrs.radius_neighbors(np.transpose([x2,y2]),radius=400)
		nbors 	  = np.array([len(d) for d in dist])

		mednbors, minnbors = np.median(nbors),nbors.min()

		for j in range(len(dist)):
			# Remove the star itself
			msk	    = dist[j]>300
			dist[j] = dist[j][msk]
			idx[j]  = idx[j][msk]


			# Take the 50 brightest
			'''
			if len(dist[j])>15:
				midx = np.argsort(epm[idx[j]])[:15]
Example #44
0
def pms(substack, args):
    """Find cells using mean shift.

    In this version, the substack is split into eight patches.

    Parameters
    ----------
    substack : object
        :class:`bcfind.volume.SubStack` object representing the substack to be analyzed.
    args : object
        :py:class:`argparse.Namespace` object containing the
        arguments passed to the find_cells script, in particular
        - args.outdir: directory where results are saved
        - args.hi_local_max_radius: radius of the sphere used to decide whether a local maximum should be a seed
        - args.mean_shift_bandwidth: bandwidth for the mean shift algorithm
        - args.floating_point: bool, whether cell coordinates should be rounded before saving
    """
    D = substack.info['Depth']
    W = substack.info['Width']
    H = substack.info['Height']
    M = 20
    patch = np.zeros((W,H,D))
    for z in range(D):
        patch[:,:,z] = np.array(substack.imgs[z]).T
    slicesx = [slice(0, W//2+M), slice(W//2-M, W)]
    slicesy = [slice(0, H//2+M), slice(H//2-M, H)]
    slicesz = [slice(0, D//2+M), slice(D//2-M, D)]
    cluster_centers = np.zeros((0,3))
    cluster_masses = np.zeros(0)
    L = np.zeros((0,3))
    labels = np.zeros(0)
    seeds = []
    counter = 0
    for sx in slicesx:
        for sy in slicesy:
            for sz in slicesz:
                counter += 1
                tee.log('%d/8:' % counter, 'Analyzing minisubstack',sx,sy,sz)
                rval = _patch_ms(patch[sx,sy,sz], args)
                origin = [sx.start,sy.start,sz.start]
                if rval is not None:
                    cluster_centers = np.concatenate((cluster_centers, rval.cluster_centers + origin))
                    cluster_masses = np.concatenate((cluster_masses, rval.masses))
                    labels = np.concatenate((labels, rval.labels+len(rval.cluster_centers)))
                    L = np.concatenate((L,rval.L+origin))
                    for c in rval.seeds:
                        c.x += origin[0]
                        c.y += origin[1]
                        c.z += origin[2]
                    seeds.extend(rval.seeds)
    if len(cluster_centers) > 0:
        # remove near duplicate points (because of overlapping margins)
        indices = np.argsort(cluster_masses)
        sorted_centers = cluster_centers[indices]
        sorted_masses = cluster_masses[indices]
        # sorted_volumes = volumes[indices]
        unique = np.ones(len(sorted_centers), dtype=bool)
        # FIXME - make it a parameter
        nbrs = NearestNeighbors(radius=5.5).fit(sorted_centers)
        for i, center in enumerate(sorted_centers):
            if unique[i]:
                neighbor_idxs = nbrs.radius_neighbors([center],
                                                      return_distance=False)[0]
                unique[neighbor_idxs] = 0
                unique[i] = 1  # leave the current point as unique
        cluster_centers = sorted_centers[unique]
        masses = sorted_masses[unique]
        masses_mean = np.mean(masses)
        masses_std = np.std(masses)
        # volumes = sorted_volumes[unique]
    C = []
    for i, cc in enumerate(cluster_centers):
        c = volume.Center(cc[0], cc[1], cc[2])
        c.name = 'MS_center %d' % i
        c.volume = (masses[i]-masses_mean)/masses_std  # volumes[i]
        c.mass = masses[i]
        tee.log(i, cc, c)
        C.append(c)
    
    filename = args.outdir+'/ms.marker'
    substack.save_markers(filename, C, floating_point=args.floating_point)
    tee.log('Markers saved to', filename)
    filename = args.outdir+'/seeds.marker'
    substack.save_markers(filename, seeds)
    tee.log(len(seeds), 'seeds saved to', filename)

    up_outdir=dirname(abspath(args.outdir))
    if args.save_image:
        image_saver = volume.ImageSaver(up_outdir, substack, C)
        Lx = [int(x) for x in L[:,0]]
        Ly = [int(y) for y in L[:,1]]
        Lz = [int(z) for z in L[:,2]]
        image_saver.save_above_threshold(Lx, Ly, Lz)

        tee.log('Debugging images saved in',up_outdir)
    else:
        tee.log('Debugging images not saved')
Example #45
0
    def plan_cached_rrt(self,cached_points):

        """
        Plan cached RRT
        """
        print "Planning a cached RRT*"
        marker_points = MarkerArray()
        vol_freecells = len(self._freecells)*self._navmap.info.resolution**2
        print "FREE CELL VOLUME", vol_freecells
        gamma_rrg = 2*sqrt(1.5*vol_freecells/pi)

        probot = np.array([self._robot_pose.pose.position.x,self._robot_pose.pose.position.y,2*np.arccos(self._robot_pose.pose.orientation.w)])
        nbrs = NearestNeighbors(n_neighbors=1,algorithm="kd_tree",leaf_size = 30)
        # V is the list of vertices. E is a dictionary where each key is connected to its values.
        # parents is a dictionary mapping a node to its parent; since each node appears as a key once, it has only one parent.
        V = [probot]
        #V_xy = [probot[:2]]
        p4dist = np.zeros(4)
        p4dist[:2] = probot[:2];p4dist[2] = np.cos(probot[2]);p4dist[3] = np.sin(probot[2])
        V_xy = [p4dist]
        E = {}
        parents = {}
        Dist = [0.0]

        # C stores the cost at vertex idx which is the sum of the edges going to it.
        goal_xy = np.array([self._goal.pose.position.x,self._goal.pose.position.y,2*np.arccos(self._goal.pose.orientation.w)])
        c_init = self.cost_manager.get_cost(probot[:2],goal_xy[:2])
        edge_C = {}
        #flann = FLANN()
        lowest_cost_idx = None
        #params = flann.build_index(np.array(V))
        #pdb.set_trace()
        t1 = time.time()

        planning_done = False
        rrt_iter = 0
        while not planning_done:
            t2 = time.time()
            """
            Sampling new point
            """
            cached = cached_points[rrt_iter]
            prand = cached["prand"]
            pnearest_idx = cached["pnearest_idx"]
            pnearest = V[pnearest_idx]
            """
            Turning new point into reachable point
            """
            pnew =cached["pnew"]
            path_new = cached["path_new"]
            Pnear_idx = cached["Pnear_idx"]
            pmin_idx = pnearest_idx

            cum_c = self.integrate_costs(edge_C,parents,pnearest_idx)
            min_edge_c = self.cost_manager.path_cost(path_new,goal_xy[:2])
            cmin = cum_c +min_edge_c
            cumulative_costs = {}
            Pnear_fwd = cached["Pnear_forward"]
            for num,p_idx in enumerate(Pnear_fwd):
                p = V_xy[p_idx]
                p_xyz = V[p_idx]
                cum_cost = self.integrate_costs(edge_C,parents,p_idx)
                cumulative_costs[p_idx] = cum_cost
                p_idx_path = cached["pnear_pnew"][num]["path"]
                reached = cached["pnear_pnew"][num]["reached"]
                safe = cached["pnear_pnew"][num]["safe"]
                if reached is True and safe is True:
                    path_c = self.cost_manager.path_cost(p_idx_path,goal_xy[:2])
                else:
                    path_c = 0
                #reached = False
                c = cum_cost + path_c
                if (safe is True and
                    reached is True and c < cmin):
                    cmin = c
                    min_edge_c = path_c
                    pmin_idx = p_idx      

            if pmin_idx in E:
                E[pmin_idx].add(len(V))
            else:
                E[pmin_idx] = set([len(V)])   
            edge_C[pmin_idx,len(V)] = min_edge_c  
            cumulative_last = cmin     
            pnew_idx = len(V)
            V.append(pnew)
            p4dist = np.zeros(4)
            p4dist[:2] = pnew[:2];p4dist[2] = np.cos(pnew[2]);p4dist[3] = np.sin(pnew[2])
            V_xy.append(p4dist)
            parents[pnew_idx] = pmin_idx
            """
            Re-wire the tree
            """
            unsafe = 0
            Pnear_bwd = cached["Pnear_backward"]
            for en,p_idx in enumerate(Pnear_bwd):
                if p_idx in parents:
                    if p_idx not in cumulative_costs:
                        cumulative_costs[p_idx] = self.integrate_costs(edge_C,parents,p_idx)
                    p_xyz = V[p_idx]
                    rewire_path = cached["pnew_pnear"][en]["path"]
                    rewire_reached = cached["pnew_pnear"][en]["reached"]
                    rewire_safe = cached["pnew_pnear"][en]["safe"]
                    #rewire_path,rewire_reached = posq.simulate(pnew,p_xyz,steps = int(stp))
                    if rewire_reached is True and rewire_safe is True :
                        rewire_path_c = self.cost_manager.path_cost(rewire_path,goal_xy[:2])
                    else:
                        rewire_path_c = 0
                    c = cumulative_last + rewire_path_c
                    if (rewire_safe is True and c < cumulative_costs[p_idx] and rewire_reached is True):
                        E[parents[p_idx]].remove(p_idx)
                        edge_C.pop((parents[p_idx], p_idx))
                        edge_C[pnew_idx,p_idx] = rewire_path_c
                        parents[p_idx] = pnew_idx
                        if pnew_idx in E:
                            E[pnew_idx].add(p_idx)
                        else:
                            E[pnew_idx] = set([p_idx])
            rrt_iter +=1
            if rrt_iter==len(cached_points):
                planning_done=True
                nbrs.fit(V_xy)
                p4dist = np.zeros(4)
                p4dist[:2] = goal_xy[:2];p4dist[2] = np.cos(goal_xy[2]);p4dist[3] = np.sin(goal_xy[2])
                points_near_goal = []
                add = 0
                while len(points_near_goal)==0:
                    dist,points_near_goal = nbrs.radius_neighbors(p4dist, self.goal_tolerance+add, return_distance = True)
                    points_near_goal = points_near_goal[0]
                    add +=0.1
                print "DONE PLANNING"
                print "TIME TAKEN",time.time()-t1
                print "POINTS NEAR GOAL",points_near_goal

        #self.samp_point_pub.publish(marker_points)
        """
        Find best path:
        """
        min_cost = None
        for i in points_near_goal:
            c_path = self.integrate_costs(edge_C,parents,i)
            if min_cost is None or c_path < min_cost:
                m = i
                min_cost = c_path
        self.publish_rrt(V,E)   
        print "MINIMUM PATH COST RRT",min_cost
        path = self.get_path(parents,V,m)
        pt = path_to_pose(path)            
        print('total time: ', time.time()-t1)
        self._path_pub.publish(pt)
        return pt,path
Example #46
0
def detect_and_filter_keypoints(im_gray, corner_type='HARRIS'):
    
    image = cv2.cvtColor(im_gray, cv2.COLOR_GRAY2BGR)
    image = cv2.GaussianBlur(image, (5, 5), 0)
    corners = None
    if corner_type == 'HARRIS':
        corners = cv2.goodFeaturesToTrack(im_gray, 400, 0.005, 5, useHarrisDetector=True)
        if corners is None:
            return
        corners = np.int0(corners)
    elif corner_type == 'HOUGH_LINES':
        im_edge = cv2.Canny(im_gray, 100, 200)
        corners = detect_lines(im_edge, im_gray)

    temp_corners = []
    ground_z = 0.0
    for i in corners:
        x,y = i.ravel()
        
        a00 = x * proj_matrix_[2, 0] - proj_matrix_[0, 0]
        a01 = x * proj_matrix_[2, 1] - proj_matrix_[0, 1]
        a10 = y * proj_matrix_[2, 0] - proj_matrix_[1, 0]
        a11 = y * proj_matrix_[2, 1] - proj_matrix_[1, 1]
        bv0 = proj_matrix_[0, 2] * ground_z + proj_matrix_[0, 3] -  \
              x * proj_matrix_[2, 2] * ground_z - x * proj_matrix_[2, 3]
        bv1 = proj_matrix_[1, 2] * ground_z + proj_matrix_[1, 3] -  \
              y * proj_matrix_[2, 2] * ground_z - y * proj_matrix_[2, 3]
        partition = a11 * a00 - a01 * a10
        pos_x = (a11 * bv0 - a01 * bv1) / partition
        pos_y = (a00 * bv1 - a10 * bv0) / partition
        
        temp_corners.append((pos_x, pos_y, ground_z))
        #temp_corners.append((x,y))
        cv2.circle(image, (x, y), 3, (0, 0, 255), -1)
        

    # find the neighbours
    radius_thresh = 6.0
    #n_neighbors = 4,
    nearest_neigb = NearestNeighbors(radius = radius_thresh, \
                                     algorithm="kd_tree", \
                                     leaf_size = 30, \
                                     metric = 'euclidean').fit(np.array(temp_corners))
    #distances, indices = nearest_neigb.kneighbors(np.array(temp_corners))
    distances, indices = nearest_neigb.radius_neighbors(np.array(temp_corners))    
    
    

    temp_im = image.copy()
    possible_candidate = []
    for distance, index in zip(distances, indices):  # fix to not repeat on done array        
        centeroid_index = -1
        for dist, idx in zip(distance, index):
            if dist == 0:
                centeroid_index = idx
                break
                
        #print "\033[34m CENTROID: \033[0m", centeroid_index
        #cv2.circle(image, (corners[centeroid_index][0][0],\
        #                   corners[centeroid_index][0][1]), 5, (0, 255, 0), 2)
        #print "INDEX: ", index, "\t", np.average(distance), "\t", distance

        #avg_dist = np.average(distance)
        #if (avg_dist > 2.5 and avg_dist < 5):
        for dist, idx in zip(distance, index):
            angle = vector_angle(temp_corners[centeroid_index], temp_corners[idx]) * (180.0/np.pi)
            if (dist > (radius_thresh/2) and dist < radius_thresh) and(angle > 75 and angle < 180):
                possible_candidate.append(idx)                

        #     print "ANGLE: ", angle, "\t", dist, "\t", idx
        #     cv2.circle(image, (corners[idx][0][0], corners[idx][0][1]), 3, (255, 255, 0), -1)
        #     plot_image("edge1", image)
        #     cv2.waitKey(0)

        # image = temp_im.copy()


    #print (possible_candidate)
    
    centroid_point = []
    if len(possible_candidate) > 3:
        aver_x = 0
        aver_y = 0
        for pc in possible_candidate:
            cv2.circle(image, (corners[pc][0][0], corners[pc][0][1]), 4, (255, 255, 0), 1)
            aver_x += corners[pc][0][0]
            aver_y += corners[pc][0][1]
        aver_x /= len(possible_candidate)
        aver_y /= len(possible_candidate)
        centroid_point.append((aver_x, aver_y))
        
        cv2.circle(image, (aver_x, aver_y), 7, (0, 255, 0), -1)
    plot_image("edge1", image)
    
    return (np.array(possible_candidate), np.array(centroid_point))
Example #47
0
def continuity_filter(wood, leaf, rad=0.05):
    """
    Function to apply a continuity filter to a point cloud that contains gaps
    defined as points from a second point cloud.
    This function works assuming that the continuous variable is the
    wood portion of a tree point cloud and the gaps in it are empty space
    or misclassified leaf data. In this sense, this function tries to correct
    gaps where leaf points are present.

    Args:
        wood (array): Wood point cloud to be filtered.
        leaf (array): Leaf point cloud, with points that may be causing
            discontinuities in the wood point cloud.
        rad (float): Radius to search for neighboring points in the iterative
            process.

    Returns:
        wood (array): Filtered wood point cloud.
        not_wood (array): Remaining point clouds after the filtering.

    """

    # Stacking wood and leaf arrays.
    arr = np.vstack((wood, leaf))

    # Obtaining wood point cloud indices.
    wood_id = np.arange(wood.shape[0])

    # Calculating shortest path graph over sampled array.
    G = array_to_graph(arr, 0, 3, 100, 0.05, 0.02, 0.5)
    _, dist = extract_path_info(G, 0, return_path=False)

    # Generating nearest neighbors search for the entire point cloud (arr).
    nbrs = NearestNeighbors(algorithm='kd_tree', leaf_size=10,
                            n_jobs=-1).fit(arr)

    # Converting dist variable to array, as it is originally a list.
    dist = np.asarray(dist)

    # Selecting points and accumulated distance for all wood points in arr.
    gp = arr[wood_id]
    d = dist[wood_id]

    # Preparing control variables to iterate over. idbase will be all initial
    # wood ids and pts all initial wood points. These variables are the ones
    # to use in search of possibly misclassified neighbors.
    idbase = wood_id
    pts = gp

    # Setting threshold variables for the iterative process.
    e = 9999999
    e_threshold = 3

    # Iterating until threshold is met.
    while e > e_threshold:

        # Obtaining the neighbor indices of current set of points (pts).
        idx2 = nbrs.radius_neighbors(pts, radius=rad, return_distance=False)

        # Initializing temporary variable id1.
        id1 = []
        # Looping over nn search indices and comparing their respective
        # distances to center point distance. If nearest neighbor distance (to
        # point cloud base) is smaller than center point distance, then ith
        # point is also wood.
        for i in range(idx2.shape[0]):
            for i_ in idx2[i]:
                if dist[i_] <= (d[i]):
                    id1.append(i_)

        # Uniquifying id1.
        id1 = np.unique(id1)

        # Comparing original idbase to new wood ids (id1).
        comp = np.in1d(id1, idbase)

        # Maintaining only new ids for next iteration.
        diff = id1[np.where(~comp)[0]]
        idbase = np.unique(np.hstack((idbase, id1)))

        # Passing new wood points to pts and recalculating e value.
        pts = arr[diff]
        e = pts.shape[0]

        # Passing accumulated distances from new points to d.
        d = dist[diff]

        # Stacking new points to initial wood points and removing duplicates.
        gp = np.vstack((gp, pts))
        gp = remove_duplicates(gp)

    # Removing duplicates from final wood points and obtaining not_wood points
    # from the difference between final wood points and full point cloud.
    wood = remove_duplicates(gp)
    not_wood = get_diff(wood, arr)

    return wood, not_wood
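# The loop above grows the wood set by repeatedly pulling in radius neighbors of
# newly accepted points until no new points appear. Below is a minimal,
# self-contained sketch of that region-growing pattern; the toy cloud, seed
# indices and radius are made up, and the shortest-path distance test used by
# continuity_filter is omitted.
import numpy as np
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(42)
cloud = rng.rand(200, 3)              # stand-in for the stacked wood + leaf cloud
seed_ids = np.arange(20)              # stand-in for the initial wood indices

nbrs = NearestNeighbors().fit(cloud)
known = set(seed_ids.tolist())
frontier = cloud[seed_ids]

while len(frontier) > 0:
    idx = nbrs.radius_neighbors(frontier, radius=0.1, return_distance=False)
    candidates = np.unique(np.concatenate(idx))
    new_ids = np.array([i for i in candidates if i not in known], dtype=int)
    known.update(new_ids.tolist())
    frontier = cloud[new_ids]         # only newly added points are re-queried

grown = cloud[sorted(known)]
print(len(seed_ids), '->', len(grown))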
Example #48
0
    def match_by_neighbor(self, caliper):
        '''
        Performs propensity score matching.

        Parameters
        ----------
        caliper : the attribute returned by the set_caliper() function

        Returns
        -------
        under_matched : Set
        unmatched : Set
        matched_controls : Pandas DataFrame
        '''
        ignore_list = set()
        under_matched = set()
        unmatched = set()
        matched_controls = pd.DataFrame()

        ratio = self.k
        data = self.df

        # convert data type
        data.INDEX_DATE = pd.to_datetime(data.INDEX_DATE)
        data.CASE = data.CASE.astype(int)

        controls = data[data.CASE == 0]
        cases = data[data.CASE == 1]
        
        neigh = NearestNeighbors(radius=caliper, algorithm='ball_tree', n_jobs=1)
        neigh.fit(controls[['PSCORE']])

        # calculate time
        i = 1
        total_cases = cases.shape[0]
        start = timeit.default_timer()
        
        #loop through each case
        for index, case in cases.iterrows():
            
            # case index date
            case_indexdate = cases[cases.PATID == case['PATID']].INDEX_DATE.values[0]
            
            # current case's pscore
            pscore = case.PSCORE
            
            # find all controls with pscore within the caliper distance
            distances, indices = neigh.radius_neighbors([[pscore]])
            
            sample = controls.iloc[indices[0]]
            
            # pick out those that have NOT been used
            sample = sample[~sample['PATID'].isin(ignore_list)].copy()
            
            ## verify index date for control
            sample['INDEX_DATE_GAP'] = abs(sample.INDEX_DATE - case_indexdate) / np.timedelta64(1, 'D')
            sample = sample[sample.INDEX_DATE_GAP <= self.gap].sort_values(by=['PATID', 'INDEX_DATE_GAP'])
            
            # rank the samples by their distances to the case's pscore
            sample['DIST'] = abs(sample['PSCORE']-pscore)
            sample.sort_values(by='DIST', ascending=True, inplace=True)
            
            # pick the closest "ratio" controls
            sample = sample.head(ratio).copy().reset_index(drop=True)
            
            if (sample.shape[0] < ratio and sample.shape[0] != 0):
                under_matched.add(case['PATID'])
            if (sample.shape[0] == 0):
                unmatched.add(case['PATID'])
                
            # exclude the selected sample from the matching pool (i.e., without replacement)
            ignore_list.update(sample['PATID'])
            
            sample['MATCHED_CASE'] = case['PATID']
            sample['MATCHED_CASE_INDEX_DATE'] = case_indexdate
            
            matched_controls = matched_controls.append(sample, ignore_index=True)
            
            # track progress
            stop = timeit.default_timer()
            
            print("Current progress:", np.round(i/total_cases * 100, 2), "%")
            print("Current run time:", np.round((stop - start) / 60, 2), "min")
            
            i = i+1

        matched_controls = matched_controls.reset_index(drop=True)
        cases = find_case(data, unmatched)

        self.matched_controls = matched_controls
        self.unmatched = unmatched

        write_matched_data(self.path, cases, matched_controls)

        return under_matched, unmatched, matched_controls
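# The heart of the method above is a caliper query on the propensity score:
# fit NearestNeighbors on the controls' scores and pull every control whose
# score lies within the caliper of a case's score. A reduced sketch with
# made-up scores and caliper (column names simply mirror the code above).
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors

controls = pd.DataFrame({'PATID': [1, 2, 3, 4],
                         'PSCORE': [0.10, 0.12, 0.30, 0.55]})
caliper = 0.05

neigh = NearestNeighbors(radius=caliper, algorithm='ball_tree')
neigh.fit(controls[['PSCORE']])

case_pscore = 0.11
distances, indices = neigh.radius_neighbors([[case_pscore]])
candidates = controls.iloc[indices[0]].copy()
candidates['DIST'] = np.abs(candidates['PSCORE'] - case_pscore)
print(candidates.sort_values('DIST'))  # controls 1 and 2 fall inside the caliper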
Example #49
0
import numpy as np
from sklearn.neighbors import NearestNeighbors
'''
NearestNeighbors(n_neighbors=5, radius=1.0, algorithm='auto',
                 leaf_size=30, metric='minkowski', p=2,
                 metric_params=None, n_jobs=None, **kwargs)
'''
data = np.array([[1, 1], [1, 2], [2, 1], [2, 3], [1, 5], [6, 8], [7, 9],
                 [6, 9], [8, 8], [8, 10], [14, 1], [14, 2], [15, 1], [15, 3]])

nearest_neighbors_model = NearestNeighbors(n_neighbors=5,
                                           algorithm='auto',
                                           radius=1.0)
nearest_neighbors_model.fit(data)

#Calculating Details
print('NearestNeighborsModel kneighbors are : ',
      nearest_neighbors_model.kneighbors(data))
print("-------------------------------------------------------")
print('NearestNeighborsModel radius kneighbors are : ',
      nearest_neighbors_model.radius_neighbors(data))
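# Note on the output above: kneighbors returns fixed-width arrays, while
# radius_neighbors returns one variable-length array per query point, so its
# result is usually unpacked and iterated. Continuation of the snippet above.
k_dist, k_idx = nearest_neighbors_model.kneighbors(data)        # both shaped (14, 5)
r_dist, r_idx = nearest_neighbors_model.radius_neighbors(data)  # arrays of per-point arrays
print(k_idx.shape, len(r_idx), r_idx[0])                        # neighbors of data[0] within radius 1.0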
Example #50
0
    def get_neighbors(self, pts, radius):
        pts = self.transform_points(pts)
        radius = radius * self.pixels_per_meter
        nn = NearestNeighbors(n_neighbors=5, radius=radius, p=2)
        nn.fit(pts)
        return nn.radius_neighbors(pts, return_distance=False)
Example #51
0
def mean_shift_cosine(X,
                      bandwidth=None,
                      seeds=None,
                      cluster_all=True,
                      GPU=True):
    """Perform mean shift clustering of data using a flat kernel.

    Read more in the :ref:`User Guide <mean_shift>`.

    Parameters
    ----------

    X : array-like, shape=[n_samples, n_features]
        Input data.

    bandwidth : float, optional
        Kernel bandwidth.

        If bandwidth is not given, it is determined using a heuristic based on
        the median of all pairwise distances. This will take quadratic time in
        the number of samples. The sklearn.cluster.estimate_bandwidth function
        can be used to do this more efficiently.

    seeds : array-like, shape=[n_seeds, n_features] or None
        Points used as initial kernel locations.

    cluster_all : boolean, default True
        If true, then all points are clustered, even those orphans that are
        not within any kernel. Orphans are assigned to the nearest kernel.
        If false, then orphans are given cluster label -1.

    GPU : bool, default True
        If true, use the GPU-based (faster) mean-shift implementation.


    Returns
    -------

    cluster_centers : array, shape=[n_clusters, n_features]
        Coordinates of cluster centers.

    labels : array, shape=[n_samples]
        Cluster labels for each point.


    """

    if bandwidth is None:
        bandwidth = estimate_bandwidth(X)
    elif bandwidth <= 0:
        raise ValueError("bandwidth needs to be greater than zero or None,\
            got %f" % bandwidth)
    if seeds is None:
        if GPU == True:
            seeds = gpu_seed_generator(X)

    #adjusted=False
    n_samples, n_features = X.shape
    center_intensity_dict = {}
    nbrs = NearestNeighbors(radius=bandwidth, metric='cosine').fit(X)
    #NearestNeighbors(radius=bandwidth, n_jobs=n_jobs, metric='cosine').radius_neighbors()

    global SEED_NUM
    if GPU == True:
        #GPU ver
        while True:
            labels, number = meanshift_torch(X, seeds,
                                             bandwidth)  #gpu calculation
            for i in range(len(number)):
                if number[i] is not None:
                    center_intensity_dict[tuple(
                        labels[i])] = number[i]  #find out cluster

            if not center_intensity_dict:
                # nothing near seeds
                raise ValueError(
                    "No point was within bandwidth=%f of any seed."
                    " Try a different seeding strategy \
                             or increase the bandwidth." % bandwidth)

            # POST PROCESSING: remove near duplicate points
            # If the distance between two kernels is less than the bandwidth,
            # then we have to remove one because it is a duplicate. Remove the
            # one with fewer points.

            sorted_by_intensity = sorted(center_intensity_dict.items(),
                                         key=lambda tup: (tup[1], tup[0]),
                                         reverse=True)
            sorted_centers = np.array([tup[0] for tup in sorted_by_intensity])
            unique = np.ones(len(sorted_centers), dtype=bool)
            nbrs = NearestNeighbors(radius=bandwidth,
                                    metric='cosine').fit(sorted_centers)
            for i, center in enumerate(sorted_centers):
                if unique[i]:
                    neighbor_idxs = nbrs.radius_neighbors(
                        [center], return_distance=False)[0]
                    unique[neighbor_idxs] = 0
                    unique[i] = 1  # leave the current point as unique
            cluster_centers = sorted_centers[unique]

            # assign labels
            nbrs = NearestNeighbors(n_neighbors=1,
                                    metric='cosine').fit(cluster_centers)
            labels = np.zeros(n_samples, dtype=int)
            distances, idxs = nbrs.kneighbors(X)
            if cluster_all:
                labels = idxs.flatten()
            else:
                labels.fill(-1)
                bool_selector = distances.flatten() <= bandwidth
                labels[bool_selector] = idxs.flatten()[bool_selector]

            #Test
            #break

            bg_num = np.sum(labels == 0)
            r = 1 - bg_num / labels.size
            #seed number adjust
            dict_len = len(cluster_centers)  #cluster number

            N = get_N(0.95, r, dict_len)

            if L * N <= SEED_NUM:  #safety area
                #SEED_NUM -= 200#test
                if H * N <= SEED_NUM:
                    SEED_NUM -= N  #too many seeds, adjust

                break
            else:
                seeds = gpu_seed_adjust(X)  #too few seeds, adjust

        return cluster_centers, labels
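# The post-processing above keeps only the strongest kernel center within each
# bandwidth-sized neighborhood. A small isolated sketch with made-up centers
# already sorted by intensity; the default Euclidean metric is used here for
# readability, whereas the code above uses metric='cosine'.
import numpy as np
from sklearn.neighbors import NearestNeighbors

sorted_centers = np.array([[0.0, 0.0], [0.05, 0.0], [1.0, 1.0], [1.02, 1.0]])
bandwidth = 0.1

unique = np.ones(len(sorted_centers), dtype=bool)
nbrs = NearestNeighbors(radius=bandwidth).fit(sorted_centers)
for i, center in enumerate(sorted_centers):
    if unique[i]:
        neighbor_idxs = nbrs.radius_neighbors([center], return_distance=False)[0]
        unique[neighbor_idxs] = 0   # suppress everything in this neighborhood...
        unique[i] = 1               # ...except the current (stronger) center
cluster_centers = sorted_centers[unique]
print(cluster_centers)              # [[0. 0.] [1. 1.]]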
Example #52
0
def mean_shift(X, intensities=None, bandwidth=None, seeds=None,
               cluster_all=True, max_iterations=300, verbose=False, use_scipy=True):
    """mean_shift(X, intensities=None, bandwidth=None, seeds=None,
                  cluster_all=True, max_iterations=300, verbose=False, use_scipy=True)

    Mean shift algorithm

    Implementation taken from scikit-learn with two minor variants:

        - Use (by default) scipy KD-trees, which are faster in our case
        - weighted version of mean-shift using `intensities` as
          weights (i.e., we compute centers of mass rather than means)

    Parameters
    ----------

    X : array-like, shape=[n_samples, n_features]
        Input data.

    intensities : array-like, shape=[n_samples]
        Voxel intensities, used to weight the mean

    bandwidth : float
        Kernel bandwidth.

    seeds : array-like, shape=[n_seeds, n_features]
        Points used as initial kernel locations.

    use_scipy : bool
        If true use cKDTree from scipy.spatial, otherwise
        use NearestNeighbors from sklearn.neighbors

    Returns
    -------

    cluster_centers : array, shape=[n_clusters, n_features]
        Coordinates of cluster centers.

    labels : array, shape=[n_samples]
        Cluster labels for each point.

    volumes : array, shape=[n_clusters]
        Volume of each cluster (# of points in the cluster)

    masses : array, shape=[n_clusters]
        Mass of each cluster (sum of intensities of points in the cluster).

    trajectories : list
        MS trajectories for debugging purposes.
    """
    if seeds is None:
        seeds = X
    n_points, n_features = X.shape
    stop_thresh = 1e-3 * bandwidth  # when mean has converged
    center_volume_dict = {}
    center_mass_dict = {}
    # tee.log('Fitting NearestNeighbors on', n_points, 'points')
    if use_scipy:
        kdtree = cKDTree(X)
    else:
        nbrs = NearestNeighbors(radius=bandwidth).fit(X)

    # For each seed, climb gradient until convergence or max_iterations
    trajectories = {}  # for each seed, a list of points
    tee.log('Moving kernels for', len(seeds), 'seeds')
    pbar = pb.ProgressBar(widgets=['Moving %d seeds: ' % len(seeds), pb.Percentage()],
                          maxval=len(seeds)).start()
    for seed_no, my_mean in enumerate(seeds):
        completed_iterations = 0
        seed = my_mean
        trajectories[seed_no] = []
        while True:
            # Find mean of points within bandwidth
            if use_scipy:
                i_nbrs = kdtree.query_ball_point(my_mean, r=bandwidth)
            else:
                i_nbrs = nbrs.radius_neighbors([my_mean], bandwidth,
                                               return_distance=False)[0]
            points_within = X[i_nbrs]
            if len(points_within) == 0:
                break  # Depending on seeding strategy this condition may occur
            my_old_mean = my_mean  # save the old mean
            if intensities is None:
                my_mean = np.mean(points_within, axis=0)
            else:
                my_mean = np.average(points_within, axis=0, weights=intensities[i_nbrs])
            # If converged or at max_iterations, add the cluster
            if extmath.norm(my_mean - my_old_mean) < stop_thresh or completed_iterations == max_iterations:
                center_volume_dict[tuple(my_mean)] = len(points_within)
                center_mass_dict[tuple(my_mean)] = sum(intensities[i_nbrs])
                break
            completed_iterations += 1
            trajectories[seed_no].append(my_mean)
        if verbose:
            print('seed', seed, '-->', my_mean,
                  center_volume_dict[tuple(my_mean)], center_mass_dict[tuple(my_mean)], completed_iterations)

        pbar.update(seed_no+1)
    pbar.finish()
    # POST PROCESSING: remove near duplicate points
    # If the distance between two kernels is less than the bandwidth,
    # then we have to remove one because it is a duplicate. Remove the
    # one with fewer points.
    sorted_by_intensity = sorted(center_mass_dict.items(),
                                 key=lambda tup: tup[1], reverse=True)
    sorted_centers = np.array([tup[0] for tup in sorted_by_intensity])
    unique = np.ones(len(sorted_centers), dtype=bool)
    print('started from', len(seeds), 'seeds, now |unique|=', len(unique))
    # print('|center_mass_dict|=', len(center_mass_dict))
    if len(center_mass_dict) == 0:
        tee.log('No valid seeds. Giving up')
        return None, None, None, None, None

    nbrs = NearestNeighbors(radius=bandwidth).fit(sorted_centers)
    for i, center in enumerate(sorted_centers):
        if unique[i]:
            neighbor_idxs = nbrs.radius_neighbors([center],
                                                  return_distance=False)[0]
            unique[neighbor_idxs] = 0
            unique[i] = 1  # leave the current point as unique
    cluster_centers = sorted_centers[unique]
    print('|cluster_centers|=', len(cluster_centers))
    volumes = [0]*len(cluster_centers)
    masses = [0]*len(cluster_centers)
    for i, c in enumerate(cluster_centers):
        volumes[i] = center_volume_dict[tuple(c)]
        masses[i] = center_mass_dict[tuple(c)]
    # ASSIGN LABELS: a point belongs to the cluster that it is closest to
    nbrs = NearestNeighbors(n_neighbors=1).fit(cluster_centers)
    labels = np.zeros(n_points, dtype=int)
    distances, idxs = nbrs.kneighbors(X)
    if cluster_all:
        labels = idxs.flatten()
    else:
        labels[:] = -1
        bool_selector = distances.flatten() <= bandwidth
        labels[bool_selector] = idxs.flatten()[bool_selector]
    return cluster_centers, labels, volumes, masses, trajectories
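# The docstring above swaps in scipy's cKDTree purely for speed; a quick check
# on synthetic data that cKDTree.query_ball_point and
# NearestNeighbors.radius_neighbors return the same bandwidth neighborhood.
import numpy as np
from scipy.spatial import cKDTree
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(0)
X = rng.rand(100, 3)
bandwidth = 0.2
query = X[0]

kdtree = cKDTree(X)
i_scipy = sorted(kdtree.query_ball_point(query, r=bandwidth))

nbrs = NearestNeighbors(radius=bandwidth).fit(X)
i_sklearn = sorted(nbrs.radius_neighbors([query], return_distance=False)[0])

print(i_scipy == i_sklearn)  # True: both give the same neighbor set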
Example #53
0
import numpy as np
from sklearn.neighbors import NearestNeighbors
samples = [[0, 0, 2], [1, 0, 0], [0, 0, 1]]
neigh = NearestNeighbors(n_neighbors=2, radius=0.4)
neigh.fit(samples)

neigh.kneighbors([[0, 0, 1.3]], 2, return_distance=False)

nbrs = neigh.radius_neighbors([[0, 0, 1.3]], 0.4, return_distance=False)
np.asarray(nbrs[0][0])
Example #54
0
    def fit(self, X, y=None, sample_weight=None):
        """Perform common-nearest-neighbor clustering

        Cluster from features, or distance matrix.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape
            (n_samples, n_features), or (n_samples, n_samples)
            Training instances to cluster, or distances between
            instances if `metric='precomputed'`.
            If a sparse matrix is provided, it will
            be converted into a sparse `csr_matrix`.

        sample_weight : array-like of shape (n_samples,), default=None
            Weight of each sample.  Note that this option is not
            fully supported at the moment.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self

        """

        if LooseVersion(sklearn.__version__) < LooseVersion("0.23.0"):
            X = check_array(X, accept_sparse="csr")
        else:
            X = self._validate_data(X, accept_sparse="csr")

        if not self.eps > 0.0:
            raise ValueError("eps must be positive.")

        if sample_weight is not None:
            warnings.warn("Sample weights are not fully supported, yet.",
                          UserWarning)
            if LooseVersion(sklearn.__version__) < LooseVersion("0.23.0"):
                sample_weight = np.asarray(sample_weight)
                check_consistent_length(X, sample_weight)
            else:
                sample_weight = _check_sample_weight(sample_weight, X)

        # Calculate neighborhood for all samples. This leaves the
        # original point in, which needs to be considered later
        # (i.e. point i is in the neighborhood of point i). While true,
        # this is useless information.
        if self.metric == "precomputed" and sparse.issparse(X):
            # set the diagonal to explicit values, as a point is its own
            # neighbor
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", sparse.SparseEfficiencyWarning)
                X.setdiag(X.diagonal())

        neighbors_model = NearestNeighbors(
            radius=self.eps,
            algorithm=self.algorithm,
            leaf_size=self.leaf_size,
            metric=self.metric,
            metric_params=self.metric_params,
            p=self.p,
            n_jobs=self.n_jobs,
        )
        neighbors_model.fit(X)
        # This has worst case O(n^2) memory complexity
        neighborhoods = neighbors_model.radius_neighbors(X,
                                                         return_distance=False)

        if sample_weight is None:
            n_neighbors = np.array(
                [len(neighbors) for neighbors in neighborhoods])
        else:
            n_neighbors = np.array([
                np.sum(sample_weight[neighbors]) for neighbors in neighborhoods
            ])

        # Initially, all samples are noise.
        labels = np.full(X.shape[0], -1, dtype=np.intp)

        # Account for self neighbour membership (self.min_samples + 2)
        corrected_min_samples = self.min_samples + 2

        # Array tracking points qualified for similarity check
        core_candidates = np.asarray(n_neighbors >= corrected_min_samples)

        commonnn_inner(neighborhoods, labels, core_candidates,
                       corrected_min_samples)

        self.labels_ = labels

        return self
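# As the comment above points out, every point is a member of its own
# eps-neighborhood, so the raw counts (or weighted sums) derived from
# radius_neighbors include the point itself. A toy illustration; the data,
# radius and weights below are made up.
import numpy as np
from sklearn.neighbors import NearestNeighbors

X = np.array([[0.0], [0.1], [0.2], [5.0]])
sample_weight = np.array([1.0, 2.0, 1.0, 1.0])

neighborhoods = NearestNeighbors(radius=0.15).fit(X).radius_neighbors(
    X, return_distance=False)

plain = np.array([len(n) for n in neighborhoods])                   # self included
weighted = np.array([np.sum(sample_weight[n]) for n in neighborhoods])
print(plain, weighted)  # [2 3 2 1] [3. 4. 3. 1.]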
Example #55
0
    def plan_cached_rrt(self,cached_points):
        """
        RRT* Algorithm
        """
        marker_points = MarkerArray()

        vol_freecells = len(self._freecells)*self._navmap.info.resolution**2
        print "FREE CELL VOLUME", vol_freecells
        gamma_rrg = 2*sqrt(1.5*vol_freecells/pi)
        probot = np.array([self._robot_pose.pose.position.x,self._robot_pose.pose.position.y])
        nbrs = NearestNeighbors(n_neighbors=1,algorithm="kd_tree",leaf_size = 30)
        # V is the list of vertices. E is a dictionary connecting each key to its values.
        # parents is a dictionary mapping each node to its parent; since each node is a key, it has only one parent.
        V = [probot]
        E = {}
        parents = {}
        Dist = [0.0]

        # C stores the cost at vertex idx, which is the sum of the edges going to it.
        goal_xy = np.array([self._goal.pose.position.x,self._goal.pose.position.y])
        c_init = self.cost_manager.get_cost(probot,goal_xy)
        edge_C = {}
        C = [c_init]
        #flann = FLANN()
        lowest_cost_idx = None
        #params = flann.build_index(np.array(V))
        #pdb.set_trace()
        t1 = time.time()

        planning_done = False
        rrt_iter = 0
        while not planning_done:
            t2 = time.time()
            #bias*=1.0025 # increase the goal bias as the RRT progresses
            """
            Sampling new point
            """
            cached = cached_points[rrt_iter]
            prand = cached["prand"]
            pnearest_idx = cached["pnearest_idx"]
            pnearest = V[pnearest_idx]
            #mrk = self.make_sample_marker(prand)
            #marker_points.markers.append(mrk)
            #print len(marker_points.markers)
            #self.samp_point_pub.publish(mrk)

            """
            Turning new point into reachable point
            """
            pnew = cached["pnew"]
            #Pnear_idx,pnear_dist = flann.nn_radius(pnew, r)
            Pnear_idx = cached["Pnear_idx"]
            pmin_idx = pnearest_idx
            c_nearest = C[pnearest_idx]

            c_new =self.cost_manager.get_cost(pnew,goal_xy)
            cum_c = self.integrate_costs(edge_C,parents,pnearest_idx)
            min_edge_c =self.cost_manager.edge_cost(c_nearest,c_new,pnearest,pnew)
            cmin = cum_c +min_edge_c
            #if len(Pnear_idx)>5:
            #   Pnear_idx = Pnear_idx[:5]
            cumulative_costs = []
            for p_idx in Pnear_idx:
                p = V[p_idx]
                c_near = C[p_idx]

                cum_cost = self.integrate_costs(edge_C,parents,p_idx)
                cumulative_costs.append(cum_cost)
                edge_c = self.cost_manager.edge_cost(c_near,c_new,p,pnew)
                c = cum_cost + edge_c

                if (self.segment_safe(p,pnew) is True and 
                    c < cmin):
                    cmin = c
                    min_edge_c = edge_c
                    pmin_idx = p_idx      

            if E.has_key(pmin_idx):
                E[pmin_idx].add(len(V))
            else:
                E[pmin_idx] = set([len(V)])   
            edge_C[pmin_idx,len(V)] = min_edge_c  
            cumulative_last = cmin     
            pnew_idx = len(V)
            V.append(pnew)
            C.append(c_new)
            parents[pnew_idx] = pmin_idx
            """
            Re-wire the tree
            """
            for en,p_idx in enumerate(Pnear_idx):
                # so if the near nodes, have children
                #parent
                if parents.has_key(p_idx):
                    p = V[p_idx]
                    c_near = C[p_idx]
                    e_c = self.cost_manager.edge_cost(c_near,c_new,p,pnew)
                    c = cumulative_last + e_c
                    if (self.segment_safe(p,pnew) is True and 
                        c < cumulative_costs[en]):
                        E[parents[p_idx]].remove(p_idx)
                        edge_C.pop((parents[p_idx], p_idx))
                        edge_C[pnew_idx,p_idx] = e_c
                        parents[p_idx] = pnew_idx
                        if E.has_key(pnew_idx):
                            E[pnew_idx].add(p_idx)
                        else:
                            E[pnew_idx] = set([p_idx])

            rrt_iter +=1
            if rrt_iter==len(cached_points):
                planning_done=True
                nbrs.fit(V)
                dist,points_near_goal = nbrs.radius_neighbors([goal_xy], self.goal_tolerance+0.1, return_distance=True)
                points_near_goal = points_near_goal[0]
                print "DONE PLANNING"
                print "TIME TAKEN",time.time()-t1
                print "POINTS NEAR GOAL",points_near_goal

        #self.samp_point_pub.publish(marker_points)
        """
        Find best path:
        """
        min_cost = None
        for i in points_near_goal:
            c_path = self.integrate_costs(edge_C,parents,i)
            if min_cost is None or c_path < min_cost:
                m = i
                min_cost = c_path
        path = self.get_path(parents,V,m)
        path = self.smoothing(path)
        pt = path_to_pose(path)            
        print 'total time: ', time.time()-t1
        self.publish_rrt(V,E)        
        self._path_pub.publish(pt)
        return pt,path
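# The near-goal check at the end of the planner is a single radius query.
# radius_neighbors expects a 2D array of query points, which is why the goal
# is wrapped in a list; a standalone sketch with made-up vertices and tolerance.
import numpy as np
from sklearn.neighbors import NearestNeighbors

V = [np.array([0.0, 0.0]), np.array([1.0, 0.2]), np.array([2.0, 2.0])]
goal_xy = np.array([1.0, 0.0])
goal_tolerance = 0.3

nbrs = NearestNeighbors(n_neighbors=1, algorithm="kd_tree", leaf_size=30)
nbrs.fit(V)
dist, points_near_goal = nbrs.radius_neighbors([goal_xy],
                                               radius=goal_tolerance + 0.1,
                                               return_distance=True)
print(points_near_goal[0])  # [1]: only the second vertex lies within 0.4 of the goal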
Example #56
0
class GlobalKMeans(BaseEstimator, ClusterMixin, TransformerMixin):
    """Global K-means Algorithm

    Parameters:

    n_clusters: int
        maximum number of clusters to obtain
    algorithm: string
        'classical' the classical algorithm
        'bagirov' the Bagirov 2006 variant

    """

    def __init__(self, n_clusters, algorithm='classical'):
        self.n_clusters = n_clusters
        self.cluster_centers_ = None
        self.labels_ = None
        self.cluster_sizes_ = None
        self.inertia_ = None
        self.algorithm = algorithm

    def fit(self, X):
        """
        Clusters the examples
        :param X:
        :return:
        """

        if self.algorithm == 'classical':
            self.cluster_centers_, self.labels_, self.inertia_ = self._fit_process(X)
        elif self.algorithm == 'bagirov':
            self.cluster_centers_, self.labels_, self.inertia_ = self._fit_process_bagirov(X)

        return self

    def predict(self, X):
        """
        Returns the nearest cluster for a data matrix

        @param X:
        @return:
        """
        clasif = []
        for i in range(X.shape[0]):
            ncl, mdist = self._find_nearest_cluster(X[i].reshape(1, -1), self.cluster_centers_)
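            # NOTE: self.radius is not set in __init__ above; it is assumed to be assigned on the instance before predict is called.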
            if mdist <= self.radius:
                clasif.append(ncl)
            else:
                clasif.append(-1)
        return clasif

    def _fit_process(self, X):
        """
        Classical global k-means algorithm

        :param X:
        :return:
        """

        # Compute the centroid of the dataset
        centroids = sum(X) / X.shape[0]
        centroids.shape = (1, X.shape[1])

        for i in range(2, self.n_clusters + 1):
            mininertia = np.infty
            for j in range(X.shape[0]):
                newcentroids = np.vstack((centroids, X[j]))
                # print newcentroids.shape
                km = KMeans(n_clusters=i, init=newcentroids, n_init=1)
                km.fit(X)
                if mininertia > km.inertia_:
                    mininertia = km.inertia_
                    bestkm = km
            centroids = bestkm.cluster_centers_

        return bestkm.cluster_centers_, bestkm.labels_, bestkm.inertia_

    def _fit_process_bagirov(self, X):
        """
        Clusters using the global K-means algorithm Bagirov variation
        :param X:
        :return:
        """

        # Create a KNN structure for fast search
        self._neighbors = NearestNeighbors()
        self._neighbors.fit(X)

        # Compute the centroid of the dataset
        centroids = sum(X) / X.shape[0]
        assignments = [0 for i in range(X.shape[0])]

        centroids.shape = (1, X.shape[1])

        # compute the distance of the examples to the centroids
        mindist = np.zeros(X.shape[0])
        for i in range(X.shape[0]):
            mindist[i] = \
            euclidean_distances(X[i].reshape(1, -1), centroids[assignments[i]].reshape(1, -1), squared=True)[0]

        for k in range(2, self.n_clusters + 1):
            newCentroid = self._compute_next_centroid(X, centroids, assignments, mindist)
            centroids = np.vstack((centroids, newCentroid))
            km = KMeans(n_clusters=k, init=centroids, n_init=1)
            km.fit(X)
            assignments = km.labels_
            for i in range(X.shape[0]):
                mindist[i] = \
                euclidean_distances(X[i].reshape(1, -1), centroids[assignments[i]].reshape(1, -1), squared=True)[0]

        return km.cluster_centers_, km.labels_, km.inertia_

    def _compute_next_centroid(self, X, centroids, assignments, mindist):
        """
        Computes the candidate for the next centroid

        :param X:
        :param centroids:
        :return:
        """
        minsum = np.infty
        candCentroid = None

        # Compute the first candidate to new centroid
        for i in range(X.shape[0]):
            distance = euclidean_distances(X[i].reshape(1, -1), centroids[assignments[i]].reshape(1, -1))[0]
            S2 = self._neighbors.radius_neighbors(X[i].reshape(1, -1), radius=distance, return_distance=False)[0]
            S2centroid = np.sum(X[S2], axis=0) / len(S2)
            S2centroid.shape = (1, X.shape[1])
            cost = self._compute_fk(X, mindist, S2centroid)

            if cost < minsum:
                minsum = cost
                candCentroid = S2centroid

        # Compute examples for the new centroid
        S2 = []
        newDist = euclidean_distances(X, candCentroid.reshape(1, -1), squared=True)
        for i in range(X.shape[0]):
            if newDist[i] < mindist[i]:
                S2.append(i)

        newCentroid = sum(X[S2]) / len(S2)
        newCentroid.shape = (1, X.shape[1])

        while not (candCentroid == newCentroid).all():
            candCentroid = newCentroid
            S2 = []
            newDist = euclidean_distances(X, candCentroid.reshape(1, -1), squared=True)
            for i in range(X.shape[0]):
                if newDist[i] < mindist[i]:
                    S2.append(i)

            newCentroid = np.sum(X[S2], axis=0) / len(S2)
            newCentroid.shape = (1, X.shape[1])

        return candCentroid

    def _compute_fk(self, X, mindist, ccentroid):
        """
        Computes the cost function

        :param X:
        :param mindist:
        :param ccentroid:
        :return:
        """

        # Distances among the examples and the candidate centroid
        centdist = euclidean_distances(X, ccentroid.reshape(1, -1), squared=True)

        fk = 0
        for i in range(X.shape[0]):
            fk = fk + min(mindist[i], centdist[i][0])

        return fk

    @staticmethod
    def _find_nearest_cluster(examp, centers):
        """
        Finds the nearest cluster for an example
        :param examp:
        :param centers:
        :return:
        """

        dist = euclidean_distances(centers, examp.reshape(1, -1))

        pmin = np.argmin(dist)
        vmin = np.min(dist)

        return pmin, vmin
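# _compute_next_centroid above asks, for a candidate example, for every point
# lying no farther from it than it is from its own centroid, and averages those
# points into a candidate centroid. A reduced sketch of that single radius
# query; the toy data, single centroid and candidate index are made up.
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(0)
X = rng.rand(50, 2)
centroid = X.mean(axis=0, keepdims=True)     # the single current centroid

nbrs = NearestNeighbors().fit(X)
i = 0                                        # candidate example
dist_to_centroid = euclidean_distances(X[i].reshape(1, -1), centroid)[0, 0]

# Points no farther from X[i] than X[i] is from its current centroid.
S2 = nbrs.radius_neighbors(X[i].reshape(1, -1),
                           radius=dist_to_centroid,
                           return_distance=False)[0]
candidate_centroid = X[S2].mean(axis=0)
print(len(S2), candidate_centroid)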
Example #57
0
def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None,
           algorithm='auto', leaf_size=30, p=2, sample_weight=None, n_jobs=1):
    """Perform DBSCAN clustering from vector array or distance matrix.
    Read more in the :ref:`User Guide <dbscan>`.
    Parameters
    ----------
    X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
            array of shape (n_samples, n_samples)
        A feature array, or array of distances between samples if
        ``metric='precomputed'``.
    eps : float, optional
        The maximum distance between two samples for them to be considered
        as in the same neighborhood.
    min_samples : int, optional
        The number of samples (or total weight) in a neighborhood for a point
        to be considered as a core point. This includes the point itself.
    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by :func:`sklearn.metrics.pairwise_distances` for
        its metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square. X may be a sparse matrix, in which case only "nonzero"
        elements may be considered neighbors for DBSCAN.
    metric_params : dict, optional
        Additional keyword arguments for the metric function.
        .. versionadded:: 0.19
    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        The algorithm to be used by the NearestNeighbors module
        to compute pointwise distances and find nearest neighbors.
        See NearestNeighbors module documentation for details.
    leaf_size : int, optional (default = 30)
        Leaf size passed to BallTree or cKDTree. This can affect the speed
        of the construction and query, as well as the memory required
        to store the tree. The optimal value depends
        on the nature of the problem.
    p : float, optional
        The power of the Minkowski metric to be used to calculate distance
        between points.
    sample_weight : array, shape (n_samples,), optional
        Weight of each sample, such that a sample with a weight of at least
        ``min_samples`` is by itself a core sample; a sample with negative
        weight may inhibit its eps-neighbor from being core.
        Note that weights are absolute, and default to 1.
    n_jobs : int, optional (default = 1)
        The number of parallel jobs to run for neighbors search.
        If ``-1``, then the number of jobs is set to the number of CPU cores.
    Returns
    -------
    core_samples : array [n_core_samples]
        Indices of core samples.
    labels : array [n_samples]
        Cluster labels for each point.  Noisy samples are given the label -1.
    Notes
    -----
    For an example, see :ref:`examples/cluster/plot_dbscan.py
    <sphx_glr_auto_examples_cluster_plot_dbscan.py>`.
    This implementation bulk-computes all neighborhood queries, which increases
    the memory complexity to O(n.d) where d is the average number of neighbors,
    while original DBSCAN had memory complexity O(n).
    Sparse neighborhoods can be precomputed using
    :func:`NearestNeighbors.radius_neighbors_graph
    <sklearn.neighbors.NearestNeighbors.radius_neighbors_graph>`
    with ``mode='distance'``.
    References
    ----------
    Ester, M., H. P. Kriegel, J. Sander, and X. Xu, "A Density-Based
    Algorithm for Discovering Clusters in Large Spatial Databases with Noise".
    In: Proceedings of the 2nd International Conference on Knowledge Discovery
    and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996
    """
    if not eps > 0.0:
        raise ValueError("eps must be positive.")

    X = check_array(X, accept_sparse='csr')
    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight)
        check_consistent_length(X, sample_weight)

    # Calculate neighborhood for all samples. This leaves the original point
    # in, which needs to be considered later (i.e. point i is in the
    # neighborhood of point i; while true, this is useless information).
    if metric == 'precomputed' and sparse.issparse(X):
        neighborhoods = np.empty(X.shape[0], dtype=object)
        X.sum_duplicates()  # XXX: modifies X's internals in-place
        X_mask = X.data <= eps
        masked_indices = X.indices.astype(np.intp, copy=False)[X_mask]
        masked_indptr = np.concatenate(([0], np.cumsum(X_mask)))[X.indptr[1:]]

        # insert the diagonal: a point is its own neighbor, but 0 distance
        # means absence from sparse matrix data
        masked_indices = np.insert(masked_indices, masked_indptr,
                                   np.arange(X.shape[0]))
        masked_indptr = masked_indptr[:-1] + np.arange(1, X.shape[0])
        # split into rows
        neighborhoods[:] = np.split(masked_indices, masked_indptr)
    else:
        neighbors_model = NearestNeighbors(radius=eps, algorithm=algorithm,
                                           leaf_size=leaf_size,
                                           metric=metric,
                                           metric_params=metric_params, p=p,
                                           n_jobs=n_jobs)
        neighbors_model.fit(X)
        # This has worst case O(n^2) memory complexity
        neighborhoods = neighbors_model.radius_neighbors(X, eps,
                                                         return_distance=False)

    if sample_weight is None:
        n_neighbors = np.array([len(neighbors)
                                for neighbors in neighborhoods])
    else:
        n_neighbors = np.array([np.sum(sample_weight[neighbors])
                                for neighbors in neighborhoods])

    # Initially, all samples are noise.
    labels = -np.ones(X.shape[0], dtype=np.intp)

    # A list of all core samples found.
    core_samples = np.asarray(n_neighbors >= min_samples, dtype=np.uint8)
    dbscan_inner(core_samples, neighborhoods, labels)
    return np.where(core_samples)[0], labels
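# Before dbscan_inner takes over, the function above reduces to counting
# eps-neighborhoods and thresholding by min_samples. A tiny sketch of that
# core/noise split on made-up points; the label expansion step is omitted.
import numpy as np
from sklearn.neighbors import NearestNeighbors

X = np.array([[0.0, 0.0], [0.3, 0.0], [0.0, 0.3], [5.0, 5.0]])
eps, min_samples = 0.5, 3

neighborhoods = NearestNeighbors(radius=eps).fit(X).radius_neighbors(
    X, return_distance=False)
n_neighbors = np.array([len(n) for n in neighborhoods])  # each point counts itself
core_samples = n_neighbors >= min_samples
print(np.where(core_samples)[0])  # [0 1 2]; the outlier at (5, 5) is not core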
Example #58
0
    epxy = np.transpose([rx2,ry2])

    if local:
        #? does this function use the nrefstars parameter?
        nbrs = NN(n_neighbors=nrefstars, algorithm='auto').fit(epxy)

        # transformed coords. An array of zeros is created
        ctx = np.zeros(x1.size)
        cty = np.zeros(y1.size)
        nne = np.zeros(x1.size) - 1

        if rad_ext!=0:

            print '\nYou want %d refstars to transform each star' %(nrefstars-1)

            dist, nei = nbrs.radius_neighbors(np.transpose([x2,y2]),radius=rad_ext)
            nbors      = np.array([len(d) for d in dist])

            #nei holds one array per star to analyze, with the positions (indices)
            #of the refstars
            #printing nei gives the indices of the refstars selected for all stars
            #print 'nei:', nei
            #print 'nei:', nei.shape
            #print 'nei:', nei[0]
            #print 'refstars at distance < rad_ext (id) \n', rid1[nei[0]]
            #print 'Number of refstars found per star (r<rad_ext):', nbors

            #dist holds one array per star to analyze, with the distances of the
            #refstars
            #? how do I print the distances from smallest to largest
            #print 'Distances of the refstars per star:\n', dist[0]
Example #59
0
class SPIDER3(BaseSampler):
    """
    SPIDER3 algorithm implementation for selective preprocessing of multi-class imbalanced data sets.

    Reference:
    Wojciechowski, S., Wilk, S., Stefanowski, J.: An Algorithm for Selective Preprocessing
    of Multi-class Imbalanced Data. Proceedings of the 10th International Conference
    on Computer Recognition Systems CORES 2017
    """
    def __init__(self, k, maj_int_min=None, cost=None):
        """
        :param k:
            Number of nearest neighbors considered while resampling.
        :param maj_int_min:
            Dict that contains lists of majority, intermediate and minority classes labels.
        :param cost:
            The cost matrix. An element c[i, j] of this matrix represents the cost associated with
            misclassifying an example from class i as one from class j.
        """

        super().__init__()
        self._sampling_type = 'clean-sampling'
        self.k = k
        self.neigh_clf = NearestNeighbors(n_neighbors=self.k)
        self.maj_int_min = maj_int_min
        self.cost = cost
        self.AS, self.RS = np.array([]), np.array([])

    def _fit_resample(self, X, y):
        """
        Performs resampling

        :param X:
            Numpy array of examples that is the subject of resampling.
        :param y:
            Numpy array of labels corresponding to examples from X.
        :return:
            Resampled X along with accordingly modified labels, resampled y
        """
        self._initialize_algorithm(X, y)

        self.DS = np.append(X, y.reshape(y.shape[0], 1), axis=1)
        self._restart_perspective()
        self._calculate_weak_majority_examples()
        self._restore_perspective()
        self.DS = setdiff(self.DS, self.RS)
        int_classes, min_classes = self._sort_by_cardinality(y)

        for int_min_class in int_classes + min_classes:
            self.relabel(int_min_class)
            self.clean(int_min_class)
            self.amplify(int_min_class)

        self.DS = union(self.DS, self.AS)

        return self.DS[:, :-1], self.DS[:, -1]

    def _initialize_algorithm(self, X, y):
        if self.maj_int_min is None:
            self.maj_int_min = construct_maj_int_min(y)
        self.majority_classes = self.maj_int_min['maj']
        self.intermediate_classes = self.maj_int_min['int']
        self.minority_classes = self.maj_int_min['min']

        self.stds, self.means = [1] * X.shape[1], [0] * X.shape[1]
        if self.cost is None:
            self.cost = self._estimate_cost_matrix(y)

    @staticmethod
    def _estimate_cost_matrix(y):
        """
        Method that estimates the cost matrix automatically. For example, given imbalance ratios of 1:2:6, the estimated
        matrix will be:
        [0 1 1
         2 0 1
         6 3 0]
        :param y:
            labels
        :return:
            cost matrix
        """
        class_cardinality = Counter(y)
        classes = list(class_cardinality.keys())
        cost = np.ones([len(classes), len(classes)])
        for i, (c1, card1) in enumerate(class_cardinality.items()):
            for j, (c2, card2) in enumerate(class_cardinality.items()):
                if j > i:
                    cost[i, j] = 1
                else:
                    cost[i, j] = card1 / card2
        np.fill_diagonal(cost, 0)
        return cost

    def _sort_by_cardinality(self, y):
        class_cardinality = Counter(y)
        # to ensure looping over classes with decreasing cardinality.
        int_classes = sorted(self.intermediate_classes,
                             key=lambda clazz: -class_cardinality[clazz])
        min_classes = sorted(self.minority_classes,
                             key=lambda clazz: -class_cardinality[clazz])
        return int_classes, min_classes

    def amplify(self, int_min_class):
        self._restart_perspective()
        int_min_ds = self.DS[self.DS[:, -1] == int_min_class]
        for x in int_min_ds:
            self._amplify_nn(x)
        self._restore_perspective()

    def clean(self, int_min_class):
        self._restart_perspective()
        int_min_ds = self.DS[self.DS[:, -1] == int_min_class]
        int_min_as = self._calc_int_min_as(int_min_class)
        for x in union(int_min_ds, int_min_as):
            self._clean_nn(x)
        self._restore_perspective()

    def relabel(self, int_min_class):
        self._restart_perspective()
        int_min_ds = self.DS[self.DS[:, -1] == int_min_class]
        for x in int_min_ds:
            self._relabel_nn(x)
        self._restore_perspective()

    def _restart_perspective(self):
        """
        Performs normalization over resampled dataset.
        """
        for col in range(self._ds_as_rs_union().shape[1] - 1):
            self.stds[col] = self._ds_as_rs_union()[:, col].std()
            self.means[col] = self._ds_as_rs_union()[:, col].mean()

        for col in range(self._ds_as_rs_union().shape[1] - 1):
            if self.stds[col] == 0:
                self.stds[col] = 1e-6

        for dataset in [self.DS, self.RS, self.AS]:
            if dataset.shape[0] > 0:
                self._normalize(dataset)

    def _restore_perspective(self):
        """
        Denormalizes for further processing.
        """
        for dataset in [self.DS, self.RS, self.AS]:
            if dataset.shape[0] > 0:
                self._denormalize(dataset)

    def _normalize(self, dataset):
        for col in range(dataset.shape[1] - 1):
            dataset[:, col] = (dataset[:, col] -
                               self.means[col]) / (4 * self.stds[col])

    def _denormalize(self, dataset):
        for col in range(dataset.shape[1] - 1):
            dataset[:,
                    col] = dataset[:,
                                   col] * self.stds[col] * 4 + self.means[col]

    def _calc_int_min_as(self, int_min_class):
        """
        Helper method to calculate examples from AS that belong to the int_min_class parameter class.
        :param int_min_class:
            The class name (intermediate or minority).
        :return:
            Examples from AS that belong to int_min_class.
        """

        if self.AS.size != 0:
            int_min_as = self.AS[self.AS[:, -1] == int_min_class]
        else:
            int_min_as = np.array([])
        return int_min_as

    def _calculate_weak_majority_examples(self):
        """
        Calculates weak majority examples and appends them to the RS set.
        """

        for majority_class in self.majority_classes:
            majority_examples = self.DS[self.DS[:, -1] == majority_class]
            for x in majority_examples:
                if majority_class not in self._min_cost_classes(x, self.DS):
                    self.RS = union(self.RS, np.array([x]))

    def _min_cost_classes(self, x, DS):
        """
        Utility function that aims to identify minimum-cost classes, i.e. classes leading
        to the minimum cost after being (mis)classified as classes appearing in the neighborhood of x.

        :param x:
            Single observation
        :param DS:
            DS
        :return:
            List of classes associated with minimal cost of misclassification.
        """

        C = self.minority_classes + self.intermediate_classes + self.majority_classes
        vals = []
        kneighbors = self._knn(x, DS)

        for cj in C:
            s = 0
            for ci in C:
                s += (
                    (kneighbors[:, -1] == ci).astype(int).sum() /
                    self.k) * self.cost[C.index(ci), C.index(cj)]
            vals.append(s)
        C = np.array(C)
        vals = np.array(vals)
        vals = np.round(vals, 6)
        return C[vals == vals[np.argmin(vals)]]

    def _relabel_nn(self, x):
        """
        Performs relabeling in the nearest neighborhood of x.

        :param x:
            An observation.
        """
        nearest_neighbors = self._knn(x, self._ds_as_rs_union())
        for neighbor in nearest_neighbors:
            if contains(self.RS, neighbor) and self._class_of(
                    neighbor) in self.majority_classes and self._class_of(
                        neighbor) in self._min_cost_classes(
                            x, self._ds_as_rs_union()):
                self.RS = setdiff(self.RS, np.array([neighbor]))
                neighbor[-1] = x[-1]
                self.AS = union(self.AS, np.array([neighbor]))

    def _clean_nn(self, x):
        """
        Performs cleaning in the nearest neighborhood of x.

        :param x:
            Single observation.
        """
        nearest_neighbors = self._knn(x, self._ds_as_rs_union())
        for neighbor in nearest_neighbors:
            if self._class_of(neighbor) in self.majority_classes and \
                    self._class_of(neighbor) in self._min_cost_classes(x, self._ds_as_rs_union()):
                self.DS = setdiff(self.DS, np.array([neighbor]))
                self.RS = setdiff(self.RS, np.array([neighbor]))

    def _knn(self, x, DS):
        """
        Returns the k nearest neighbors of x in DS.

        :param x:
            Single observation
        :param DS:
            DS
        :return:
            The k nearest neighbors of x; points tied at the k-th distance are all included.
        """

        DS = setdiff(DS, np.array([x]))
        if DS.shape[0] < self.k:
            self.neigh_clf = NearestNeighbors(n_neighbors=DS.shape[0])
        else:
            self.neigh_clf = NearestNeighbors(n_neighbors=self.k)

        self.neigh_clf.fit(DS[:, :-1])

        within_radius = self.neigh_clf.radius_neighbors(
            [x[:-1]],
            radius=self.neigh_clf.kneighbors([x[:-1]],
                                             return_distance=True)[0][0][-1] +
            0.0001 * self.neigh_clf.kneighbors([x[:-1]],
                                               return_distance=True)[0][0][-1],
            return_distance=True)

        unique_distances = np.unique(sorted(within_radius[0][0]))
        all_distances = within_radius[0][0]
        all_indices = within_radius[1][0]
        indices = []
        for dist in unique_distances:
            if len(indices) < self.k:
                indices += (all_indices[all_distances == dist]).tolist()

        return DS[indices]

    def _amplify_nn(self, x):
        """
        Artificially amplifies example x by adding a copy of it to the AS.

        :param x:
            Single observation.
        """

        while self._class_of(x) not in self._min_cost_classes(
                x, self._ds_as_rs_union()):
            y = x.copy()
            self.AS = union(self.AS, np.asarray([y]))

    @staticmethod
    def _class_of(example):
        return example[-1]

    def _ds_as_rs_union(self):
        return union(self.DS, union(self.AS, self.RS))
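# _knn above turns a k-nearest-neighbors query into a radius query, using the
# distance to the k-th neighbor padded by 0.01% as the radius, so that points
# tied at that distance are kept or dropped together. A reduced, self-contained
# sketch of that trick; the toy data (with a deliberate tie) is made up.
import numpy as np
from sklearn.neighbors import NearestNeighbors

X = np.array([[0.0], [1.0], [1.0], [2.0], [5.0]])   # tie at distance 1
query, k = [[0.0]], 2

clf = NearestNeighbors(n_neighbors=k).fit(X)
kth_dist = clf.kneighbors(query, return_distance=True)[0][0][-1]
dist, idx = clf.radius_neighbors(query, radius=kth_dist * 1.0001,
                                 return_distance=True)

# Keep whole distance "shells" until at least k indices are collected, so tied
# points are either all kept or all dropped (mirroring _knn above).
indices = []
for d in np.unique(dist[0]):
    if len(indices) < k:
        indices += idx[0][dist[0] == d].tolist()
print(sorted(indices))  # [0, 1, 2]: the query itself plus both tied neighbors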