Beispiel #1
0
    def run_single_trial(self, train_pairs, test_pairs, train_tune_data, test_tune_data):
        """Run one train/test trial of metric-learned section matching.

        The pairs are projected with ``self.fit_pca``; a matrix ``M`` is
        obtained through the ``save_cvx_params`` / ``run_cvx`` /
        ``load_cvx_result`` round trip and used as ``VI`` of a Mahalanobis
        metric. BallTrees built with that metric are then queried to count
        how often section ``i`` retrieves its own partner ``i``.

        Parameters
        ----------
        train_pairs, test_pairs
            Sequences of section pairs; after the PCA projection each
            element is read as ``pair[0]`` (a) and ``pair[1]`` (b).
        train_tune_data
            Accepted but not used by this method.
        test_tune_data
            Indexed by test pair position; entry ``i`` is written to the
            ``successful_tunes_<id>`` file when pair ``i`` is ranked first.

        Returns
        -------
        tuple
            ``([[train_correct, n_train], [test_correct, n_test]], test_indices)``
            where ``test_indices`` lists the rank at which each test pair's
            partner was retrieved (0 means nearest neighbour).
        """
        # print(...) with a single argument behaves the same on Python 2 and 3.
        print("Running PCA...")
        train_pairs_pca, test_pairs_pca = self.fit_pca(train_pairs, test_pairs)
        ys = ys_from_pairs(train_pairs_pca)

        # Random digit string naming the CVX exchange files and the output file.
        file_id = str(random.random())[2:]

        save_cvx_params(ys, file_id)
        run_cvx(file_id)
        M = load_cvx_result(file_id)
        dist = DistanceMetric.get_metric('mahalanobis', VI=M)

        train_a_sections = [pair[0] for pair in train_pairs_pca]
        train_b_sections = [pair[1] for pair in train_pairs_pca]
        test_a_sections = [pair[0] for pair in test_pairs_pca]
        test_b_sections = [pair[1] for pair in test_pairs_pca]

        # Default: a-sections are the queries and b-sections are indexed;
        # the roles are swapped when self.match_a_to_b is set.
        if self.match_a_to_b:
            train_given_sections, train_to_match_sections = train_b_sections, train_a_sections
            test_given_sections, test_to_match_sections = test_b_sections, test_a_sections
        else:
            train_given_sections, train_to_match_sections = train_a_sections, train_b_sections
            test_given_sections, test_to_match_sections = test_a_sections, test_b_sections

        n_train = len(train_given_sections)
        n_test = len(test_given_sections)

        print("Constructing BallTrees...")
        train_bt = BallTree(train_to_match_sections, metric=dist)
        test_bt = BallTree(test_to_match_sections, metric=dist)

        train_top_fraction = int(n_train * self.correct_within_top_fraction)
        test_top_fraction = int(n_test * self.correct_within_top_fraction)

        print("Querying the BallTrees...")
        train_result = train_bt.query(train_given_sections, train_top_fraction)
        test_result = test_bt.query(test_given_sections, test_top_fraction)

        print("Looking at correctness of results...")
        # A query counts as correct when its own index is among the returned neighbours.
        train_correct = sum(int(i in train_result[1][i]) for i in range(n_train))
        test_correct = sum(int(i in test_result[1][i]) for i in range(n_test))

        print("Finding indices of correct matches...")
        test_result_full = test_bt.query(test_given_sections, n_test)

        def default_index(lst, i):
            # Position of i in lst, or -1 when it is absent.
            try:
                return lst.index(i)
            except ValueError:
                return -1

        ranks = [default_index(list(test_result_full[1][i]), i) for i in range(n_test)]

        # Use the unfiltered ranks here so position i always refers to the
        # same pair as test_tune_data[i].
        with open("successful_tunes_{}".format(file_id), 'w') as successful_tunes_f:
            for i, rank in enumerate(ranks):
                if rank == 0:
                    successful_tunes_f.write(str(test_tune_data[i]) + '\n\n')

        test_indices = [rank for rank in ranks if rank != -1]

        return [[train_correct, n_train],
            [test_correct, n_test]], test_indices
Beispiel #2
0
def _compute_nearest(xhs, rr, use_balltree=True, return_dists=False):
    """Return, for each row of ``rr``, the index of the closest row of ``xhs``.

    Parameters
    ----------
    xhs : array, shape=(n_samples, n_dim)
        Reference points that are searched.
    rr : array, shape=(n_query, n_dim)
        Query points.
    use_balltree : bool
        Try the scikit-learn BallTree search. When scikit-learn cannot be
        imported an info message is logged and the row-by-row ``cdist``
        search is used instead.
    return_dists : bool
        Also return the distance to each nearest neighbour.

    Returns
    -------
    nearest : array, shape=(n_query,)
        Index into ``xhs`` of the nearest neighbour of each row of ``rr``.
    distances : array, shape=(n_query,)
        Matching distances; only returned when ``return_dists`` is True.
    """
    if use_balltree:
        try:
            from sklearn.neighbors import BallTree
        except ImportError:
            logger.info('Nearest-neighbor searches will be significantly '
                        'faster if scikit-learn is installed.')
            use_balltree = False

    # Nothing to search or nothing to look up: return empty results.
    if xhs.size == 0 or rr.size == 0:
        no_indices = np.array([], int)
        if not return_dists:
            return no_indices
        return no_indices, np.array([])

    if use_balltree is True:
        tree = BallTree(xhs)
        if not return_dists:
            return tree.query(rr, k=1, return_distance=False)[:, 0]
        tree_dists, tree_idx = tree.query(rr, k=1, return_distance=True)
        return tree_idx[:, 0], tree_dists[:, 0]

    # Brute force, one query row at a time.
    from scipy.spatial.distance import cdist
    if not return_dists:
        return np.array([np.argmin(cdist(row[np.newaxis, :], xhs))
                         for row in rr])

    best_indices = []
    best_dists = []
    for row in rr:
        row_dists = cdist(row[np.newaxis, :], xhs)
        best = np.argmin(row_dists)
        best_indices.append(best)
        best_dists.append(row_dists[0, best])
    return (np.array(best_indices), np.array(best_dists))
Beispiel #3
0
    def knn(a, b):
        """Return the k rows of ``b`` nearest to ``a``.

        The last element of ``a`` and of every row of ``b`` is dropped before
        the search. ``k`` is read from the enclosing scope.

        Returns a list of ``k`` rows taken from the trimmed ``b``.
        """
        b = np.array([bb[:-1] for bb in b])
        tree = BallTree(b)
        # BallTree.query expects a 2-D batch of query points, so wrap the
        # single point as one row.
        query_point = np.asarray(a[:-1]).reshape(1, -1)
        __, indx = tree.query(query_point, k)

        # indx has shape (1, k); flatten it so each item is one neighbour index.
        return [b[i] for i in np.ravel(indx)]
Beispiel #4
0
def compute_labels(X, C):
    """Compute the cluster labels for dataset X given centers C.

    Each row of ``X`` is labelled with the index of its nearest row of ``C``.

    Returns
    -------
    labels : array, shape=(n_samples,)
        Index into ``C`` for every row of ``X``.
    """
    # A BallTree is used instead of
    # np.argmin(pairwise_distances(C, X), axis=0), which requires too much
    # memory for large X.
    tree = BallTree(C)
    # Take column 0 of the (n_samples, 1) result. squeeze() would collapse a
    # single-sample query to a 0-d array.
    labels = tree.query(X, k=1, return_distance=False)[:, 0]
    return labels
Beispiel #5
0
def _hdbscan_prims_balltree(X, min_samples=5, alpha=1.0,
                            metric='minkowski', p=2, leaf_size=40, gen_min_span_tree=False):
    """Build a single-linkage tree from BallTree core distances.

    Parameters
    ----------
    X : array, shape=(n_samples, n_features)
        Input points.
    min_samples : int
        Number of neighbours used for the core distance; clamped to
        ``n_samples - 1``.
    alpha : float
        Passed through to ``mst_linkage_core_cdist``.
    metric : str
        Metric name given to both ``BallTree`` and ``DistanceMetric``.
    p : int or float
        Minkowski exponent. Only used, and only validated, when
        ``metric == 'minkowski'``.
    leaf_size : int
        Leaf size of the BallTree.
    gen_min_span_tree : bool
        Accepted but not used by this function.

    Returns
    -------
    tuple
        ``(single_linkage_tree, None)``.

    Raises
    ------
    TypeError
        If the metric is Minkowski and ``p`` is None.
    ValueError
        If the metric is Minkowski and ``p`` is negative.
    """
    metric_kwargs = {}
    if metric == 'minkowski':
        if p is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if p < 0:
            raise ValueError('Minkowski metric with negative p value is not defined!')
        # Forward p so the tree and the distance metric really use the
        # requested exponent instead of silently using the default.
        metric_kwargs['p'] = p

    n_points = X.shape[0]
    min_samples = min(n_points - 1, min_samples)

    tree = BallTree(X, metric=metric, leaf_size=leaf_size, **metric_kwargs)

    dist_metric = DistanceMetric.get_metric(metric, **metric_kwargs)

    # Last column of the k-neighbour distances: distance to the k-th neighbour.
    core_distances = tree.query(X, k=min_samples,
                                dualtree=True,
                                breadth_first=True)[0][:, -1]
    min_spanning_tree = mst_linkage_core_cdist(X, core_distances, dist_metric, alpha)
    # Order the edges by the value in their third column.
    min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]

    single_linkage_tree = label(min_spanning_tree)

    return single_linkage_tree, None
Beispiel #6
0
def _hdbscan_prims_balltree(X, min_samples=5, alpha=1.0,
                            metric='minkowski', p=2, leaf_size=40, gen_min_span_tree=False):
    """Compute the single-linkage tree using a BallTree for core distances.

    ``X`` holds one sample per row. ``min_samples`` is clamped to
    ``n_samples - 1`` and used as the neighbour count for the core distance.
    ``alpha`` is handed to ``mst_linkage_core_cdist``. ``p`` is the Minkowski
    exponent and only matters when ``metric == 'minkowski'``.
    ``gen_min_span_tree`` is accepted but not used here.

    Returns ``(single_linkage_tree, None)``.

    Raises ``TypeError`` when the metric is Minkowski and ``p`` is None, and
    ``ValueError`` when ``p`` is negative.
    """
    extra_metric_args = {}
    if metric == 'minkowski':
        if p is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if p < 0:
            raise ValueError('Minkowski metric with negative p value is not defined!')
        # Without this, p was validated but never reached BallTree or
        # DistanceMetric, so any p other than the default was ignored.
        extra_metric_args['p'] = p

    size = X.shape[0]
    min_samples = min(size - 1, min_samples)

    tree = BallTree(X, metric=metric, leaf_size=leaf_size, **extra_metric_args)

    dist_metric = DistanceMetric.get_metric(metric, **extra_metric_args)

    # Get distance to kth nearest neighbour
    core_distances = tree.query(X, k=min_samples,
                                dualtree=True,
                                breadth_first=True)[0][:, -1]

    # Mutual reachability distance is implicit in mst_linkage_core_cdist
    min_spanning_tree = mst_linkage_core_cdist(X, core_distances, dist_metric, alpha)
    # Sort edges of the min_spanning_tree by weight
    min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]
    # Convert edge list into standard hierarchical clustering format
    single_linkage_tree = label(min_spanning_tree)

    return single_linkage_tree, None
Beispiel #7
0
def DualTree(dataFlux, dDataFlux, modelFlux, modelParams, mcIts):
    """
    Monte Carlo nearest-model fit in colour space.

    Inputs:
            dataFlux = observed fluxes, array of size (#objects,#filters)
            dDataFlux = flux uncertainties, array of size (#objects,#filters)
            modelFlux = fluxes of models, array of size (#models,#filters)
            modelParams = parameters of each model to be recorded, array of size (#models,#parameters)
            mcIts = number of times to perturb fluxes for each object, int

    Output:
            NumPy array built from one 2d array per object; each row of such
            an array holds the Scale() result followed by the parameters of
            the nearest model for one monte carlo iteration
    """
    # Colours are ratios of adjacent filter fluxes.
    modelColors = modelFlux[:, 1:] / modelFlux[:, :-1]
    tree = BallTree(modelColors)
    fitParams = []
    for i, objFlux in enumerate(dataFlux):
        # Perturb the object's fluxes with Gaussian noise scaled by its errors.
        noise = np.random.randn(mcIts, len(objFlux))
        newFlux = objFlux + dDataFlux[i] * noise
        newColors = newFlux[:, 1:] / newFlux[:, :-1]
        # Index of the nearest model for every perturbed realisation.
        nearest = tree.query(newColors, k=1, dualtree=True)[1][:, 0]
        scale = Scale(modelFlux[nearest], newFlux, np.ones(np.shape(newFlux)))
        matched = modelParams[nearest]
        myParams = scale
        for j in range(len(modelParams[0])):
            myParams = np.c_[myParams, matched[:, j]]
        fitParams.append(myParams)
    return np.array(fitParams)
Beispiel #8
0
class BallTreeRecommender(object):
    """
    Given input terms, provide k recipe recommendations.

    The fitted transformer and BallTree are cached on disk in ``svd.pkl`` and
    ``tree.pkl`` (relative to the working directory) and reloaded on
    construction when both files exist.
    """
    def __init__(self, k=3, **kwargs):
        # Extra keyword arguments are accepted but not used.
        self.k = k
        self.trans_path = "svd.pkl"
        self.tree_path = "tree.pkl"
        # False marks "not fitted yet"; fit_transform() tests for it.
        self.transformer = False
        self.tree = None
        self.load()

    def load(self):
        """
        Load a pickled transformer and tree from disk,
        if they exist.

        Both files must be present. If either one is missing the recommender
        is left unfitted instead of failing halfway through loading.
        """
        # NOTE: joblib.load unpickles its input; only load files you trust.
        if os.path.exists(self.trans_path) and os.path.exists(self.tree_path):
            with open(self.trans_path, 'rb') as trans_file:
                self.transformer = joblib.load(trans_file)
            with open(self.tree_path, 'rb') as tree_file:
                self.tree = joblib.load(tree_file)
        else:
            self.transformer = False
            self.tree = None

    def save(self):
        """
        It takes a long time to fit, so just do it once!

        Writes the transformer and the tree to their pickle paths.
        """
        with open(self.trans_path, 'wb') as trans_file:
            joblib.dump(self.transformer, trans_file)
        with open(self.tree_path, 'wb') as tree_file:
            joblib.dump(self.tree, tree_file)

    def fit_transform(self, documents):
        """
        Fit the pipeline and the BallTree on ``documents`` and save them.

        Does nothing when a transformer is already loaded or fitted.
        """
        # Transformer will be False if pipeline hasn't been fit yet,
        # Trigger fit_transform and save the transformer and lexicon.
        if self.transformer is False:
            self.transformer = Pipeline([
                ('norm', TextNormalizer(minimum=50, maximum=200)),
                ('transform', Pipeline([
                    ('tfidf', TfidfVectorizer()),
                    ('svd', TruncatedSVD(n_components=200))
                ])
                 )
            ])
            self.lexicon = self.transformer.fit_transform(documents)
            self.tree = BallTree(self.lexicon)
            self.save()

    def query(self, terms):
        """
        Given input ingredient terms,
        return the k closest matching recipes.

        :param terms: text handed to ``wordpunct_tokenize``
            (NOTE(review): presumably a single string - confirm with callers)
        :return: array of the k document indices nearest to the first
            vectorised token
        """
        # Use transform(), not fit_transform(): refitting on the query would
        # replace the fitted vocabulary and SVD basis, so the query vector
        # would no longer live in the space the tree was built on.
        vect_doc = self.transformer.named_steps['transform'].transform(
            wordpunct_tokenize(terms)
        )
        dists, inds = self.tree.query(vect_doc, k=self.k)
        return inds[0]
def correrPruebaLocal(set_ampliado):
    """Run a local hold-out evaluation of the 4-nearest-neighbour predictor.

    Data comes from ``cargarDatosPruebaLocal(set_ampliado, 0.66)``
    (NOTE(review): 0.66 is presumably the train fraction - confirm). Each test
    sample is classified by ``obtenerPrediccionknnEB`` from its 4 nearest
    training samples. Running counts are printed for every sample and the
    overall accuracy, as a percentage, is printed at the end.
    """
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("corriendo prueba local")

    train, targetTrain, test, targetTest = cargarDatosPruebaLocal(set_ampliado, 0.66)

    tree = BallTree(train, leaf_size=30)
    predictions = []
    correctas = 0
    incorrectas = 0
    for x in range(len(test)):
        # BallTree.query expects a 2-D batch, so wrap the single sample.
        _, ind = tree.query([test[x]], k=4)
        resultado = obtenerPrediccionknnEB(train, targetTrain, test[x], ind.ravel())
        predictions.append(resultado)
        print(progreso(x, len(test)))
        if resultado == targetTest[x]:
            correctas += 1
        else:
            incorrectas += 1
        print("Predicciones -->  Correctas: " + str(correctas) + "Incorrectas: " + str(incorrectas) + "Total: " + str(len(test)))
        print('> predicted=' + repr(resultado) + ', actual=' + repr(targetTest[x]) + ' ' + progreso(x, len(test)))
    print("precision total")
    # correctas already counts the matching predictions, so no second pass
    # over the test set is needed.
    print((float(correctas) / float(len(test))) * 100.0)
Beispiel #10
0
def DualTree(dataFlux,dDataFlux,modelFlux,modelParams,mcIts,columnsToScale=()):
    '''
    Monte Carlo nearest-model fit in colour space.

    Inputs:
            dataFlux = observed fluxes, array of size (#objects,#filters)
            dDataFlux = flux uncertainties, array of size (#objects,#filters)
            modelFlux = fluxes of models, array of size (#models,#filters)
            modelParams = parameters of each model to be recorded, array of size (#models,#parameters)
            mcIts = number of times to perturb fluxes for each object, int
            columnsToScale = collection of column indices in modelParams of parameters that need to be multiplied by scale factor

    Output:
            NumPy array built from one 2d array per object; each row of such
            an array holds the scale factor followed by the (optionally
            scaled) parameters of the nearest model for one monte carlo
            iteration
    '''
    # Colours are ratios of adjacent filter fluxes.
    modelColors = modelFlux[:,1:] / modelFlux[:,:-1]
    tree = BallTree(modelColors)
    fitParams = []
    for i in range(len(dataFlux)):
        newFlux = dataFlux[i] + dDataFlux[i] * np.random.randn(mcIts,len(dataFlux[i]))
        newColors = newFlux[:,1:] / newFlux[:,:-1]
        query = tree.query(newColors,k=1,dualtree=True)
        # Nearest-model index per realisation, and the matching parameter
        # rows, gathered once rather than once per parameter column.
        nearest = query[1][:,0]
        matched = modelParams[nearest]
        s = fit_tools.Scale(modelFlux[nearest],newFlux,np.ones(np.shape(newFlux)))
        myParams = s
        for j in range(len(modelParams[0])):
            column = matched[:,j]
            if j in columnsToScale:
                column = np.multiply(s,column)
            myParams = np.c_[myParams,column]
        fitParams.append(myParams)
    return np.array(fitParams)
def test_barnes_hut_angle():
    # When Barnes-Hut's angle=0 this corresponds to the exact method.
    # The test checks that the neighbour-based probabilities and the
    # Barnes-Hut gradient agree with the exact ones to 5 decimals.
    angle = 0.0
    perplexity = 10
    n_samples = 100
    for n_components in [2, 3]:
        n_features = 5
        degrees_of_freedom = float(n_components - 1.0)

        # Fixed seed; the order of the two randn() draws below determines
        # the generated data.
        random_state = check_random_state(0)
        distances = random_state.randn(n_samples, n_features)
        distances = distances.astype(np.float32)
        # Turn the samples into an (n_samples, n_samples) matrix of inner
        # products and zero its diagonal.
        distances = distances.dot(distances.T)
        np.fill_diagonal(distances, 0.0)
        params = random_state.randn(n_samples, n_components)
        P = _joint_probabilities(distances, perplexity, False)
        # Exact gradient (the kl value itself is not checked).
        kl, gradex = _kl_divergence(params, P, degrees_of_freedom, n_samples,
                                    n_components)

        # Ask for all n_samples neighbours of every row, then drop the first
        # column (presumably each row's match with itself).
        k = n_samples - 1
        bt = BallTree(distances)
        distances_nn, neighbors_nn = bt.query(distances, k=k + 1)
        neighbors_nn = neighbors_nn[:, 1:]
        Pbh = _joint_probabilities_nn(distances, neighbors_nn,
                                      perplexity, False)
        # Barnes-Hut gradient with angle=0.
        kl, gradbh = _kl_divergence_bh(params, Pbh, neighbors_nn,
                                       degrees_of_freedom, n_samples,
                                       n_components, angle=angle,
                                       skip_num_points=0, verbose=False)
        assert_array_almost_equal(Pbh, P, decimal=5)
        assert_array_almost_equal(gradex, gradbh, decimal=5)
Beispiel #12
0
def md_nearest_from_centroids(seeding, centroids):
    """Mean distance from each centroid to its nearest seeding point."""
    tree = BallTree(seeding)
    nearest_dist, _ = tree.query(centroids)
    # Accumulate the single nearest distance of every centroid.
    total = 0
    for row in nearest_dist:
        total += row[0]
    return total / len(centroids)
Beispiel #13
0
class BallTreeANN:
    """Thin wrapper around a Euclidean BallTree with pickle persistence."""

    def __init__(self):
        """
        Constructor
        """
        # Holds the BallTree once built or loaded.
        self.nbrs = None

    def build_index(self, dataset, leaf_size):
        """Build the BallTree over ``dataset`` and return it."""
        self.nbrs = BallTree(dataset, leaf_size=leaf_size, metric="euclidean")
        return self.nbrs

    def build_store_index(self, dataset, path, leaf_size):
        """Build the index and immediately pickle it to ``path``."""
        self.build_index(dataset, leaf_size)
        self.store_index(path)

    def store_index(self, path):
        """Pickle the current index to ``path``."""
        with open(path, "wb") as output1:
            pickle.dump(self.nbrs, output1, pickle.HIGHEST_PROTOCOL)

    def load_index(self, path):
        """Replace the current index with the object pickled at ``path``."""
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open(path, "rb") as input1:
            self.nbrs = pickle.load(input1)

    def search_in_radious(self, vector, radious=2):
        """Return ``(distances, indices)`` of all points within ``radious``."""
        # query_radius returns (indices, distances), the reverse of query(),
        # so unpack in that order and hand back (distances, indices).
        indices, distances = self.nbrs.query_radius(vector, r=radious, return_distance=True)
        return distances, indices

    def search_neighbors(self, vector, num_neighbors):
        """Return ``(distances, indices)`` of the ``num_neighbors`` nearest points."""
        distances, indices = self.nbrs.query(vector, k=num_neighbors)
        return distances, indices
def _rsl_prims_balltree(X, cut, k=5, alpha=1.4142135623730951, gamma=5, metric='minkowski', p=2):
    """Cluster ``X`` by cutting a single-linkage tree built from BallTree core distances.

    Parameters
    ----------
    X : array, shape=(n_samples, n_features)
        Input points.
    cut, gamma
        Passed to ``SingleLinkageTree.get_clusters``.
    k : int
        Neighbour count for the core distance; clamped to ``n_samples - 1``.
    alpha : float
        Accepted but not used by this function.
    metric : str
        Metric name given to ``BallTree`` and ``DistanceMetric``.
    p : int or float
        Minkowski exponent; only used when ``metric == 'minkowski'``.

    Returns
    -------
    tuple
        ``(labels, single_linkage_tree)``.

    Raises
    ------
    TypeError
        If the metric is Minkowski and ``p`` is None.
    ValueError
        If the metric is Minkowski and ``p`` is negative.
    """
    metric_kwargs = {}
    if metric == 'minkowski':
        if p is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if p < 0:
            raise ValueError('Minkowski metric with negative p value is not defined!')
        # Forward p; it used to be validated and then dropped, so the
        # default exponent was always used.
        metric_kwargs['p'] = p

    n_points = X.shape[0]
    k = min(n_points - 1, k)

    tree = BallTree(X, metric=metric, **metric_kwargs)

    dist_metric = DistanceMetric.get_metric(metric, **metric_kwargs)

    # Distance to the k-th returned neighbour of every point.
    core_distances = tree.query(X, k=k)[0][:,-1]
    min_spanning_tree = mst_linkage_core_cdist(X, core_distances, dist_metric)

    single_linkage_tree = label(min_spanning_tree)
    single_linkage_tree = SingleLinkageTree(single_linkage_tree)

    labels = single_linkage_tree.get_clusters(cut, gamma)

    return labels, single_linkage_tree
Beispiel #15
0
def _hdbscan_prims_balltree(X, min_samples=5, alpha=1.0,
                            metric='minkowski', p=2, leaf_size=40, gen_min_span_tree=False, **kwargs):
    """Build a single-linkage tree from BallTree core distances.

    ``X`` holds one sample per row and is copied to a C-contiguous double
    array when it is not already C-contiguous. ``min_samples`` is clamped to
    ``n_samples - 1``. ``alpha`` is passed to ``mst_linkage_core_vector``.
    ``p`` is the Minkowski exponent and only matters when
    ``metric == 'minkowski'``. ``gen_min_span_tree`` is accepted but not used
    here. Extra keyword arguments are forwarded to both ``BallTree`` and
    ``DistanceMetric.get_metric``.

    Returns ``(single_linkage_tree, None)``.

    Raises ``TypeError`` when the metric is Minkowski and ``p`` is None, and
    ``ValueError`` when ``p`` is negative.
    """
    metric_kwargs = dict(kwargs)
    if metric == 'minkowski':
        if p is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if p < 0:
            raise ValueError('Minkowski metric with negative p value is not defined!')
        # p is a named parameter, so it never appears in **kwargs; add it
        # explicitly or the tree and metric would ignore it.
        metric_kwargs['p'] = p

    # The Cython routines used require contiguous arrays
    if not X.flags['C_CONTIGUOUS']:
        X = np.array(X, dtype=np.double, order='C')

    size = X.shape[0]
    min_samples = min(size - 1, min_samples)

    tree = BallTree(X, metric=metric, leaf_size=leaf_size, **metric_kwargs)

    dist_metric = DistanceMetric.get_metric(metric, **metric_kwargs)

    # Get distance to kth nearest neighbour
    core_distances = tree.query(X, k=min_samples,
                                dualtree=True,
                                breadth_first=True)[0][:, -1].copy(order='C')

    # Mutual reachability distance is implicit in mst_linkage_core_vector
    min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric, alpha)
    # Sort edges of the min_spanning_tree by weight
    min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]
    # Convert edge list into standard hierarchical clustering format
    single_linkage_tree = label(min_spanning_tree)

    return single_linkage_tree, None
Beispiel #16
0
class ColorSchemeTransformer(object):
    """Maps every pixel of an image to the index of its nearest palette colour."""

    def __init__(self, color_palette):
        # Tree over the palette rows, used for nearest-colour lookups.
        self.color_mapper = BallTree(color_palette)

    def nearest_neighbors(self, image):
        """Return an (height, width) array of nearest palette indices."""
        height = image.shape[0]
        width = image.shape[1]
        # One row per pixel, three values per row.
        pixels = image.reshape((height * width, 3))
        lookup = self.color_mapper.query(pixels)
        palette_idx = lookup[1]
        return palette_idx.reshape((height, width))
Beispiel #17
0
class HammingBallTree(HammingSearchBase):
    """Nearest-neighbour search under the Hamming metric, backed by a BallTree."""

    def __init__(self, leaf_size=40, query_kwds=None):
        self.leaf_size = leaf_size
        # NOTE(review): query_kwds is stored but query() below never reads
        # it - confirm whether it should be forwarded to BallTree.query.
        self.query_kwds = query_kwds or {}

    def fit(self, X):
        """Build the tree over ``X`` and return ``self``."""
        X = self._validate_input(X, return_compact=False)
        self._tree = BallTree(X, metric='hamming', leaf_size=self.leaf_size)
        return self

    def query(self, X, k, return_dist=False):
        """Return the ``k`` nearest indices for each row of ``X``.

        With ``return_dist=True`` the result is ``(indices, distances)``,
        where each distance is the tree's distance multiplied by the number
        of columns of ``X`` and converted to an integer.
        """
        X = self._validate_input(X, return_compact=False)

        if return_dist:
            dist, ind = self._tree.query(X, k, return_distance=True)
            # Round before casting: the float product can land just below the
            # intended integer (e.g. 0.29 * 100 == 28.999999999999996), and a
            # bare astype(int) would truncate it to one less.
            return ind, (dist * X.shape[1]).round().astype(int)
        else:
            return self._tree.query(X, k, return_distance=False)
Beispiel #18
0
def rmsd_nearest_from_centroids(seeding, centroids):
    """Root mean squared distance from each centroid to its closest seeding point."""
    tree = BallTree(seeding)
    nearest_dist, _ = tree.query(centroids)

    # Square the single nearest distance of every centroid, then average.
    squared = [row[0] ** 2 for row in nearest_dist]
    mean_square = sum(squared) / len(centroids)
    return mean_square ** 0.5
Beispiel #19
0
    def predict(self, X):
        """Return, for each row of ``X``, the label of its nearest cluster centre.

        The result is a list with one entry of ``self.labels_`` per query row.
        """
        center_tree = BallTree()
        center_tree.fit(self.cluster_centers_)

        _, nearest = center_tree.query(X)
        # Every row of `nearest` holds exactly one index; unpack it directly.
        return [self.labels_[center] for center, in nearest]
Beispiel #20
0
def md_weighted_nearest_from_centroids(seeding, centroids, weights):
    """Weighted mean distance from each centroid to its nearest seeding point.

    ``weights`` must have one entry per centroid.
    """
    assert len(centroids) == len(weights)

    sum_weight = sum(weights)

    tree = BallTree(seeding)
    nearest_dist, _ = tree.query(centroids)
    # Weight each centroid's nearest distance and accumulate.
    weighted_total = 0
    for row, weight in zip(nearest_dist, weights):
        weighted_total += row[0] * weight
    return weighted_total / sum_weight
Beispiel #21
0
def get_centroid_weights(X, centroids):
    """Count how many rows of ``X`` have each centroid as their nearest one.

    Both arguments must be NumPy arrays.

    Returns
    -------
    list of int
        One count per centroid, in centroid order.
    """
    assert isinstance(X, np.ndarray)
    assert isinstance(centroids, np.ndarray)

    ball_tree = BallTree(centroids)
    _, indexes = ball_tree.query(X)
    # indexes has shape (n_samples, 1). Take the column as plain integers;
    # indexing a Python list with the 1-element row arrays is rejected by
    # current NumPy. bincount then does the tally in one pass.
    counts = np.bincount(indexes[:, 0], minlength=len(centroids))
    return counts.tolist()
def get_graph_topo(halos):
    """Look up the 70 nearest halos of every halo in Cartesian coordinates.

    The inner loop body is empty, so no connections are recorded and the
    function returns None.
    """
    n_connections = 70

    cartesian = cosmology.spherical_to_cartesian_with_redshift(
        halos['ra'], halos['dec'], halos['z'])
    x, y, z = cartesian
    box_coords = np.concatenate([x, y, z], axis=1)
    halo_tree = BallTree(box_coords, leaf_size=5)
    list_conn = []
    for ih, _halo in enumerate(halos):
        bt_dx, bt_id = halo_tree.query(box_coords[ih, :], k=n_connections)

        # Walk the neighbouring halos; nothing is done with them yet.
        for _neighbour in halos[bt_id]:
            pass
def correrPruebaParaKaggle(set_ampliado):
    """Predict every test sample and save the predictions.

    Data comes from ``cargarDatosParaKaggle(set_ampliado)``. Each test sample
    is classified by ``obtenerPrediccionknnEB`` from its 4 nearest training
    samples, and the list of predictions is handed to
    ``guardarPrediccionesParaKaggle``.
    """
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("corriendo pruebas para kaggle")

    train, targetTrain, test = cargarDatosParaKaggle(set_ampliado)

    tree = BallTree(train, leaf_size=30)
    predictions = []
    for x in range(len(test)):
        # BallTree.query expects a 2-D batch, so wrap the single sample.
        _, ind = tree.query([test[x]], k=4)
        resultado = obtenerPrediccionknnEB(train, targetTrain, test[x], ind.ravel())
        predictions.append(resultado)
        print(progreso(x, len(test)))

    guardarPrediccionesParaKaggle(predictions)
Beispiel #24
0
def build_knn_matrix(data_matrix):
    """Build the mutual nearest-neighbour table of the voxels.

    The first three columns of ``data_matrix`` are used as coordinates.
    ``voxel_num`` and ``K_NN`` are read from module scope. For each of the
    first ``voxel_num`` rows, the ``K_NN - 1`` neighbours following the first
    query result are stored; an entry is then replaced by -1 when that
    neighbour does not list the voxel among its own neighbours.

    Returns
    -------
    array, shape=(voxel_num, K_NN - 1)
        Float array of neighbour indices, with -1 for non-mutual entries.
    """
    neighbours_matrix = np.zeros((voxel_num, K_NN - 1))
    coords = data_matrix[:, 0:3]
    tree = BallTree(coords)
    if voxel_num:
        # One batched 2-D query instead of a 1-D query per voxel (BallTree
        # expects a 2-D array, and batching avoids per-call overhead).
        _, ind = tree.query(coords[:voxel_num], k=K_NN)
        # Column 0 is skipped, as in the per-voxel version (presumably the
        # voxel itself).
        neighbours_matrix[:, :] = ind[:, 1:]

    for cur_voxel in range(voxel_num):
        neighbours = neighbours_matrix[cur_voxel, :]
        for pos in range(len(neighbours)):
            neighbour = int(neighbours[pos])
            # Keep the link only when it is reciprocated.
            if cur_voxel not in neighbours_matrix[neighbour, :]:
                neighbours_matrix[cur_voxel, pos] = -1

    return neighbours_matrix
def get_evidence_grid(points, res_pts, intr_prms, exact=False):
    """
    Associate the "z-axis" value (evidence, overlap, etc...) res_pts with its
    corresponding point in the template bank (points). If exact is True, then
    the point must exactly match the point in the bank.

    For every entry of res_pts the nearest row of the tree is looked up and
    the rows of points at those indices are returned, in res_pts order.

    NOTE(review): intr_prms and exact are not used in the body shown here,
    so the exact-match behaviour described above is not implemented.
    """
    # NOTE(review): `selected` is not defined in this function; it is
    # presumably a module-level array or a subset of `points` - confirm.
    grid_tree = BallTree(selected)
    grid_idx = []
    # Reorder the grid points to match their weight indices
    for res in res_pts:
        # Nearest single neighbour of this result point.
        dist, idx = grid_tree.query(res, k=1)
        # Stupid floating point inexactitude...
        #print res, selected[idx[0][0]]
        #assert numpy.allclose(res, selected[idx[0][0]])
        grid_idx.append(idx[0][0])
    return points[grid_idx]
Beispiel #26
0
def calc_vert_vals(verts, pts, vals, method='max', k_points=100):
    """Assign a value to each vertex from the values of its nearest points.

    Parameters
    ----------
    verts : array, shape=(n_verts, n_dim)
        Vertices to evaluate.
    pts : array, shape=(n_pts, n_dim)
        Points carrying values.
    vals : array, shape=(n_pts,)
        Value of each point.
    method : {'max', 'dist'}
        'max' picks, per vertex, the neighbouring value with the largest
        absolute value; 'dist' averages the neighbouring values with
        inverse-squared-distance weights.
    k_points : int
        Number of nearest points considered per vertex.

    Returns
    -------
    array, shape=(n_verts,)
        One value per vertex.

    Raises
    ------
    ValueError
        If ``method`` is neither 'max' nor 'dist'.
    """
    # Fail early on an unknown method; it used to surface only at the end as
    # an UnboundLocalError.
    if method not in ('dist', 'max'):
        raise ValueError("method must be 'dist' or 'max', got {!r}".format(method))

    ball_tree = BallTree(pts)
    dists, pts_inds = ball_tree.query(verts, k=k_points, return_distance=True)
    near_vals = vals[pts_inds]
    # Fraction of points that are a neighbour of at least one vertex.
    cover = len(np.unique(pts_inds.ravel()))/float(len(pts))
    print('{}% of the points are covered'.format(cover*100))
    if method == 'dist':
        # Clamp zero distances (vertex coincides with a point) so the weights
        # stay finite instead of turning the result into NaN.
        safe_dists = np.maximum(dists, np.finfo(float).eps)
        n_dists = 1/(safe_dists**2)
        norm = 1/np.sum(n_dists, 1)
        norm = np.reshape(norm, (len(norm), 1))
        n_dists = norm * n_dists
        verts_vals = np.sum(near_vals * n_dists, 1)
    else:
        rows = np.arange(near_vals.shape[0])
        verts_vals = near_vals[rows, np.argmax(abs(near_vals), 1)]
    return verts_vals
Beispiel #27
0
def voronoid_filling(seeding, centroids, weights):
    """Fraction of total centroid weight not reached by any seeding point.

    Each seeding point is assigned to its nearest centroid; a centroid's
    weight is removed from the total the first time it is hit. The remainder
    divided by the total weight is returned.
    """
    assert len(centroids) == len(weights)

    centroid_tree = BallTree(centroids)
    _, indexes = centroid_tree.query(seeding)

    sum_weights = sum(weights)
    badness = sum_weights
    seen = set()
    for idx, in indexes:
        if idx not in seen:
            seen.add(idx)
            badness -= weights[idx]

    return badness / sum_weights
def cvt_bins(rkpc, zkpc, flux, fluxerror, bin_ids, niters=4, bad_bin_id=None):
    """Iteratively reassign pixels to the nearest weighted bin centroid.

    Parameters
    ----------
    rkpc, zkpc : array, shape=(n_pixels,)
        Pixel coordinates.
    flux, fluxerror : array, shape=(n_pixels,)
        Used to form the per-pixel weight ``(flux / fluxerror) ** 2``.
    bin_ids : array, shape=(n_pixels,)
        Initial bin label of every pixel.
    niters : int
        Number of reassignment iterations.
    bad_bin_id : optional
        Label excluded from the set of bins used as centroids.

    Returns
    -------
    tuple
        ``(bin_ids, max_distance)``; ``max_distance`` is always -1 because
        no centroid-shift measure is computed.
    """
    combrzarr = (np.vstack((rkpc, zkpc))).T
    weights = (flux / fluxerror) ** 2
    weighted_rkpc = rkpc * weights
    weighted_zkpc = zkpc * weights
    unique_bins = np.unique(bin_ids)
    if bad_bin_id is not None:
        unique_bins = unique_bins[unique_bins != bad_bin_id]
    max_distance = -1
    for i in range(niters):
        start = time.time()
        # Compute centroids:
        meanweights = scipy.ndimage.mean(weights, labels=bin_ids, index=unique_bins)
        numperbin = scipy.ndimage.sum(np.ones(len(weights)), labels=bin_ids, index=unique_bins)
        mean_weighted_rkpc = scipy.ndimage.mean(weighted_rkpc, labels=bin_ids, index=unique_bins)
        mean_weighted_zkpc = scipy.ndimage.mean(weighted_zkpc, labels=bin_ids, index=unique_bins)

        populated = numperbin > 0
        rcentroids = (mean_weighted_rkpc / meanweights)[populated]
        zcentroids = (mean_weighted_zkpc / meanweights)[populated]
        # Drop empty bins from the label list too, so row j of the centroid
        # array always corresponds to unique_bins[j]; otherwise the mapping
        # below shifts as soon as one bin loses all its pixels.
        unique_bins = unique_bins[populated]

        # Compute which bin each pixel belongs in:
        combrzarr_centroids = (np.vstack((rcentroids, zcentroids))).T
        tree = BallTree(combrzarr_centroids, leaf_size=10, metric='euclidean')
        dists, indices = tree.query(combrzarr, k=1)
        indices = indices[:, 0]
        bin_ids = unique_bins[indices]
        # print(...) with a single argument behaves the same on Python 2 and 3.
        print("CVT iteration {0:d}: Number of bins = {1:d}, Time = {2:.2f}s".format(i, len(rcentroids), time.time() - start))

    return bin_ids, max_distance
def fix_bins(rkpc, zkpc, bin_ids):
    """Reassign pixels with a negative bin id to the nearest good bin.

    Good bins are the non-negative labels in ``bin_ids``; their centroids are
    the unweighted means of ``rkpc`` and ``zkpc`` per bin.

    ``bin_ids`` is modified in place and also returned. When no pixel has a
    negative id it is returned unchanged.
    """
    bad = bin_ids < 0
    # Nothing to fix: return before building a tree and querying it with an
    # empty array.
    if not np.any(bad):
        return bin_ids

    # Get the centroids of all the successful bins:
    good_bins = np.unique(bin_ids[~bad])
    comb_badrzarr = (np.vstack((rkpc[bad], zkpc[bad]))).T

    rcentroid = scipy.ndimage.mean(rkpc, labels=bin_ids, index=good_bins)
    zcentroid = scipy.ndimage.mean(zkpc, labels=bin_ids, index=good_bins)

    # Ball-tree the centroids:
    combrzarr = (np.vstack((rcentroid, zcentroid))).T
    tree = BallTree(combrzarr, leaf_size=5, metric='euclidean')

    # Query all the bad pixels for their nearest centroid:
    dists, indices = tree.query(comb_badrzarr, k=1)
    indices = indices[:, 0]
    bin_ids[bad] = good_bins[indices]
    return bin_ids
Beispiel #30
0
def get_nearest(src_points, candidates, k_neighbors=1):
    """Find nearest neighbors for all source points from a set of candidate points.

    Adapted from
    https://autogis-site.readthedocs.io/en/latest/notebooks/L3/06_nearest-neighbor-faster.html

    Returns ``(closest, closest_dist)``: for each source point, the index of
    its closest candidate and the haversine distance to it. Only the first
    neighbour is returned even when ``k_neighbors`` is larger.
    """
    # Index the candidates with the haversine metric.
    candidate_tree = BallTree(candidates, leaf_size=15, metric='haversine')

    # Both results have one row per source point and one column per neighbour.
    neighbor_dists, neighbor_idx = candidate_tree.query(src_points, k=k_neighbors)

    # Column 0 is the closest neighbour (column 1 would be the second closest).
    closest = neighbor_idx[:, 0]
    closest_dist = neighbor_dists[:, 0]

    return (closest, closest_dist)
Beispiel #31
0
def do_novelty_generation(
        num_cpu=multiprocessing.cpu_count(), min_distance=1.):
    """Advance one generation and archive the behaviours that are novel.

    Every new chromosome's program is executed and its float outputs are
    turned into a behaviour vector (in parallel on ``num_cpu`` jobs). A
    behaviour whose distance to the nearest entry of the archive, as it was
    at the start of the call, exceeds ``min_distance`` is appended to
    ``behaviors_archive``; its chromosome gets that distance as fitness, and
    ``history`` gains an entry when its score beats the latest recorded one.
    """
    gateway.nextGeneration()
    chromosomes = gateway.listNewChromosomes()

    machines = []
    for chromosome in chromosomes:
        machines.append(gateway.executeProgram(chromosome.getProgram()))
    float_outputs = [list(machine.floats().toList()) for machine in machines]

    worker_pool = Parallel(n_jobs=num_cpu)
    behaviors = worker_pool(delayed(evaluate_behavior)(values)
                            for values in float_outputs)

    # Tree over the archive as it stands before this generation's additions.
    archive_tree = BallTree(behaviors_archive, metric='euclidean')
    accepted = []
    for i, candidate in enumerate(behaviors):
        nearest_dist = archive_tree.query([candidate], k=1)[0][0][0]
        if not nearest_dist > min_distance:
            continue
        behaviors_archive.append(candidate.tolist())
        chromosomes[i].setFitness(nearest_dist)
        accepted.append(i)
        score = evaluate_fitness(float_outputs[i])
        if len(history) < 1 or history[-1][0] < score:
            history.append((score, chromosomes[i].toCodeString()))
            print(history[-1])
    print("Added {} new behaviors.".format(len(accepted)))
def get_nearest_neighbors(src_vectors, tgt_vectors, num_neighbors=5):
    """Find the ``num_neighbors`` nearest targets of every source vector.

    Both inputs are converted to radians and searched with the haversine
    metric. Each returned distance is the haversine distance multiplied by
    6367 and then by 1000.

    Returns a list with one entry per source vector; each entry is a list of
    ``(target_index, distance)`` tuples ordered as returned by the tree.

    Raises ImportError when ``BallTree`` is falsy.
    """
    if not BallTree:
        raise ImportError('The scikit-learn package must be installed')

    src_latlng = np.radians(np.array(src_vectors))
    tgt_latlng = np.radians(np.array(tgt_vectors))

    target_tree = BallTree(tgt_latlng, metric='haversine')
    di_tuple = target_tree.query(src_latlng, k=num_neighbors)
    all_dists = di_tuple[0]
    all_idx = di_tuple[1]

    return [
        [(all_idx[i][j], all_dists[i][j] * 6367 * 1000)
         for j in range(num_neighbors)]
        for i in range(len(src_vectors))
    ]
def get_color_and_labels(
        original_vertices: np.ndarray,
        representative_vertices: np.ndarray) -> List[np.ndarray]:
    """find nearest neighbor in Euclidean space to interpolate color and label information to vertices in simplified mesh.

    The first three columns of ``original_vertices`` are used as positions;
    the remaining columns are what is returned for each match.

    Arguments:
        original_vertices {np.ndarray} -- vertex positions in original mesh, followed by the extra columns to transfer
        representative_vertices {np.ndarray} -- vertex positions in simplified mesh

    Returns:
        List[np.ndarray] -- one array of shape (1, n_extra_columns) per representative vertex
    """
    # Nothing to look up: avoid querying the tree with an empty batch.
    if len(representative_vertices) == 0:
        return []

    ball_tree = BallTree(original_vertices[:, :3])

    # One batched 2-D query instead of a 1-D query per vertex (BallTree.query
    # expects a 2-D array of query points).
    _, ind = ball_tree.query(representative_vertices, k=1)

    # Each row of ind holds a single index; indexing with the row keeps the
    # (1, n_extra_columns) shape of every returned item.
    return [original_vertices[row][:, 3:] for row in ind]
Beispiel #34
0
class NeighborSampler(BaseEstimator):
    """Predict by sampling the target of one of the k nearest training points.

    Sampling probabilities are ``softmax(distance * temperature)`` over the
    k neighbours of each query row.
    NOTE(review): with a positive temperature this gives farther neighbours
    a higher probability - confirm that this is intended.
    """

    def __init__(self, k=8, temperature=1.2):
        self.k = k
        self.temperature = temperature

    def fit(self, X, y):
        """Index ``X`` in a BallTree, store ``y`` as an array, and return ``self``."""
        self.tree_ = BallTree(X)
        self.y_ = np.array(y)
        return self

    def predict(self, X, random_state=None):
        """Sample one neighbour's target per row of ``X``.

        Parameters
        ----------
        X : array, shape=(n_query, n_features)
            Query points.
        random_state : None, int or generator-like object
            None uses NumPy's global random state. An object with a
            ``choice`` method is used as is. Anything else seeds a new
            ``np.random.RandomState``.

        Returns
        -------
        tuple
            ``(targets, distances)``: the sampled targets, and a list with
            the k neighbour distances of each query row.
        """
        distances, indeces = self.tree_.query(X,
                                              return_distance=True,
                                              k=self.k)
        # Honour random_state; it used to be accepted and then ignored.
        if random_state is None:
            rng = np.random
        elif hasattr(random_state, 'choice'):
            rng = random_state
        else:
            rng = np.random.RandomState(random_state)

        result = []
        dist = []
        for distance, index in zip(distances, indeces):
            result.append(
                rng.choice(index,
                           p=softmax(distance * self.temperature)))
            dist.append(distance)

        return self.y_[result], dist
Beispiel #35
0
def make_nearest_surf(center,
                      radius,
                      rotation,
                      contour_pts,
                      psize=20,
                      qsize=8,
                      vis=False,
                      seg=None):
    """Fit a surface through the contour points nearest to a sampled ellipsoid.

    A ``psize`` x ``psize`` grid is sampled on an ellipsoid with semi-axes
    ``radius``, multiplied by ``rotation`` and shifted by ``center``. Every
    sample is snapped to its nearest row of ``contour_pts``, a small uniform
    random jitter is added, and the result is handed to
    ``approximate_surface`` (degree 3 in both directions, ``qsize`` control
    points per direction).

    When ``vis`` is true the snapped points are drawn on ``seg`` through the
    project's image helpers before fitting.
    """
    azimuths = np.linspace(0, 2 * math.pi, num=psize)
    # The elevation range stops 0.01 rad short of either pole.
    elevations = np.linspace(-math.pi / 2 + 0.01, math.pi / 2 - 0.01,
                             num=psize)

    samples = []
    for u in azimuths:
        for v in elevations:
            samples.append([
                radius[0] * math.cos(u) * math.cos(v),
                radius[1] * math.cos(v) * math.sin(u),
                radius[2] * math.sin(v),
            ])
    points = np.array(samples)

    # Rotate each sample in place, then translate the whole cloud.
    for row_idx, row in enumerate(points):
        points[row_idx] = np.dot(row, rotation)
    points += center

    # Snap every ellipsoid sample to its closest contour point.
    nearest = BallTree(contour_pts).query(points, k=1)[1]
    nearest = nearest.reshape(nearest.shape[0])
    points = contour_pts[nearest, :].astype(np.float64)

    # Jitter the snapped points by up to `noise` per coordinate.
    noise = 0.001
    points += np.random.rand(points.shape[0], points.shape[1]) * noise

    if vis:
        img_mask = get_image_mask_points(seg, points)
        color_img = draw_segmentation(seg, img_mask, mark_val=255)
        show_ct_image(color_img)

    return approximate_surface(points.tolist(),
                               psize,
                               psize,
                               3,
                               3,
                               ctrlpts_size_u=qsize,
                               ctrlpts_size_v=qsize)
def _rsl_prims_balltree(X, k=5, alpha=1.4142135623730951, metric='euclidean',
                        **kwargs):
    """Build a single linkage tree from BallTree core distances.

    The core distance of a point is its distance to the k-th entry returned
    by a k-neighbour BallTree query of ``X`` against itself. ``k`` is capped
    at ``X.shape[0] - 1``. Extra keyword arguments are forwarded to both
    ``BallTree`` and ``DistanceMetric.get_metric``.

    Returns the ``SingleLinkageTree`` wrapping the labelled minimum spanning
    tree produced by ``mst_linkage_core_vector``.
    """
    # The Cython routines used require contiguous arrays
    if not X.flags['C_CONTIGUOUS']:
        X = np.array(X, dtype=np.double, order='C')

    n_rows = X.shape[0]
    k = min(n_rows - 1, k)

    tree = BallTree(X, metric=metric, **kwargs)
    dist_metric = DistanceMetric.get_metric(metric, **kwargs)

    knn_distances, _ = tree.query(X, k=k)
    # Last column = distance to the k-th returned neighbour; copied so the
    # slice is C-contiguous.
    core_distances = knn_distances[:, -1].copy(order='C')

    min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric,
                                                alpha)

    return SingleLinkageTree(label(min_spanning_tree))
Beispiel #37
0
 def train(self):
     """Fit the weight grid to ``self.data`` and label every grid cell.

     Phase 1 updates ``self.weights``: for each sample, the winning cell
     (``self.find_winner``) and its neighbourhood within radius ``R``
     (``self.find_neighbor``) are moved towards the sample, scaled by
     ``self.learning_rate``.

     Phase 2 fills ``self.output``: each cell gets the most frequent label
     among the (up to) 10 data points nearest to its weight vector.
     """
     R = (self.m**2 + self.n**2)**0.5
     for it in range(self.max_iter):
         for sample in self.data:
             win_m, win_n, max_sim = self.find_winner(sample)
             neighbor = self.find_neighbor(win_m, win_n, R)
             for w in neighbor:
                 # Each entry is indexed as (row, column, third value); the
                 # third value is passed on to learning_rate.
                 mw, nw, rw = w[0], w[1], w[2]
                 self.weights[mw, nw] += self.learning_rate(
                     it, rw) * (sample - self.weights[mw, nw])
             # NOTE(review): the radius shrinks after every sample, not once
             # per iteration -- confirm this is intended.
             R *= 1 - (it + 1) / self.max_iter

     data_tree = BallTree(self.data)
     # BallTree.query rejects k larger than the number of indexed points, so
     # cap the vote size for small data sets.
     n_votes = min(10, len(self.data))
     for mi in range(self.m):
         for ni in range(self.n):
             dist, idx = data_tree.query([self.weights[mi, ni]], k=n_votes)
             vote = [self.labels[i] for i in idx.reshape(-1)]
             # most_common(1) picks the highest count; ties go to the label
             # seen first, as the stable descending sort did before.
             self.output[mi, ni] = int(Counter(vote).most_common(1)[0][0])
Beispiel #38
0
def test_index():
    """Smoke test: fit the index on random data and run one k-NN query.

    A BallTree query over the same data is computed as a baseline, but the
    comparison between the two results is currently disabled, so the test
    only checks that nothing raises. The index file is removed afterwards.
    """
    xs = rand(1000, 100, random_state=42).toarray()

    try:
        index = PrioritizedDynamicContinuousIndex(
            SQLiteIndexer(index_path=INDEX_PATH),
            composite_indices=2,
            simple_indices=50)
        index.fit(xs)

        query = xs[0:1]
        n_neighbors = 10

        baseline_dist, baseline_idx = BallTree(xs).query(query, k=n_neighbors)
        dist, idx = index.query(query, k=n_neighbors)

        # Disabled check:
        # np.testing.assert_equal(baseline_idx[0], idx)
    finally:
        # Always clean up the on-disk index, even when the body fails.
        if os.path.exists(INDEX_PATH):
            os.remove(INDEX_PATH)
Beispiel #39
0
def get_nearest(src_points, candidates, k_neighbors=1):
    """Return the k nearest candidates (haversine metric) of every source point.

    Returns:
        Tuple ``(indices, distances)``, both transposed to shape
        ``(k_neighbors, n_source_points)``: row 0 holds the closest candidate
        of every source point, row 1 the second closest, and so on.
    """
    candidate_tree = BallTree(candidates, leaf_size=15, metric='haversine')
    dist, idx = candidate_tree.query(src_points, k=k_neighbors)

    # Transpose so that callers can pick the j-th nearest neighbour of all
    # points with a single row index.
    return (idx.T, dist.T)
Beispiel #40
0
def nearestNeighbors():
    """Print the 50 nearest training rows for every row of the test CSV.

    Reads ``train-tuned-disc.csv`` and ``ttd-1.csv`` from ``data_dir``. Both
    are reduced to the columns of the test file that are not listed in
    ``knn_disabled``, so the two matrices share the same column layout. A
    Euclidean BallTree over the training rows is queried with the test rows
    and the neighbour index array is printed.
    """
    print('Nearest Neighbors')
    start_time = time.time()

    def extract_rows(reader, fields):
        # Missing columns yield None, exactly like row.get(field) did inline.
        return [[row.get(field) for field in fields] for row in reader]

    # Context managers make sure both files are closed again.
    with open(data_dir + 'train-tuned-disc.csv') as train_file, \
            open(data_dir + 'ttd-1.csv') as test_file:
        reader = csv.DictReader(train_file)
        testReader = csv.DictReader(test_file)

        # Column selection is driven by the test file's header for both sets.
        fields = [
            field for field in testReader.fieldnames
            if field not in knn_disabled
        ]

        allData = extract_rows(reader, fields)
        testData = extract_rows(testReader, fields)

    X = numpy.array(allData)
    Y = numpy.array(testData)
    nn = BallTree(X, leaf_size=30, metric='euclidean')
    result = nn.query(Y, k=50, return_distance=False)
    print(result)

    print('Finished discretization in {0} s'.format(time.time() - start_time))
Beispiel #41
0
    def get_nearest(src_points, candidates, k_neighbors=1):
        """Return the single closest candidate (haversine metric) per source point.

        Returns:
            Tuple ``(closest, closest_dist)``: for every source point the
            index of its nearest candidate and the corresponding distance.
            Only the first neighbour is returned, whatever ``k_neighbors`` is.
        """
        # get_nearest and nearest_neighbor functions sourced from the following site:
        # https://automating-gis-processes.github.io/site/notebooks/L3/nearest-neighbor-faster.html
        print('balltree get nearest function - hsl')

        candidate_tree = BallTree(candidates, leaf_size=15, metric='haversine')
        dist, idx = candidate_tree.query(src_points, k=k_neighbors)

        # Column 0 is the nearest neighbour; column 1 would be the second
        # nearest, and so on.
        return (idx[:, 0], dist[:, 0])
Beispiel #42
0
def calc_nearest_site():
    """Attach the nearest site and its distance to every cluster center.

    Reads the module-level ``df_centers`` (columns ``average_lat``,
    ``average_lon`` and the column named by ``clust_id_value``) and loads the
    sites with ``get_sites(engine)`` (columns ``lat``, ``lon``, ``port_id``).
    A haversine BallTree over the sites is queried with all centers at once.

    Returns:
        A new DataFrame: ``df_centers`` left-merged with the columns
        ``nearest_site_id`` and ``nearest_port_dist`` (haversine distance
        multiplied by 6371.0088). The module-level ``df_centers`` is not
        modified.
    """
    # Load the sites before they are used. Previously the table was read
    # before being assigned, which raised UnboundLocalError.
    ports_wpi = get_sites(engine)

    # The haversine metric works on (lat, lon) pairs expressed in radians.
    points_of_int = np.radians(
        df_centers.loc[:, ['average_lat', 'average_lon']].values)
    candidates = np.radians(ports_wpi.loc[:, ['lat', 'lon']].values)
    tree = BallTree(candidates, leaf_size=30, metric='haversine')

    # One batched query instead of one query per center.
    dist, ind = tree.query(points_of_int, k=1)
    nearest_pos = ind[:, 0]

    df_nearest = pd.DataFrame(
        {
            clust_id_value: df_centers[clust_id_value].values,
            # BallTree indices are positions in `candidates`, hence the
            # positional lookup on the raw values.
            'nearest_site_id': ports_wpi['port_id'].values[nearest_pos],
            'nearest_port_dist': dist[:, 0] * 6371.0088,
        },
        columns=[clust_id_value, 'nearest_site_id', 'nearest_port_dist'])

    # The result is returned, not bound to the name `df_centers`: assigning
    # to that name made it local and broke the reads above.
    return pd.merge(df_centers,
                    df_nearest,
                    how='left',
                    on=clust_id_value)
Beispiel #43
0
def nne(dim_red, true_labels):
    """
    Calculates the nearest neighbor accuracy (basically leave-one-out cross
    validation with a 1NN classifier).

    Args:
        dim_red (array): dimensions (k, cells)
        true_labels (array): 1d array of integers

    Returns:
        Nearest neighbor accuracy - fraction of points for which the
        1NN classifier returns the correct value.
    """
    # use sklearn's BallTree; cells are the columns of dim_red
    cells = dim_red.T
    bt = BallTree(cells)

    # One batched 2-NN query for every labelled cell.
    _, ind = bt.query(cells[:len(true_labels)], k=2)

    correct = 0
    for i, (label, neighbors) in enumerate(zip(true_labels, ind)):
        # Normally the cell itself comes back first and its true neighbour
        # second. With duplicated points the order is not guaranteed, so
        # pick whichever of the two results is not the cell itself.
        closest_cell = neighbors[1] if neighbors[0] == i else neighbors[0]
        if true_labels[closest_cell] == label:
            correct += 1
    return float(correct) / len(true_labels)
Beispiel #44
0
class PPMD:
    """k-nearest-neighbour predictor returning the component-wise median label.

    The feature rows are indexed in a BallTree; a prediction is the median,
    per label component, over the labels of the k nearest feature rows.
    """

    def __init__(self, features, labels, k=5):
        self.features = features
        # Despite the attribute name, the index is a BallTree.
        self._kdtree = BallTree(features)
        self._y = labels
        self._k = k

    def majority(self, label_indices):
        """Return the per-component median of the labels at ``label_indices``.

        Exactly ``k`` indices are required. The result is a list with one
        median per component of the first selected label.
        """
        assert len(label_indices) == self._k, "Did not get k inputs"

        neighbour_labels = [self._y[position] for position in label_indices]
        n_components = len(neighbour_labels[0])

        return [
            np.median([label[component] for label in neighbour_labels])
            for component in range(n_components)
        ]

    def classify(self, feature_set):
        """Predict the label for one feature vector from its k nearest rows."""
        # ind = self._kdtree.query_radius(feature_set.reshape(1, -1), r=1)
        single_query = feature_set.reshape(1, -1)
        _, neighbour_idx = self._kdtree.query(single_query, k=self._k)

        return self.majority(neighbour_idx[0])

    def error(self, prediction, truth, features):
        """Return the summed absolute error, scaled by a feature-based term.

        The scaling term is the total absolute step-to-step change of
        ``features`` along axis 0, multiplied by
        ``len(prediction) / (len(features) - 1)``.
        """
        assert len(prediction) == len(
            truth
        ), "Number of predictions must equal the number of real data points."

        abs_error = np.abs(np.array(prediction) - np.array(truth))
        random_walk = np.abs(np.diff(features, axis=0)).sum()
        length_ratio = len(prediction) / (len(features) - 1.0)

        return np.array(abs_error / (length_ratio * random_walk)).sum()
Beispiel #45
0
def image_retrieval():
    """Report the average top-K label agreement of an encoder-based retrieval.

    The encoder part of the saved autoencoder maps the noisy train and test
    images to flattened, L2-normalised codes. For every test code the
    ``topK`` nearest train codes are retrieved from a BallTree; the accuracy
    of a query is the share of retrieved items carrying the query's label.
    Progress is printed every 1000 queries and the mean accuracy at the end.
    """
    topK = 10
    avg_acc = 0

    (x_train_noisy, x_test_noisy, y_train, y_test, x_train,
     x_test) = preprocess()
    autoencoder = load_model('../working/autoencoder.h5')
    print(autoencoder.summary())
    encoder = Model(autoencoder.input,
                    autoencoder.get_layer('encoding_layer').output)

    def encode(images):
        # Encoder output -> one flat, unit-length feature vector per image.
        codes = encoder.predict(images)
        flat_size = codes.shape[1] * codes.shape[2] * codes.shape[3]
        codes = codes.reshape(codes.shape[0], flat_size)
        return preprocessing.normalize(codes, norm='l2')

    tree = BallTree(encode(x_train_noisy), leaf_size=200)

    # extracting features from test set
    coded_test = encode(x_test_noisy)
    n_queries = coded_test.shape[0]

    for i, query_code in enumerate(coded_test):
        query_label = y_test[i]
        dists, ids = tree.query([query_code], k=topK)
        labels = np.array([y_train[neighbour] for neighbour in ids[0]])

        acc = (labels == query_label).astype(int).sum() / topK
        avg_acc += acc
        if i % 1000 == 0:
            print('{} / {}: {}'.format(i, n_queries, acc))

    avg_acc /= n_queries
    print("The average top K accuracy is: {}".format(avg_acc))
Beispiel #46
0
    def spatial_lda_internal(adata_subset, x_coordinate, y_coordinate,
                             phenotype, method, radius, knn, imageid):
        """Return, for every cell, the phenotypes of its spatial neighbours.

        Args:
            adata_subset: object whose ``obs`` table holds the columns below.
            x_coordinate, y_coordinate: names of the coordinate columns.
            phenotype: name of the phenotype column.
            method: ``'knn'`` (the ``knn`` nearest cells) or ``'radius'``
                (all cells within ``radius``).
            radius: search radius, used by the ``'radius'`` method.
            knn: number of neighbours, used by the ``'knn'`` method.
            imageid: name of the image-id column (only used for logging).

        Returns:
            1-D object array with one list of phenotype values per cell.

        Raises:
            ValueError: if ``method`` is neither ``'knn'`` nor ``'radius'``.
        """
        # Fail early and clearly; previously an unknown method only surfaced
        # later as an undefined-variable error.
        if method not in ('knn', 'radius'):
            raise ValueError(
                "method must be 'knn' or 'radius', got " + repr(method))

        # Print which image is being processed
        print('Processing: ' + str(np.unique(adata_subset.obs[imageid])))

        # Create a DataFrame with the necessary information
        data = pd.DataFrame({
            'x': adata_subset.obs[x_coordinate],
            'y': adata_subset.obs[y_coordinate],
            'phenotype': adata_subset.obs[phenotype]
        })
        coords = data[['x', 'y']]

        # Identify neighbourhoods based on the method used
        if method == 'knn':
            # a) KNN method
            print("Identifying the " + str(knn) +
                  " nearest neighbours for every cell")
            tree = BallTree(coords, leaf_size=2)
            ind = tree.query(coords, k=knn, return_distance=False)
        else:
            # b) Local radius method
            print("Identifying neighbours within " + str(radius) +
                  " pixels of every cell")
            kdt = BallTree(coords, leaf_size=2)
            ind = kdt.query_radius(coords,
                                   r=radius,
                                   return_distance=False)

        # Map neighbour positions to phenotypes. The labels go into a fresh
        # object array: writing them back into the integer index array of the
        # knn query fails for string phenotypes.
        phenotypes = list(data['phenotype'])
        neighbourhoods = np.empty(len(ind), dtype=object)
        for i, neighbours in enumerate(ind):
            neighbourhoods[i] = [phenotypes[j] for j in neighbours]

        return neighbourhoods
Beispiel #47
0
    def nn_search(self,
                  tree_features,
                  query_features,
                  metric='haversine',
                  convert_radians=False):
        '''
        Build a BallTree and return the nearest-neighbor distance of every query point.

        Parameters
        ----------

        tree_features: array_like
                       Input features to create the search tree. Features are in
                       lat, lon format, in radians (or in degrees together with
                       convert_radians=True)

        query_features: array_like
                        Points for which to calculate the nearest neighbor within the tree.
                        latlon coordinates expected in radians for distance calculation
                        (or in degrees together with convert_radians=True)

        metric: str
                Distance metric for neighborhood search. Default haversine for latlon coordinates.

        convert_radians: bool
                         Set to True when the features are given in degrees; both
                         inputs are then converted to radians before the search

        Returns
        -------

        distances: array_like
                   Array with the corresponding distance in km (haversine distance * earth radius).
                   The scaling is applied whatever metric is chosen, so the
                   unit is only meaningful for the haversine metric.

        '''
        if convert_radians:
            # Local import keeps this method self-contained.
            import numpy as np
            tree_features = np.radians(tree_features)
            query_features = np.radians(query_features)

        tree = BallTree(tree_features, metric=metric)
        # query() defaults to k=1; [0] selects the distances. Earth radius in
        # meters, divided by 1000 to get kilometers.
        return tree.query(query_features)[0] * 6371000 / 1000
Beispiel #48
0
def test_barnes_hut_angle():
    """Check that Barnes-Hut with angle=0 agrees with the exact method.

    For 2 and 3 output components, the joint probabilities and the KL
    divergence from the exact code path are compared with those from the
    nearest-neighbour (Barnes-Hut) code path, using every other sample as
    neighbour.
    """
    # When Barnes-Hut's angle=0 this corresponds to the exact method.
    angle = 0.0
    perplexity = 10
    n_samples = 100
    n_features = 5

    for n_components in (2, 3):
        degrees_of_freedom = float(n_components - 1.0)

        random_state = check_random_state(0)
        samples = random_state.randn(n_samples, n_features)
        samples = samples.astype(np.float32)
        distances = abs(samples.dot(samples.T))
        np.fill_diagonal(distances, 0.0)
        params = random_state.randn(n_samples, n_components)

        P = _joint_probabilities(distances, perplexity, verbose=0)
        kl_exact, grad_exact = _kl_divergence(params, P, degrees_of_freedom,
                                              n_samples, n_components)

        # Ask for all n_samples results and drop the first column, leaving
        # n_samples - 1 neighbours per row.
        _, neighbors_nn = BallTree(distances).query(distances, k=n_samples)
        neighbors_nn = neighbors_nn[:, 1:]
        # Look the neighbour distances up in the original matrix.
        distances_nn = np.array([
            distances[row, cols] for row, cols in enumerate(neighbors_nn)
        ])
        assert np.all(distances[0, neighbors_nn[0]] == distances_nn[0]),\
            abs(distances[0, neighbors_nn[0]] - distances_nn[0])

        P_bh = _joint_probabilities_nn(distances_nn, neighbors_nn,
                                       perplexity, verbose=0)
        kl_bh, grad_bh = _kl_divergence_bh(params, P_bh, degrees_of_freedom,
                                           n_samples, n_components,
                                           angle=angle, skip_num_points=0,
                                           verbose=0)

        P = squareform(P)
        P_bh = P_bh.toarray()
        assert_array_almost_equal(P_bh, P, decimal=5)
        assert_almost_equal(kl_exact, kl_bh, decimal=3)
Beispiel #49
0
def get_nearest(src_points, candidates, k_neighbors=2):
    """Return the two closest candidates (haversine metric) of every source point.

    Returns:
        Tuple ``(closest, closest_dist)``: the first two rows of the
        transposed index and distance arrays, i.e. at most the two nearest
        neighbours per source point even when ``k_neighbors`` is larger.
    """
    # Create tree from the candidate points
    tree = BallTree(candidates, leaf_size=20, metric='haversine')

    # Find closest points and distances
    distances, indices = tree.query(src_points, k=k_neighbors)

    # Transpose so that row j holds the j-th nearest neighbour of all points
    distances = distances.transpose()
    indices = indices.transpose()

    # Rows 0 and 1 are the closest and second closest candidates
    closest = indices[0:2]
    closest_dist = distances[0:2]

    return (closest, closest_dist)
Beispiel #50
0
def _rsl_prims_balltree(X, k=5, alpha=1.4142135623730951, metric='minkowski', p=2):
    """Build a single linkage tree from BallTree core distances.

    The core distance of a point is its distance to the k-th entry returned
    by a k-neighbour BallTree query of ``X`` against itself; ``k`` is capped
    at ``X.shape[0] - 1``.

    Args:
        X: 2D data array, one row per sample.
        k: number of neighbours used for the core distance.
        alpha: passed through to ``mst_linkage_core_vector``.
        metric: metric name understood by ``BallTree`` and ``DistanceMetric``.
        p: Minkowski exponent; only used when ``metric == 'minkowski'``.

    Returns:
        The ``SingleLinkageTree`` wrapping the labelled minimum spanning tree.

    Raises:
        TypeError: Minkowski metric requested with ``p=None``.
        ValueError: Minkowski metric requested with a negative ``p``.
    """
    if metric == 'minkowski':
        if p is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if p < 0:
            raise ValueError('Minkowski metric with negative p value is not defined!')
        # Forward the validated exponent; without it both the tree and the
        # metric silently fall back to their default exponent.
        metric_kwargs = {'p': p}
    else:
        metric_kwargs = {}

    n_rows = X.shape[0]
    k = min(n_rows - 1, k)

    tree = BallTree(X, metric=metric, **metric_kwargs)

    dist_metric = DistanceMetric.get_metric(metric, **metric_kwargs)

    # Last column = distance to the k-th returned neighbour.
    core_distances = tree.query(X, k=k)[0][:, -1]
    min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric, alpha)

    single_linkage_tree = label(min_spanning_tree)
    single_linkage_tree = SingleLinkageTree(single_linkage_tree)

    return single_linkage_tree
Beispiel #51
0
def associate(rad_1, rad_2, k_nn=1):
    """
    Associate every point of grid rad_2 with its k_nn nearest neighbours in
    grid rad_1, using a haversine BallTree built on rad_1.
    Points are pairs of the form [latitude, longitude].

    Returns the tuple (distances, indices), one row per point of rad_2.
    """

    # Room for improvement:
    # - Run the Ball tree on the smallest net
    # - Use something more efficient than a Ball Tree, like a binary search.

    tree = BallTree(rad_1, metric='haversine')

    neighbour_dist, neighbour_idx = tree.query(rad_2,
                                               k=k_nn,
                                               breadth_first=True,
                                               return_distance=True)

    # Sanity check: exactly one result row per query point.
    assert rad_2.shape[0] == neighbour_idx.shape[0]

    return neighbour_dist, neighbour_idx
def distance_to_port(lon, lat, ports):
    '''
    Return the distance (km) from each location to its closest port.

    A haversine BallTree is built over the ports and queried with the
    locations. The haversine metric expects (lat, lon) pairs in radians, so
    the port columns are flipped and all inputs are converted from degrees.

    Arguments:
    lon, lat: longitudes and latitudes of the locations, in degrees; they are
        combined with pd.concat, so pandas objects are expected
    ports: 2D port coordinates; the columns are reversed before use, i.e.
        presumably given in (lon, lat) order -- confirm against the caller
    Returns:
    Pandas Series named 'distance_to_port' holding the haversine distance
    multiplied by 6371 (radius of earth in km)
    '''
    ports_latlon = np.flip(ports, axis=1)
    locations = pd.concat([np.radians(lat), np.radians(lon)], axis=1)

    port_tree = BallTree(np.radians(ports_latlon), metric='haversine')
    nearest_dist, _ = port_tree.query(locations, k=1)

    distance_km = nearest_dist.flatten() * 6371  # radius of earth (km)
    return pd.Series(distance_km, name='distance_to_port')
Beispiel #53
0
class KNN(object):
    """
    k-nearest-neighbours classifier using a BallTree and majority voting.
    """
    def __init__(self, n_neighbors):
        self.n_neighbors = n_neighbors
        # Built by fit(); None until then.
        self.tree_ = None

    def fit(self, x_train, y_train):
        """
        Index the training features and store the training labels.

        The labels are stored with a fresh 0..n-1 index (reset_index), so
        y_train is expected to be a pandas object. Returns self.
        """
        self.tree_ = BallTree(x_train)
        self.y_train_ = y_train.reset_index(drop=True)
        return self

    def predict(self, x_test):
        """
        Predict a class for every row of x_test.

        Each prediction is the most common class among the n_neighbors
        nearest training points. Returns a numpy array of predictions.
        """
        _, neighbour_idx = self.tree_.query(x_test, k=self.n_neighbors)
        assert (neighbour_idx.shape[1] == self.n_neighbors)

        predictions = []
        for row in range(x_test.shape[0]):
            votes = Counter(self.y_train_[neighbour_idx[row]])
            winner, _count = votes.most_common(1)[0]
            predictions.append(winner)
        return np.array(predictions)
Beispiel #54
0
class hyperSphere:
    """Set of hyperspheres, one centred on every row of ``X``.

    The radius of a sphere is the distance from its centre to the centre's
    nearest other point. The score of a sphere is
    ``1 - radius(nearest neighbour) / radius(self)``.
    """

    def __init__(self, X):
        # constructs hypersphere
        self.nm = X.shape[0]
        self.nn_tree = BallTree(X, leaf_size=16, metric='euclidean')
        # k=2: column 0 is normally the point itself, column 1 its neighbour.
        nn_dists, nn_ixs = self.nn_tree.query(X, k=2)

        # radii; eps keeps them strictly positive so the division below is safe
        eps = 1e-10
        self.radii = nn_dists[:, 1].flatten() + eps

        # isolation scores
        self.scores = 1.0 - (self.radii[nn_ixs[:, 1].flatten()] / self.radii)

    def compute_isolation_score(self, sphere, X):
        """Return an isolation score for every row of ``X``.

        A row covered by at least one sphere gets the score of the covering
        sphere with the smallest radius; a row covered by none keeps 1.0.

        Distances and sphere indices come from ``sphere.nn_tree``, radii and
        scores from ``self``.
        NOTE(review): this is only consistent when ``sphere`` holds the same
        points as ``self`` -- confirm against the callers.
        """
        # compute isolation score for sample X
        n, _ = X.shape
        # np.float was removed from NumPy; the builtin float is the same dtype.
        scores = np.ones(n, dtype=float)

        s_dists, s_ixs = sphere.nn_tree.query(X, k=self.nm)

        for i in range(n):
            cr = self.radii[s_ixs[i, :]]

            # belongs to these spheres (positions within the neighbour list)
            ix_m = np.where(s_dists[i, :] <= cr)[0]

            # does not belong to any sphere
            if len(ix_m) == 0:
                continue

            # sphere with smallest radius: argmin is a position inside the
            # filtered subset, so map it back through ix_m before indexing
            # the full neighbour list.
            smallest = ix_m[np.argmin(cr[ix_m])]
            ns = s_ixs[i, smallest]
            scores[i] = self.scores[ns]

        return scores
Beispiel #55
0
def index_nn_haversine(centroids, coordinates, threshold=THRESHOLD):
    """Assign every coordinate the index of its nearest centroid, using a
    Ball tree with haversine distance.

    Parameters:
        centroids (2d array): First column contains latitude, second
            column contains longitude. Each row is a geographic point
        coordinates (2d array): First column contains latitude, second
            column contains longitude. Each row is a geographic point
        threshold (float): distance threshold in km; coordinates whose
            nearest centroid is further away are assigned index -1

    Returns:
        array with as many entries as coordinates, containing the centroid
            indices
    """
    centroid_tree = BallTree(np.radians(centroids), metric='haversine')

    # Only query each distinct coordinate once; `inv` maps the unique rows
    # back onto the original order afterwards.
    _, idx, inv = np.unique(coordinates, axis=0, return_index=True,
                            return_inverse=True)
    unique_coords = coordinates[idx]

    dist, assigned = centroid_tree.query(np.radians(unique_coords), k=1,
                                         return_distance=True, dualtree=True,
                                         breadth_first=False)

    # Warn about, and invalidate, matches that are further than the threshold.
    too_far = dist * EARTH_RADIUS_KM > threshold
    num_warn = np.sum(too_far)
    if num_warn:
        LOGGER.warning('Distance to closest centroid is greater than %s'
                       'km for %s coordinates.', threshold, num_warn)
        assigned[too_far] = -1

    # Copy result to all coordinates and return value
    return np.squeeze(assigned[inv])
Beispiel #56
0
def test_barnes_hut_angle():
    """Check that Barnes-Hut with angle=0 agrees with the exact method.

    For 2 and 3 output components, the joint probabilities and the gradients
    from the exact code path are compared with those from the Barnes-Hut
    code path, using every other sample as neighbour.
    """
    # When Barnes-Hut's angle=0 this corresponds to the exact method.
    angle = 0.0
    perplexity = 10
    n_samples = 100
    n_features = 5

    for n_components in (2, 3):
        degrees_of_freedom = float(n_components - 1.0)

        random_state = check_random_state(0)
        samples = random_state.randn(n_samples, n_features)
        samples = samples.astype(np.float32)
        distances = samples.dot(samples.T)
        np.fill_diagonal(distances, 0.0)
        params = random_state.randn(n_samples, n_components)

        P = _joint_probabilities(distances, perplexity, False)
        _, gradex = _kl_divergence(params, P, degrees_of_freedom, n_samples,
                                   n_components)

        # Ask for all n_samples results and drop the first column, leaving
        # n_samples - 1 neighbours per row.
        _, neighbors_nn = BallTree(distances).query(distances, k=n_samples)
        neighbors_nn = neighbors_nn[:, 1:]

        Pbh = _joint_probabilities_nn(distances, neighbors_nn, perplexity,
                                      False)
        _, gradbh = _kl_divergence_bh(params,
                                      Pbh,
                                      neighbors_nn,
                                      degrees_of_freedom,
                                      n_samples,
                                      n_components,
                                      angle=angle,
                                      skip_num_points=0,
                                      verbose=False)

        assert_array_almost_equal(Pbh, P, decimal=5)
        assert_array_almost_equal(gradex, gradbh, decimal=5)
Beispiel #57
0
def detectCloseSensor(src_points, candidates, k_neighbors=1):
    """
    Find nearest neighbors for all source points from a set of candidate points.

    Parameters
    ----------
    src_points : the infrastructures (query points)
    candidates : the sensors (points indexed in the tree)
    k_neighbors : number of neighbors retrieved per source point

    Returns
    -------
    tuple
        (closest, closest_dist): flattened candidate indices and distances.
        Both are flattened from the transposed query result, so all first
        neighbors come first, followed by all second neighbors, and so on.
    """
    from sklearn.neighbors import BallTree

    print("candidates", candidates)
    sensor_tree = BallTree(candidates, leaf_size=15, metric='haversine')

    dist, idx = sensor_tree.query(src_points, k=k_neighbors)

    # Transpose first so that flattening groups results by neighbor rank.
    closest = idx.T.ravel()
    closest_dist = dist.T.ravel()

    return (closest, closest_dist)
Beispiel #58
0
class GraphNearestNode(ContextManager):
    """Callable that maps locations to the nearest graph node.

    The ``"loc"`` attribute of every node is indexed in a haversine BallTree
    (degrees are converted to radians). Calling the object with locations
    returns, per location, the nearest node and the distance to it.
    """

    RAD_PER_DEGREE = pi / 180
    EARTH_RADIUS_METERS = 6367.5 * 1e3

    def __init__(self, graph):
        # Point array: one row per node, indexed by node id
        self.X = pd.DataFrame(data=nx.get_node_attributes(graph, "loc")).T
        # Nearest neighbors tree
        self.T = BallTree(self.X.values * self.RAD_PER_DEGREE,
                          metric="haversine")

    def __call__(self, locs):
        """Return a Series of distances (meters) indexed by nearest node id.

        ``locs`` holds one location per row, in the same units and column
        order as the node ``"loc"`` attributes; a single location may be
        passed as a flat pair. The result follows the order of ``locs``.
        """
        # atleast_2d lets a single flat location through as one query row.
        query = np.atleast_2d(np.asarray(locs)) * self.RAD_PER_DEGREE
        # Get nearest nodes: distance to X and index in X
        d, i = self.T.query(query, k=1, return_distance=True)
        # Take column 0 explicitly. Squeezing the (distance, index) pair
        # turned a one-location result into scalars and the indices into
        # floats.
        nearest_dist = d[:, 0]
        nearest_pos = i[:, 0]
        # Note: do not sort the Series
        s = pd.Series(index=self.X.index[nearest_pos],
                      data=(nearest_dist * self.EARTH_RADIUS_METERS))
        return s

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Nothing to release; exceptions are not suppressed.
        return None
 def setUp(self):
     """Create the reference data and the k-NN object under test.

     A fixed-seed random point set is generated, the nearest neighbours of
     the first ``numQuery`` points are computed with a BallTree as the
     reference, the points are loaded into ``tfknn`` in batches of 100, and
     the pairwise distance matrix of each reference neighbour set is stored.
     """
     self.numPoints = 500
     self.dim = 10
     self.numNhbrs = 3
     self.numQuery = 5
     # Known Numpy implementation:
     np.random.seed(0)
     self.X = np.random.random(
         (self.numPoints, self.dim))  # numPoints points in dim dimensions
     tree = BallTree(self.X, leaf_size=2)
     _, ind = tree.query(self.X[0:self.numQuery, ], k=self.numNhbrs)
     # Shape (numQuery, numNhbrs, dim): neighbour coordinates per query.
     self.nrstNbhrs = self.X[ind, :]
     self.feeder5 = tf.placeholder(tf.float32, [self.numQuery, self.dim])
     self.feeder1 = tf.placeholder(tf.float32, [1, self.dim])
     with self.test_session() as sess:
         self.testKnn = tfknn(self.X.shape[0], self.X.shape[1], sess)
         # Floor division: range() needs an int, and "/" yields a float on
         # Python 3.
         for i in range(self.X.shape[0] // 100):
             self.testKnn.addPoints_np(self.X[100 * i:100 * (i + 1), ])
         self.testKnn.compile()
     self.distMatrices = np.stack([
         distance_matrix(points, points) for points in self.nrstNbhrs
     ])
def get_nearest(src_points, candidates, k_neighbors=1, distance_threshold=None):
    """Return the candidate rows nearest to the (first) source geometry.

    Args:
    src_points: a pandas row with a geometry column. If several rows are
        passed, only the neighbors of the first one are returned.
    candidates: pandas df with a geometry column
    k_neighbors: number of neighbors; capped at the number of candidates
    distance_threshold: when set (and non-zero), only neighbors whose
        distance is strictly greater than this value are kept

    Returns:
    A copy of the selected candidate rows, nearest first, with an added
    "distance" column holding the BallTree distance.

    NOTE(review): the tree uses the haversine metric on raw centroid (x, y)
    values. Haversine expects (lat, lon) in radians and returns radians, so
    the "distance" unit (and the threshold unit) depends on the CRS --
    confirm against the callers.
    """

    # Create tree from the candidate points
    coordinates = np.vstack(candidates.geometry.centroid.apply(lambda geom: (geom.x,geom.y)))
    tree = BallTree(coordinates, leaf_size=15, metric='haversine')

    # Build an (n, 2) query array of x/y pairs. column_stack keeps each x
    # with its own y; reshaping a stacked [xs, ys] array would mix them up
    # as soon as there is more than one source row.
    src_x = src_points.geometry.centroid.x
    src_y = src_points.geometry.centroid.y
    src_points = np.column_stack([np.atleast_1d(src_x), np.atleast_1d(src_y)])

    # If there are not enough candidates, reduce k (the tree cannot return
    # more neighbors than it holds).
    effective_neighbors = min(k_neighbors, candidates.shape[0])

    distances, indices = tree.query(src_points, k=effective_neighbors)

    # BallTree indices are row positions, so select with iloc. This also
    # keeps the nearest-first order, so the distances line up with the rows.
    # copy() avoids writing the new column into a view of `candidates`.
    neighbor_geoms = candidates.iloc[indices[0]].copy()
    neighbor_geoms["distance"] = distances[0]

    if distance_threshold:
        neighbor_geoms = neighbor_geoms[neighbor_geoms.distance > distance_threshold]

    return neighbor_geoms