コード例 #1
0
def test_ball_tree_pickle():
    """Check that BallTree objects survive a pickle round trip.

    A tree using the default metric and a tree using the Python callable
    ``dist_func`` are pickled with protocols 0, 1 and 2, restored, and
    queried again. The restored trees must return the same query results
    as the original ones.
    """
    np.random.seed(0)
    X = np.random.random((10, 3))

    bt1 = BallTree(X, leaf_size=1)
    # Test if BallTree with callable metric is picklable
    bt1_pyfunc = BallTree(X, metric=dist_func, leaf_size=1, p=2)

    # BallTree.query returns (distances, indices), in that order.
    dist1, ind1 = bt1.query(X)
    dist1_pyfunc, ind1_pyfunc = bt1_pyfunc.query(X)

    def check_pickle_protocol(protocol):
        s = pickle.dumps(bt1, protocol=protocol)
        bt2 = pickle.loads(s)

        s_pyfunc = pickle.dumps(bt1_pyfunc, protocol=protocol)
        bt2_pyfunc = pickle.loads(s_pyfunc)

        dist2, ind2 = bt2.query(X)
        dist2_pyfunc, ind2_pyfunc = bt2_pyfunc.query(X)

        assert_array_almost_equal(dist1, dist2)
        assert_array_almost_equal(ind1, ind2)

        assert_array_almost_equal(dist1_pyfunc, dist2_pyfunc)
        assert_array_almost_equal(ind1_pyfunc, ind2_pyfunc)

    # Run the checks directly: yield-based (generator) tests are not
    # executed by pytest >= 4.0, so the assertions would silently be skipped.
    for protocol in (0, 1, 2):
        check_pickle_protocol(protocol)
コード例 #2
0
def test_ball_tree_pickle():
    """Round-trip BallTrees through pickle and compare their query output.

    Covers a tree with the default metric and a tree with the callable
    metric ``dist_func``, for pickle protocols 0, 1 and 2. The restored
    default-metric tree must also still be a ``BallTree`` instance.
    """
    rng = check_random_state(0)
    X = rng.random_sample((10, 3))

    tree = BallTree(X, leaf_size=1)
    # Test if BallTree with callable metric is picklable
    tree_pyfunc = BallTree(X, metric=dist_func, leaf_size=1, p=2)

    # Reference query results, kept as the tuples returned by ``query``.
    expected = tree.query(X)
    expected_pyfunc = tree_pyfunc.query(X)

    def check_pickle_protocol(protocol):
        def roundtrip(obj):
            return pickle.loads(pickle.dumps(obj, protocol=protocol))

        restored = roundtrip(tree)
        restored_pyfunc = roundtrip(tree_pyfunc)

        actual = restored.query(X)
        actual_pyfunc = restored_pyfunc.query(X)

        for reference, result in ((expected, actual),
                                  (expected_pyfunc, actual_pyfunc)):
            first_ref, second_ref = reference
            first_res, second_res = result
            assert_array_almost_equal(first_ref, first_res)
            assert_array_almost_equal(second_ref, second_res)

        assert isinstance(restored, BallTree)

    for protocol in (0, 1, 2):
        check_pickle_protocol(protocol)
コード例 #3
0
def lof(X, k, outlier_threshold=1.5, verbose=False):
    """Score the rows of ``X`` with a local-outlier-factor style measure.

    Neighbours are looked up with a ``BallTree`` built on ``X`` itself.

    Parameters
    ----------
    X : array-like
        Data to score; it is both indexed by the tree and used as the query.
    k : int
        Number of neighbours requested from the tree. The first column of
        the query result is discarded, so ``k - 1`` neighbours are used.
    outlier_threshold : float, default 1.5
        Rows with a score strictly greater than this value are reported.
    verbose : bool, default False
        If true, print the threshold and the detected outliers.

    Returns
    -------
    list
        One ``[row_index, X[row_index], score]`` entry per reported row,
        in increasing row order.
    """
    tree = BallTree(X, leaf_size=2)
    distance, index = tree.query(X, k)
    # Drop the first column of the query result (normally the query point
    # itself, since X is queried against its own tree).
    distance, index = distance[:, 1:], index[:, 1:]
    # Distance from each row to the farthest of its remaining neighbours.
    radius = distance[:, -1]

    # Calculate LRD.
    LRD = np.mean(np.maximum(distance, radius[index]), axis=1)
    r = 1. / np.array(LRD)

    # Calculate outlier score.
    # NOTE(review): the denominator is cast to float16, which limits its
    # precision and range; this looks unintended but is kept as-is so the
    # scores do not change -- confirm before removing the cast.
    outlier_score = np.sum(r[index], axis=1) / np.array(r, dtype=np.float16)
    outlier_score *= 1. / k

    if verbose:
        print("Recording all outliers with outlier score greater than %s."
              % (outlier_threshold))

    # Could parallelize this, but really not worth the overhead:
    # the performance gain would be insignificant.
    outliers = [[i, X[i], score]
                for i, score in enumerate(outlier_score)
                if score > outlier_threshold]

    if verbose:
        print("Detected outliers:")
        print(outliers)

    return outliers
def experiment_setup(sat_pos, altitude, src_pos, gst_pos, min_elev, orbits,
                     sat_per_orbit, terrestrial_gst_graph, path_control):
    """Compute the distance structures used by an experiment.

    Returns
    -------
    tuple
        ``(src_gst_ind, src_gst_dist, gst_gst_terrestrial,
        gst_gst_satellite)``: the nearest ground stations of the sources
        and their distances, the terrestrial GST -> GST distances, and the
        satellite GST -> GST distances.
    """
    sat_sat_dist = compute_sat_sat_distance(
        sat_pos, altitude, orbits, sat_per_orbit)

    # Nearest-neighbour index over the satellites (positions in radians,
    # haversine metric).
    sat_pos_rad = np.deg2rad(sat_pos)
    haversine = DistanceMetric.get_metric("haversine")
    sat_tree = BallTree(sat_pos_rad, metric=haversine)

    # Satellites in reach of each ground station, with their distances.
    reachable_ind, reachable_dist = compute_gst_sat_distance(
        altitude, min_elev, gst_pos, sat_tree)

    # Terrestrial nearest neighbours of the sources.
    nearest = src_nearest_gst_distance(src_pos, gst_pos, path_control)
    src_gst_ind, src_gst_dist = nearest

    # Terrestrial GST -> GST distance.
    terrestrial = gst_gst_terrestrial_distance(
        terrestrial_gst_graph, gst_pos)

    # Satellite GST -> GST distance.
    n_gsts = gst_pos.shape[0]
    via_satellite = gsts_optimization(
        reachable_ind, reachable_dist, sat_sat_dist, n_gsts=n_gsts)

    return src_gst_ind, src_gst_dist, terrestrial, via_satellite
コード例 #5
0
def test_ball_tree_kde(n_samples=100, n_features=3):
    """Compare ``BallTree.kernel_density`` with a slow reference.

    Every combination of kernel, bandwidth ``h``, tolerances and traversal
    order is evaluated and compared against ``compute_kernel_slow``.
    """
    np.random.seed(0)
    X = np.random.random((n_samples, n_features))
    Y = np.random.random((n_samples, n_features))
    bt = BallTree(X, leaf_size=10)

    # Defined once, with the reference density passed explicitly, instead
    # of re-creating a closure for every (kernel, h) pair.
    def check_results(kernel, h, atol, rtol, breadth_first, dens_true):
        dens = bt.kernel_density(Y,
                                 h,
                                 atol=atol,
                                 rtol=rtol,
                                 kernel=kernel,
                                 breadth_first=breadth_first)
        assert_allclose(dens,
                        dens_true,
                        atol=atol,
                        rtol=max(rtol, 1e-7))

    for kernel in [
            'gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear',
            'cosine'
    ]:
        for h in [0.01, 0.1, 1]:
            dens_true = compute_kernel_slow(Y, X, kernel, h)

            for rtol in [0, 1E-5]:
                for atol in [1E-6, 1E-2]:
                    for breadth_first in (True, False):
                        # Call directly: yield-based tests are not
                        # executed by pytest >= 4.0.
                        check_results(kernel, h, atol, rtol,
                                      breadth_first, dens_true)
コード例 #6
0
def src_nearest_gst_distance(src_pos, gst_pos, nn=1):
    """Find the ``nn`` nearest ground stations of every source.

    Both position arrays are converted from degrees to radians and searched
    with a haversine ``BallTree``. The returned distances are converted with
    ``haversine_to_km`` and multiplied by ``FIBER_PATH_STRETCH``, i.e. they
    INCLUDE PATH STRETCH.

    Returns
    -------
    tuple
        ``(indices, stretched_distances)`` as produced by the tree query.
    """
    gst_pos_rad = np.deg2rad(gst_pos)
    haversine = DistanceMetric.get_metric("haversine")
    gst_tree = BallTree(gst_pos_rad, metric=haversine)

    raw_dist, nearest_ind = gst_tree.query(np.deg2rad(src_pos), k=nn)
    stretched_dist = haversine_to_km(raw_dist) * FIBER_PATH_STRETCH
    return nearest_ind, stretched_dist
コード例 #7
0
def test_query_haversine():
    """Check haversine BallTree queries against brute-force neighbours."""
    n_neighbors = 5
    rng = check_random_state(0)
    points = 2 * np.pi * rng.random_sample((40, 2))

    tree = BallTree(points, leaf_size=1, metric='haversine')
    tree_dist, tree_ind = tree.query(points, k=n_neighbors)
    brute_dist, brute_ind = brute_force_neighbors(
        points, points, k=n_neighbors, metric='haversine')

    # Distances first, then indices, for both implementations.
    for from_tree, from_brute in ((tree_dist, brute_dist),
                                  (tree_ind, brute_ind)):
        assert_array_almost_equal(from_tree, from_brute)
コード例 #8
0
def test_query_haversine():
    """Verify that haversine tree queries agree with a brute-force search.

    The data set is queried against itself for its 5 nearest neighbours;
    both the distances and the indices must match.
    """
    np.random.seed(0)
    samples = 2 * np.pi * np.random.random((40, 2))

    tree_result = BallTree(samples, leaf_size=1,
                           metric='haversine').query(samples, k=5)
    brute_result = brute_force_neighbors(samples, samples, k=5,
                                         metric='haversine')

    dist_tree, ind_tree = tree_result
    dist_brute, ind_brute = brute_result

    assert_array_almost_equal(dist_tree, dist_brute)
    assert_array_almost_equal(ind_tree, ind_brute)
コード例 #9
0
    def check_neighbors(dualtree, breadth_first, k, metric, kwargs):
        """Compare BallTree neighbour distances with a brute-force search.

        ``X`` and ``Y`` come from the enclosing scope; ``kwargs`` holds the
        extra arguments of ``metric``.
        """
        tree = BallTree(X, leaf_size=1, metric=metric, **kwargs)
        tree_dist, _ = tree.query(
            Y, k, dualtree=dualtree, breadth_first=breadth_first)
        brute_dist, _ = brute_force_neighbors(X, Y, k, metric, **kwargs)

        # Only distances are compared: with duplicate distances the
        # indices may legitimately differ, the distances may not.
        assert_array_almost_equal(tree_dist, brute_dist)
コード例 #10
0
    def _fit(self, X):
        """Fit a Euclidean-only neighbours index on ``X``.

        Only the ``'euclidean'`` metric, or ``'minkowski'`` with ``p == 2``,
        is accepted. Depending on ``self.algorithm`` a ``BallTree``, a
        ``KDTree`` or no tree at all (``'brute'``) is stored in
        ``self._tree``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            For an unsupported metric or Minkowski ``p``, empty input, a
            metric that is invalid for sparse input, an unknown algorithm,
            or a non-positive ``n_neighbors``.
        """
        # use Euclidean metric if possible, or raise error [IY]
        # note that in sompy.project_realdata() the algorithm is set by
        # default (e.g. to 'brute' or 'kd_tree')
        if self.metric_params is not None:
            self.effective_metric_params_ = self.metric_params.copy()
        else:
            self.effective_metric_params_ = {}

        if self.metric not in ('euclidean', 'minkowski'):
            raise ValueError("Using Euclidean distance with the wrong metric")
        self.effective_metric_ = self.metric

        # For minkowski distance, use more efficient methods where available
        if self.metric == 'minkowski':
            p = self.effective_metric_params_.pop('p', 2)
            if p != 2:
                raise ValueError(
                    "cannot replace Minkowski with Euclidian metric")
            self.effective_metric_ = 'euclidean'

        X = check_array(X, accept_sparse='csr')

        if X.shape[0] == 0:
            raise ValueError("n_samples must be greater than 0")

        sparse_metrics = VALID_METRICS_SPARSE
        if issparse(X) and \
                self.effective_metric_ not in sparse_metrics['brute']:
            raise ValueError("metric '%s' not valid for sparse input" %
                             self.effective_metric_)

        self._fit_method = self.algorithm
        self._fit_X = X

        method = self._fit_method
        if method == 'brute':
            self._tree = None
        elif method == 'ball_tree' or method == 'kd_tree':
            tree_cls = BallTree if method == 'ball_tree' else KDTree
            self._tree = tree_cls(X,
                                  self.leaf_size,
                                  metric=self.effective_metric_,
                                  **self.effective_metric_params_)
        else:
            raise ValueError("algorithm = '%s' not recognized" %
                             self.algorithm)

        # n_neighbors is validated last, after the index has been built.
        if self.n_neighbors is not None and self.n_neighbors <= 0:
            raise ValueError("Expected n_neighbors > 0. Got %d" %
                             self.n_neighbors)

        return self
コード例 #11
0
    def __init__(self, sen2vec, corpus_path, corpus_vec_path):
        """Load the corpus and index its non-zero vectors in a BallTree.

        Parameters:
            sen2vec: Stored unchanged on the instance as ``self.sen2vec``.
            corpus_path: Path of the CSV file read into ``self._corpus``.
            corpus_vec_path: Path handed to ``load_qa_corpus_vec`` to obtain
                the corpus vectors.
        """
        self.sen2vec = sen2vec

        self._corpus = pd.read_csv(corpus_path)
        self._vectors = load_qa_corpus_vec(corpus_vec_path)

        # Keep only the vectors with at least one truthy component and
        # remember the position each of them had in the full vector list.
        kept = [(pos, vec)
                for pos, vec in enumerate(self._vectors)
                if any(vec)]
        self._indices = [pos for pos, _ in kept]
        X = np.array([vec for _, vec in kept])

        # Build the ball tree
        self.tree = BallTree(X)
コード例 #12
0
def test_gaussian_kde(n_samples=1000):
    """Compare gaussian KDE results to scipy.stats.gaussian_kde."""
    from scipy.stats import gaussian_kde
    rng = check_random_state(0)
    x_in = rng.normal(0, 1, n_samples)
    x_out = np.linspace(-5, 5, 30)

    # Neither the tree nor the sample standard deviation depends on the
    # bandwidth, so they are computed once instead of once per ``h``.
    bt = BallTree(x_in[:, None])
    x_std = np.std(x_in)

    for h in [0.01, 0.1, 1]:
        gkde = gaussian_kde(x_in, bw_method=h / x_std)

        # The tree density is divided by the sample count before being
        # compared with scipy's estimate.
        dens_bt = bt.kernel_density(x_out[:, None], h) / n_samples
        dens_gkde = gkde.evaluate(x_out)

        assert_array_almost_equal(dens_bt, dens_gkde, decimal=3)
コード例 #13
0
def test_ball_tree_query_metrics(metric):
    """Compare BallTree distances with brute force for one metric.

    ``metric`` must be listed in ``BOOLEAN_METRICS`` (data rounded to 0/1)
    or in ``DISCRETE_METRICS`` (data rounded to integers in 0..4).

    Raises
    ------
    ValueError
        If ``metric`` belongs to neither group, so no data can be generated.
    """
    rng = check_random_state(0)
    if metric in BOOLEAN_METRICS:
        X = rng.random_sample((40, 10)).round(0)
        Y = rng.random_sample((10, 10)).round(0)
    elif metric in DISCRETE_METRICS:
        X = (4 * rng.random_sample((40, 10))).round(0)
        Y = (4 * rng.random_sample((10, 10))).round(0)
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on X; fail early with a clear message instead.
        raise ValueError(
            "metric %r is neither a boolean nor a discrete metric" % (metric,))

    k = 5

    bt = BallTree(X, leaf_size=1, metric=metric)
    dist1, _ = bt.query(Y, k)
    dist2, _ = brute_force_neighbors(X, Y, k, metric)
    assert_array_almost_equal(dist1, dist2)
コード例 #14
0
def test_ball_tree_pickle():
    """Check that a BallTree returns the same results after pickling.

    The tree is pickled with protocols 0, 1 and 2, restored, and queried
    with the training data; the results must match those of the original.
    """
    import pickle
    np.random.seed(0)
    X = np.random.random((10, 3))
    bt1 = BallTree(X, leaf_size=1)
    # BallTree.query returns (distances, indices), in that order.
    dist1, ind1 = bt1.query(X)

    def check_pickle_protocol(protocol):
        s = pickle.dumps(bt1, protocol=protocol)
        bt2 = pickle.loads(s)
        dist2, ind2 = bt2.query(X)
        assert_array_almost_equal(dist1, dist2)
        assert_array_almost_equal(ind1, ind2)

    # Run the checks directly: yield-based (generator) tests are not
    # executed by pytest >= 4.0, so the assertions would silently be skipped.
    for protocol in (0, 1, 2):
        check_pickle_protocol(protocol)
コード例 #15
0
def test_ball_tree_two_point(n_samples=100, n_features=3):
    """Compare ``BallTree.two_point_correlation`` with pairwise counts.

    For each radius in ``r`` the reference count is the number of (Y, X)
    pairs whose Euclidean distance is at most that radius.
    """
    np.random.seed(0)
    X = np.random.random((n_samples, n_features))
    Y = np.random.random((n_samples, n_features))
    r = np.linspace(0, 1, 10)
    bt = BallTree(X, leaf_size=10)

    D = DistanceMetric.get_metric("euclidean").pairwise(Y, X)
    counts_true = [(D <= ri).sum() for ri in r]

    def check_two_point(r, dualtree):
        counts = bt.two_point_correlation(Y, r=r, dualtree=dualtree)
        assert_array_almost_equal(counts, counts_true)

    # Run the checks directly: yield-based (generator) tests are not
    # executed by pytest >= 4.0, so the assertions would silently be skipped.
    for dualtree in (True, False):
        check_two_point(r, dualtree)
コード例 #16
0
def test_ball_tree_query(metric, k, dualtree, breadth_first):
    """Check BallTree k-neighbour distances against brute force.

    The extra arguments of ``metric`` are looked up in ``METRICS``.
    """
    rng = check_random_state(0)
    train = rng.random_sample((40, DIMENSION))
    queries = rng.random_sample((10, DIMENSION))

    metric_kwargs = METRICS[metric]

    tree = BallTree(train, leaf_size=1, metric=metric, **metric_kwargs)
    traversal = dict(dualtree=dualtree, breadth_first=breadth_first)
    tree_dist, _ = tree.query(queries, k, **traversal)
    brute_dist, _ = brute_force_neighbors(train, queries, k, metric,
                                          **metric_kwargs)

    # Indices are deliberately not compared: ties in distance can order
    # them differently, while the distances themselves must agree.
    assert_array_almost_equal(tree_dist, brute_dist)
コード例 #17
0
    def query(self, X: np.ndarray, k: Optional[int] = None) -> np.ndarray:
        """
        Looks up the nearest nodes of every sample.

        A Euclidean ball tree is built over ``self.nodes`` on each call.

        Parameters:
            X: An array of shape (num_samples, num_features).
            k: The number of neighbors to return; ``self._k`` is used when
                it is None.

        Returns:
            An array of shape (num_samples, k) and of type int containing the
            indices of the k nearest nodes.
        """
        n_neighbors = self._k if k is None else k
        tree = BallTree(self.nodes, metric="euclidean")
        # Only the indices are returned; the distances are discarded.
        _, neighbor_ind = tree.query(X, n_neighbors)
        return neighbor_ind
コード例 #18
0
def test_ball_tree_query_radius(n_samples=100, n_features=10):
    """Check ``BallTree.query_radius`` indices against a direct computation.

    The query point is the origin; for 100 radii between the distance of
    the first and of the last sample, the points returned by the tree must
    be exactly those whose distance to the origin is within the radius.
    """
    np.random.seed(0)
    X = 2 * np.random.random(size=(n_samples, n_features)) - 1
    origin = np.zeros(n_features, dtype=float)

    eps = 1E-15  # roundoff error can cause test to fail
    tree = BallTree(X, leaf_size=5)
    dist_to_origin = np.sqrt(((X - origin) ** 2).sum(1))

    radii = np.linspace(dist_to_origin[0], dist_to_origin[-1], 100)
    for radius in radii:
        found = tree.query_radius(origin, radius + eps)[0]
        expected = np.where(dist_to_origin <= radius + eps)[0]

        # Sort both index sets so that only membership is compared.
        found.sort()
        expected.sort()

        assert_array_almost_equal(expected, found)
コード例 #19
0
def test_ball_tree_kde(n_samples=100, n_features=3):
    """Run ``check_results`` for every kernel density configuration.

    For each kernel and bandwidth ``h`` the reference density is computed
    with ``compute_kernel_slow``; ``check_results`` is then called for every
    combination of ``rtol``, ``atol`` and ``breadth_first``.
    """
    rng = check_random_state(0)
    X = rng.random_sample((n_samples, n_features))
    Y = rng.random_sample((n_samples, n_features))
    bt = BallTree(X, leaf_size=10)

    for kernel in [
            'gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear',
            'cosine'
    ]:
        for h in [0.01, 0.1, 1]:
            dens_true = compute_kernel_slow(Y, X, kernel, h)

            for rtol in [0, 1E-5]:
                for atol in [1E-6, 1E-2]:
                    for breadth_first in (True, False):
                        # Call directly: yield-based tests are not
                        # executed by pytest >= 4.0.
                        check_results(kernel, h, atol, rtol,
                                      breadth_first, bt, Y, dens_true)
コード例 #20
0
def test_gaussian_kde(n_samples=1000):
    """Compare gaussian KDE results to scipy.stats.gaussian_kde.

    The test is skipped when ``gaussian_kde`` rejects the explicit
    ``bw_method`` argument with a ``TypeError``.
    """
    from scipy.stats import gaussian_kde
    np.random.seed(0)
    x_in = np.random.normal(0, 1, n_samples)
    x_out = np.linspace(-5, 5, 30)

    # Neither the tree nor the sample standard deviation depends on the
    # bandwidth, so they are computed once instead of once per ``h``.
    bt = BallTree(x_in[:, None])
    x_std = np.std(x_in)

    for h in [0.01, 0.1, 1]:
        try:
            gkde = gaussian_kde(x_in, bw_method=h / x_std)
        except TypeError:
            raise SkipTest("Old version of scipy, doesn't accept explicit bandwidth.")

        dens_bt = bt.kernel_density(x_out[:, None], h) / n_samples
        dens_gkde = gkde.evaluate(x_out)

        assert_array_almost_equal(dens_bt, dens_gkde, decimal=3)
コード例 #21
0
def test_ball_tree_query_radius_distance(n_samples=100, n_features=10):
    """Check the distances returned by ``BallTree.query_radius``.

    The query point is the origin; for 100 radii the distances reported by
    the tree must match the Euclidean distances recomputed from the
    returned indices.
    """
    np.random.seed(0)
    X = 2 * np.random.random(size=(n_samples, n_features)) - 1
    origin = np.zeros(n_features, dtype=float)

    eps = 1E-15  # roundoff error can cause test to fail
    tree = BallTree(X, leaf_size=5)
    dist_to_origin = np.sqrt(((X - origin) ** 2).sum(1))

    radii = np.linspace(dist_to_origin[0], dist_to_origin[-1], 100)
    for radius in radii:
        all_ind, all_dist = tree.query_radius(
            origin, radius + eps, return_distance=True)

        # A single query point was given, so take its (only) result.
        neighbor_ind = all_ind[0]
        reported = all_dist[0]

        recomputed = np.sqrt(((origin - X[neighbor_ind]) ** 2).sum(1))

        assert_array_almost_equal(recomputed, reported)
コード例 #22
0
ファイル: test_ball_tree.py プロジェクト: turian/scikit-learn
def test_gaussian_kde(n_samples=1000):
    """Compare gaussian KDE results to scipy.stats.gaussian_kde.

    The test is skipped when ``gaussian_kde`` rejects the explicit
    ``bw_method`` argument with a ``TypeError``.
    """
    from scipy.stats import gaussian_kde
    np.random.seed(0)
    x_in = np.random.normal(0, 1, n_samples)
    x_out = np.linspace(-5, 5, 30)

    # Neither the tree nor the sample standard deviation depends on the
    # bandwidth, so they are computed once instead of once per ``h``.
    bt = BallTree(x_in[:, None])
    x_std = np.std(x_in)

    for h in [0.01, 0.1, 1]:
        try:
            gkde = gaussian_kde(x_in, bw_method=h / x_std)
        except TypeError:
            # older versions of scipy don't accept explicit bandwidth
            raise SkipTest

        dens_bt = bt.kernel_density(x_out[:, None], h) / n_samples
        dens_gkde = gkde.evaluate(x_out)

        assert_allclose(dens_bt, dens_gkde, rtol=1E-3, atol=1E-3)
コード例 #23
0
def optimize_end_to_end_latency_rerouting(sat_pos, altitude, gst_pos, src_pos,
                                          min_elev, orbits, sat_per_orbit,
                                          terrestrial_gst_graph, inactive):
    """Compute the source-to-destination latency matrix with rerouting.

    Inactive ground stations (GSTs) are mapped to their closest active one
    before the per-source results are combined.

    Returns
    -------
    tuple
        ``(src_dst_latency, nearest_active)`` as returned by
        ``compute_src_dst_latency`` and ``inactive_to_closest_active``.
    """
    # Satellite graph distances.
    sat_sat_dist = compute_sat_sat_distance(
        sat_pos, altitude, orbits, sat_per_orbit)

    # Nearest-neighbour index over the satellites (positions in radians,
    # haversine metric).
    sat_pos_rad = np.deg2rad(sat_pos)
    haversine = DistanceMetric.get_metric("haversine")
    sat_tree = BallTree(sat_pos_rad, metric=haversine)

    # Satellites in reach of each ground station, with their distances.
    reachable_ind, reachable_dist = compute_gst_sat_distance(
        altitude, min_elev, gst_pos, sat_tree)

    # Terrestrial GST -> GST distance.
    terrestrial = gst_gst_terrestrial_distance(
        terrestrial_gst_graph, gst_pos)

    # Satellite GST -> GST distance.
    n_gsts = gst_pos.shape[0]
    via_satellite = gsts_optimization(
        reachable_ind, reachable_dist, sat_sat_dist, n_gsts=n_gsts)

    # Closest active GST of every inactive one.
    nearest_active, nearest_active_dist = inactive_to_closest_active(
        inactive, terrestrial)

    # Closest GST of every source, and its distance.
    src_gst_ind, src_gst_dist = src_nearest_gst_distance(src_pos, gst_pos)

    # Put everything together into the src-dst distance matrix.
    n_src = src_pos.shape[0]
    src_dst_latency = compute_src_dst_latency(
        n_src, inactive, src_gst_ind, src_gst_dist,
        nearest_active, nearest_active_dist, via_satellite)

    return src_dst_latency, nearest_active
コード例 #24
0
    def rank(self, cs, yc, ls, lss):
        """Rank all rows of ``yc`` for each query and mark the relevant ones.

        A ``BallTree`` is built over ``yc`` and every row of ``cs`` is
        queried for all ``len(yc)`` neighbours, nearest first.

        Parameters
        ----------
        cs : array-like
            Query vectors, one per ranking.
        yc : array-like
            Vectors indexed by the tree.
        ls : sequence
            Labels; each label is mapped to its position in this sequence
            (presumably aligned with the rows of ``yc`` -- confirm against
            the caller).
        lss : sequence of sequences
            For each query, the labels counted as relevant. Every label
            must occur in ``ls``, otherwise a ``KeyError`` is raised.

        Returns
        -------
        list of numpy.ndarray
            One float array per query, in ranking order, holding 1.0 where
            the ranked index belongs to a relevant label and 0.0 elsewhere.
        """
        targets = {l: i for (i, l) in enumerate(ls)}

        # Number of results (lemmas) ranked
        n_results = len(yc)

        # Build ball tree model
        ball_tree = BallTree(yc)

        rs = ball_tree.query(cs, k=n_results, return_distance=False)

        rankings = []

        for ranking, lemmas in zip(rs, lss):
            # A set gives O(1) membership tests; distinct loop names avoid
            # shadowing the ``ls`` parameter and any outer index.
            relevant = {targets[lemma] for lemma in lemmas}
            ranking_array = np.array(
                [1.0 if idx in relevant else 0.0 for idx in ranking])
            rankings.append(ranking_array)

        return rankings
コード例 #25
0
 def create_image(self, path, max_size, metric):
     """Match an image against itself and store the results.

     The image's features are matched with themselves (two matches per
     descriptor) and the outcome is stored in ``self.original`` together
     with a ``BallTree`` over the keypoint positions built with ``metric``.
     """
     # Load the image and extract its features.
     img_data = imaging.open_img(path, max_size)
     keypoints, descriptors = matchutil.get_features(img_data)

     # Self-match; the distance of the second match of each pair is kept.
     matches = matchutil.flann_match(descriptors, descriptors, k=2)
     second_match_dist = numpy.array([pair[1].distance for pair in matches])
     keypoint_pos = numpy.array([kp.pt for kp in keypoints])

     # Spatial index over the keypoint positions.
     tree = BallTree(keypoint_pos, metric=metric)

     # Collect data
     collected = {}
     collected["descriptors"] = descriptors
     collected["positions"] = keypoint_pos
     collected["distances"] = second_match_dist
     collected["position_tree"] = tree
     collected["size"] = img_data.shape
     self.original = collected
コード例 #26
0
def test_ball_tree_kde(kernel,
                       h,
                       rtol,
                       atol,
                       breadth_first,
                       n_samples=100,
                       n_features=3):
    """Compare ``BallTree.kernel_density`` with ``compute_kernel_slow``.

    The tree is built on ``X`` and evaluated at ``Y``; the result must
    match the slow reference within ``atol`` and ``max(rtol, 1e-7)``.
    """
    rng = np.random.RandomState(0)
    shape = (n_samples, n_features)
    X = rng.random_sample(shape)
    Y = rng.random_sample(shape)
    tree = BallTree(X, leaf_size=10)

    expected = compute_kernel_slow(Y, X, kernel, h)

    kde_options = dict(atol=atol,
                       rtol=rtol,
                       kernel=kernel,
                       breadth_first=breadth_first)
    actual = tree.kernel_density(Y, h, **kde_options)

    # rtol == 0 is replaced by a small floor for the comparison.
    effective_rtol = max(rtol, 1e-7)
    assert_allclose(actual, expected, atol=atol, rtol=effective_rtol)
コード例 #27
0
ファイル: base.py プロジェクト: tbaccata/scikit-hubness
    def _fit(self, X):
        """Fit the neighbors index (tree, brute force, or approximate) on ``X``.

        ``X`` may be an already fitted ``NeighborsBase``, a ``BallTree``, a
        ``KDTree`` or an ``ApproximateNearestNeighbor`` object; in those
        cases the existing data/index is reused and the method returns
        early. Otherwise ``X`` is validated with ``check_array``, the index
        selected by ``self.algorithm`` is built, and the hubness reduction
        selected by ``self.hubness`` is fitted.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If the Minkowski ``p`` is not positive, ``X`` has no samples,
            the metric is not valid for sparse input, the algorithm or the
            hubness reduction method is not recognized, or ``n_neighbors``
            is not positive.
        TypeError
            If ``n_neighbors`` is not of an integer type.
        NotImplementedError
            For the hubness reduction methods ``'snn'`` and ``'simhubin'``.
        """
        self._check_algorithm_metric()
        self._check_hubness_algorithm()
        self._check_algorithm_hubness_compatibility()
        # Work on a copy so that the user-supplied metric_params stay intact.
        if self.metric_params is None:
            self.effective_metric_params_ = {}
        else:
            self.effective_metric_params_ = self.metric_params.copy()

        # A 'p' given in metric_params takes precedence over self.p.
        effective_p = self.effective_metric_params_.get('p', self.p)
        if self.metric in ['wminkowski', 'minkowski']:
            self.effective_metric_params_['p'] = effective_p

        self.effective_metric_ = self.metric
        # For minkowski distance, use more efficient methods where available
        if self.metric == 'minkowski':
            p = self.effective_metric_params_.pop('p', 2)
            if p <= 0:
                raise ValueError(
                    f"p must be greater than one for minkowski metric, "
                    f"or in ]0, 1[ for fractional norms.")
            elif p == 1:
                self.effective_metric_ = 'manhattan'
            elif p == 2:
                self.effective_metric_ = 'euclidean'
            elif p == np.inf:
                self.effective_metric_ = 'chebyshev'
            else:
                # No specialised metric: keep minkowski and restore p.
                self.effective_metric_params_['p'] = p

        # Already fitted estimator: reuse its data, index and hubness state.
        if isinstance(X, NeighborsBase):
            self._fit_X = X._fit_X
            self._tree = X._tree
            self._fit_method = X._fit_method
            self._index = X._index
            self._hubness_reduction = X._hubness_reduction
            return self

        # Prebuilt tree: take its data and use it directly.
        elif isinstance(X, BallTree):
            self._fit_X = X.data
            self._tree = X
            self._fit_method = 'ball_tree'
            return self

        elif isinstance(X, KDTree):
            self._fit_X = X.data
            self._tree = X
            self._fit_method = 'kd_tree'
            return self

        # Prebuilt approximate index.
        # NOTE(review): _fit_X is only assigned for the two LSH index types
        # in this branch; the other index types leave it untouched.
        elif isinstance(X, ApproximateNearestNeighbor):
            self._tree = None
            if isinstance(X, PuffinnLSH):
                self._fit_X = X.X_train_
                self._fit_method = 'lsh'
            elif isinstance(X, FalconnLSH):
                self._fit_X = X.X_train_
                self._fit_method = 'falconn_lsh'
            elif isinstance(X, ONNG):
                self._fit_method = 'onng'
            elif isinstance(X, HNSW):
                self._fit_method = 'hnsw'
            elif isinstance(X, RandomProjectionTree):
                self._fit_method = 'rptree'
            self._index = X
            # TODO enable hubness reduction here
            ...
            return self

        X = check_array(X, accept_sparse='csr')

        n_samples = X.shape[0]
        if n_samples == 0:
            raise ValueError(
                f"n_samples must be greater than 0 (but was {n_samples}.")

        # Sparse input always falls back to brute force, and hubness
        # reduction is disabled for it.
        if issparse(X):
            if self.algorithm not in ('auto', 'brute'):
                warnings.warn("cannot use tree with sparse input: "
                              "using brute force")
            if self.effective_metric_ not in VALID_METRICS_SPARSE['brute'] \
                    and not callable(self.effective_metric_):
                raise ValueError(
                    f"Metric '{self.effective_metric_}' not valid for sparse input. "
                    f"Use sorted(sklearn.neighbors.VALID_METRICS_SPARSE['brute']) "
                    f"to get valid options. Metric can also be a callable function."
                )
            self._fit_X = X.copy()
            self._tree = None
            self._fit_method = 'brute'
            if self.hubness is not None:
                warnings.warn(
                    f'cannot use hubness reduction with tree: disabling hubness reduction.'
                )
                self.hubness = None
            self._hubness_reduction_method = None
            self._hubness_reduction = NoHubnessReduction()
            return self

        self._fit_method = self.algorithm
        self._fit_X = X
        self._hubness_reduction_method = self.hubness

        if self._fit_method == 'auto':
            # A tree approach is better for small number of neighbors,
            # and KDTree is generally faster when available
            if ((self.n_neighbors is None
                 or self.n_neighbors < self._fit_X.shape[0] // 2)
                    and self.metric != 'precomputed'):
                if self.effective_metric_ in VALID_METRICS['kd_tree']:
                    self._fit_method = 'kd_tree'
                elif (callable(self.effective_metric_)
                      or self.effective_metric_ in VALID_METRICS['ball_tree']):
                    self._fit_method = 'ball_tree'
                else:
                    self._fit_method = 'brute'
            else:
                self._fit_method = 'brute'
            self._index = None

        # Build the index for the chosen method: exact methods set _tree
        # (or nothing for brute force), approximate methods set _index.
        if self._fit_method == 'ball_tree':
            self._tree = BallTree(X,
                                  self.leaf_size,
                                  metric=self.effective_metric_,
                                  **self.effective_metric_params_)
            self._index = None
        elif self._fit_method == 'kd_tree':
            self._tree = KDTree(X,
                                self.leaf_size,
                                metric=self.effective_metric_,
                                **self.effective_metric_params_)
            self._index = None
        elif self._fit_method == 'brute':
            self._tree = None
            self._index = None
        elif self._fit_method == 'lsh':
            self._index = PuffinnLSH(verbose=self.verbose,
                                     **self.algorithm_params)
            self._index.fit(X)
            self._tree = None
        elif self._fit_method == 'falconn_lsh':
            self._index = FalconnLSH(verbose=self.verbose,
                                     **self.algorithm_params)
            self._index.fit(X)
            self._tree = None
        elif self._fit_method == 'onng':
            self._index = ONNG(verbose=self.verbose, **self.algorithm_params)
            self._index.fit(X)
            self._tree = None
        elif self._fit_method == 'hnsw':
            self._index = HNSW(verbose=self.verbose, **self.algorithm_params)
            self._index.fit(X)
            self._tree = None
        elif self._fit_method == 'rptree':
            self._index = RandomProjectionTree(verbose=self.verbose,
                                               **self.algorithm_params)
            self._index.fit(X)
            self._tree = None  # because it's a tree, but not an sklearn tree...
        else:
            raise ValueError(f"algorithm = '{self.algorithm}' not recognized")

        if self._hubness_reduction_method is None:
            self._hubness_reduction = NoHubnessReduction()
        else:
            # 'n_candidates' must be present in algorithm_params here,
            # otherwise this lookup raises KeyError.
            n_candidates = self.algorithm_params['n_candidates']
            if 'include_self' in self.kwargs and self.kwargs['include_self']:
                neigh_train = self.kcandidates(X,
                                               n_neighbors=n_candidates,
                                               return_distance=True)
            else:
                neigh_train = self.kcandidates(n_neighbors=n_candidates,
                                               return_distance=True)
            # Remove self distances
            neigh_dist_train = neigh_train[0]  # [:, 1:]
            neigh_ind_train = neigh_train[1]  # [:, 1:]
            if self._hubness_reduction_method == 'ls':
                self._hubness_reduction = LocalScaling(verbose=self.verbose,
                                                       **self.hubness_params)
            elif self._hubness_reduction_method == 'mp':
                self._hubness_reduction = MutualProximity(
                    verbose=self.verbose, **self.hubness_params)
            elif self._hubness_reduction_method == 'dsl':
                self._hubness_reduction = DisSimLocal(verbose=self.verbose,
                                                      **self.hubness_params)
            elif self._hubness_reduction_method == 'snn':
                raise NotImplementedError('feature not yet implemented')
            elif self._hubness_reduction_method == 'simhubin':
                raise NotImplementedError('feature not yet implemented')
            else:
                raise ValueError(
                    f'Hubness reduction algorithm = "{self._hubness_reduction_method}" not recognized.'
                )
            self._hubness_reduction.fit(neigh_dist_train,
                                        neigh_ind_train,
                                        X=X,
                                        assume_sorted=False)

        # n_neighbors is validated last, after the index has been built.
        if self.n_neighbors is not None:
            if self.n_neighbors <= 0:
                raise ValueError(
                    f"Expected n_neighbors > 0. Got {self.n_neighbors:d}")
            else:
                if not np.issubdtype(type(self.n_neighbors), np.integer):
                    raise TypeError(
                        f"n_neighbors does not take {type(self.n_neighbors)} value, "
                        f"enter integer value")

        return self
コード例 #28
0
 def check_neighbors(metric):
     """Compare BallTree and brute-force neighbour distances for ``metric``.

     ``X``, ``Y`` and ``k`` are taken from the enclosing scope.
     """
     tree = BallTree(X, leaf_size=1, metric=metric)
     tree_dist, _ = tree.query(Y, k)
     brute_dist, _ = brute_force_neighbors(X, Y, k, metric)
     # Only the distances are compared, not the indices.
     assert_array_almost_equal(tree_dist, brute_dist)
コード例 #29
0
 def get_ball_tree_index(X):
     """Build and return a ``BallTree`` over ``X`` with default settings."""
     index = BallTree(X)
     return index
コード例 #30
0
    #print(WandV['pressure'])
    #X = np.array((WandV.values()))
    #print(len(WandV.values()))
    #print(WandV.values())
    import pandas as pd
    df = pd.DataFrame()

    for i in WandV.values():
        #print(pd.DataFrame(i))
        df = df.append(pd.Series(i), ignore_index=True)
    #print("temp head",df.head())
    #print("temp shape", df.shape)

    from sklearn.neighbors.ball_tree import BallTree
    print("KNN ...........")
    tree = BallTree(df, leaf_size=2)
    print("finding neighbor words .....")
    dist, ind = tree.query(df[:1], k=3)  # doctest: +SKIP
    print(ind)  # indices of 3 closest neighbors
    #[0 3 1]
    print(dist)  # distances to 3 closest neighbors
    #[ 0.          0.19662693  0.29473397]

    v1 = df.iloc[0, :]
    v2 = df.iloc[363, :]
    v3 = df.iloc[3774, :]

    V1 = np.array(v1)
    V2 = np.array(v2)
    V3 = np.array(v3)