Example #1
def hnswlibTok(X, eps, min_Pts):                # find every point's nearest neighbors (HNSW variant left commented out)
    # dim = len(X[0])
    # data_labels = range(len(X))
    # p = hnswlib.Index(space='l2', dim=dim)
    # p.init_index(max_elements=len(X), ef_construction=200, M=20)
    # p.add_items(X, data_labels)
    # p.set_ef(50)
    # labels, distance = p.knn_query(X, k=len(X))

    tree = KDTree(X, leaf_size=50)
    dist, labels = tree.query(X, k=len(X))      # neighbors of each point, ordered by distance

    neighbor_list = []
    omega_list = []                             # set of core objects
    for i in labels:
        centers = X[i[0]]                       # the first neighbor is the point itself
        center_neighbor = i
        dist_list = []
        for j in range(1, len(i)):
            curr = X[i[j]]
            d = np.sqrt(np.sum(np.square(centers - curr)))
            dist_list.append(d)

            if d > eps:                         # cut the neighbor list at the first index beyond the radius
                center_neighbor = center_neighbor[0:j]
                break
        neighbor_list.append(set(center_neighbor))
        if len(neighbor_list[-1]) >= min_Pts:
            omega_list.append(i[0])             # add the sample to the set of core objects
    omega_list = set(omega_list)                # convert to a set for easier manipulation

    return neighbor_list,omega_list
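A minimal usage sketch for the helper above, assuming the numpy and sklearn.neighbors.KDTree imports it relies on and a small synthetic dataset (the eps and min_Pts values are illustrative, not tuned):

import numpy as np
from sklearn.neighbors import KDTree

# Synthetic 2-D data; hnswlibTok returns the eps-neighborhood of every point
# and the indices of the core objects (points with at least min_Pts neighbors).
rng = np.random.RandomState(0)
X = rng.rand(60, 2)

neighbor_list, omega_list = hnswlibTok(X, eps=0.15, min_Pts=4)
print(len(neighbor_list))    # one neighborhood set per point
print(sorted(omega_list))    # indices of the core objects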
Example #2
    def metric(self, X, Y, n_features=None, dist_func=euclidean):
        # Reshape both point sets to (n_points, n_features), build a KD-tree on
        # the set assigned to `big`, and query it with the other set; the
        # per-point nearest-neighbor distances are then reduced by self.linkage.
        small, big = (X, Y) if len(X) > len(Y) else (Y, X)
        small = small.reshape(-1, n_features)
        big = big.reshape(-1, n_features)
        kdtree = KDTree(big)
        result, _ = kdtree.query(small)
        result = self.linkage(result)
        return result
Example #3
def test_kd_tree_two_point(dualtree):
    n_samples, n_features = (100, 3)
    rng = check_random_state(0)
    X = rng.random_sample((n_samples, n_features))
    Y = rng.random_sample((n_samples, n_features))
    r = np.linspace(0, 1, 10)
    kdt = KDTree(X, leaf_size=10)

    D = DistanceMetric.get_metric("euclidean").pairwise(Y, X)
    counts_true = [(D <= ri).sum() for ri in r]

    counts = kdt.two_point_correlation(Y, r=r, dualtree=dualtree)
    assert_array_almost_equal(counts, counts_true)
Example #4
def get_bags_of_words(image_paths):
    '''
    This function should take in a list of image paths and calculate a bag of
    words histogram for each image, then return those histograms in an array.

    Inputs:
        image_paths: A Python list of strings, where each string is a complete
                     path to one image on the disk.

    Outputs:
        An nxd numpy matrix, where n is the number of images in image_paths and
        d is size of the histogram built for each image.

    Use the same hog function to extract feature vectors as before (see
    build_vocabulary). It is important that you use the same hog settings for
    both build_vocabulary and get_bags_of_words! Otherwise, you will end up
    with different feature representations between your vocab and your test
    images, and you won't be able to match anything at all!

    After getting the feature vectors for an image, you will build up a
    histogram that represents what words are contained within the image.
    For each feature, find the closest vocab word, then add 1 to the histogram
    at the index of that word. For example, if the closest vector in the vocab
    is the 103rd word, then you should add 1 to the 103rd histogram bin. Your
    histogram should have as many bins as there are vocabulary words.

    Suggested functions: scipy.spatial.distance.cdist, np.argsort,
                         np.linalg.norm, skimage.feature.hog
    '''

    vocab = np.load('vocab.npy')
    print('Loaded vocab from file.')

    vocab_size = len(vocab)                  # one histogram bin per vocabulary word
    tree = KDTree(vocab)
    cluster_SIFT_features = []
    # Note: this implementation uses SIFT descriptors rather than the HOG
    # features suggested in the docstring above.
    sift = cv2.xfeatures2d.SIFT_create()
    for image_path in tqdm(image_paths, desc='SIFT'):
        image_bag = [0] * vocab_size
        image = cv2.imread(image_path)
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        locations, SIFT_features = sift.detectAndCompute(gray, None)
        nearest_dist, nearest_ind = tree.query(SIFT_features, k=1)
        for index in nearest_ind.flatten():  # index of the closest vocab word
            image_bag[int(index)] += 1
        cluster_SIFT_features.append(image_bag)
    return np.array(cluster_SIFT_features)   # n x d histogram matrix
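The docstring above also suggests scipy.spatial.distance.cdist for the nearest-word lookup; a minimal sketch of that inner step (the helper name is illustrative, and it assumes descriptors arrive as an n x d array alongside the loaded vocabulary) could look like:

import numpy as np
from scipy.spatial.distance import cdist

def histogram_from_features(features, vocab):
    # Distance from every descriptor to every vocabulary word, then count how
    # often each word is the closest one; one bin per vocabulary word.
    distances = cdist(features, vocab)         # shape (n_descriptors, n_words)
    nearest = np.argmin(distances, axis=1)     # closest word for each descriptor
    return np.bincount(nearest, minlength=len(vocab))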
Example #5
def test_gaussian_kde(n_samples=1000):
    # Compare gaussian KDE results to scipy.stats.gaussian_kde
    from scipy.stats import gaussian_kde
    rng = check_random_state(0)
    x_in = rng.normal(0, 1, n_samples)
    x_out = np.linspace(-5, 5, 30)

    for h in [0.01, 0.1, 1]:
        kdt = KDTree(x_in[:, None])
        gkde = gaussian_kde(x_in, bw_method=h / np.std(x_in))

        dens_kdt = kdt.kernel_density(x_out[:, None], h) / n_samples
        dens_gkde = gkde.evaluate(x_out)

        assert_array_almost_equal(dens_kdt, dens_gkde, decimal=3)
Example #6
def test_kd_tree_query_radius(n_samples=100, n_features=10):
    rng = check_random_state(0)
    X = 2 * rng.random_sample(size=(n_samples, n_features)) - 1
    query_pt = np.zeros(n_features, dtype=float)

    eps = 1E-15  # roundoff error can cause test to fail
    kdt = KDTree(X, leaf_size=5)
    rad = np.sqrt(((X - query_pt)**2).sum(1))

    for r in np.linspace(rad[0], rad[-1], 100):
        ind = kdt.query_radius([query_pt], r + eps)[0]
        i = np.where(rad <= r + eps)[0]

        ind.sort()
        i.sort()

        assert_array_almost_equal(i, ind)
Example #7
def test_array_object_type():
    """Check that we do not accept object dtype array."""
    X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object)
    with pytest.raises(
        ValueError,
        match="setting an array element with a sequence"
    ):
        KDTree(X)
Example #8
def test_kd_tree_query_radius_distance(n_samples=100, n_features=10):
    rng = check_random_state(0)
    X = 2 * rng.random_sample(size=(n_samples, n_features)) - 1
    query_pt = np.zeros(n_features, dtype=float)

    eps = 1E-15  # roundoff error can cause test to fail
    kdt = KDTree(X, leaf_size=5)
    rad = np.sqrt(((X - query_pt)**2).sum(1))

    for r in np.linspace(rad[0], rad[-1], 100):
        ind, dist = kdt.query_radius([query_pt], r + eps, return_distance=True)

        ind = ind[0]
        dist = dist[0]

        d = np.sqrt(((query_pt - X[ind])**2).sum(1))

        assert_array_almost_equal(d, dist)
Example #9
def median_smallest_distance(points, tree=None):
    """Median over all points of the distance to their closest neighbor.

    This gives an idea of the "grid size" of a point dataset.
    """
    points = numpy.array(points)
    if tree is None:
        # points = numpy.unique(points, axis=0)  # Too slow
        points = numpy.array(list(set(tuple(p) for p in points)))
        tree = KDTree(points)

    # Get the minimum distances to neighbors for a sample of points
    rnd = numpy.random.RandomState(89)
    sample_size = min(len(points), 100)
    sample_idx = rnd.choice(len(points), sample_size, replace=False)
    sample = points[sample_idx]
    distances, _ = tree.query(sample, k=2, return_distance=True)

    # Return the median of that
    return numpy.median(distances[:, 1])
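A short usage sketch for median_smallest_distance, assuming the numpy and sklearn.neighbors.KDTree imports used above and an illustrative jittered-grid dataset:

import numpy
from sklearn.neighbors import KDTree

# Points on a unit grid with a little jitter; the returned "grid size" should
# come out close to 1.0 for this layout.
rng = numpy.random.RandomState(0)
grid = numpy.stack(numpy.meshgrid(numpy.arange(10.0), numpy.arange(10.0)), axis=-1).reshape(-1, 2)
points = grid + rng.normal(scale=0.01, size=grid.shape)

print(median_smallest_distance(points))

# A prebuilt KDTree can be passed in to skip the deduplication and tree build.
tree = KDTree(points)
print(median_smallest_distance(points, tree))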
Example #10
def test_kdtree_picklable_with_joblib():
    """Make sure that KDTree queries work when joblib memmaps.

    Non-regression test for #21685 and #21228."""
    rng = np.random.RandomState(0)
    X = rng.random_sample((10, 3))
    tree = KDTree(X, leaf_size=2)

    # Call Parallel with max_nbytes=1 to trigger read-only memory mapping, which
    # used to raise "ValueError: buffer source array is read-only" in a previous
    # version of the Cython code.
    Parallel(n_jobs=2,
             max_nbytes=1)(delayed(tree.query)(data) for data in 2 * [X])
Example #11
def test_kd_tree_kde(kernel, h):
    n_samples, n_features = (100, 3)
    rng = check_random_state(0)
    X = rng.random_sample((n_samples, n_features))
    Y = rng.random_sample((n_samples, n_features))
    kdt = KDTree(X, leaf_size=10)

    dens_true = compute_kernel_slow(Y, X, kernel, h)

    for rtol in [0, 1E-5]:
        for atol in [1E-6, 1E-2]:
            for breadth_first in (True, False):
                check_results(kernel, h, atol, rtol, breadth_first, Y, kdt,
                              dens_true)
Example #12
def prepare_data():
    global __tree
    try:
        __tree = pickle.load(
            open(paths.models + 'alternative_actors/actors_kdtree.pkl', "rb"))
    except (OSError, IOError) as e:
        sample_size = 20000
        latent_vector_generator, actors_id = get_latent_vector_generator()
        actors_id = actors_id[0:sample_size]
        vectors = np.array(
            [latent_vector_generator(actor_id) for actor_id in actors_id])
        __tree = KDTree(vectors, leaf_size=2)
        pickle.dump(
            __tree,
            open(paths.models + 'alternative_actors/actors_kdtree.pkl', "wb"))
Example #13
def join(
    original_data,
    augment_data_path,
    original_metadata,
    augment_metadata,
    writer,
    left_columns,
    right_columns,
    how='left',
    columns=None,
    agg_functions=None,
    temporal_resolution=None,
):
    """
    Performs a join between original_data (a pandas.DataFrame or an open CSV
    file) and the augmentation data read in chunks from augment_data_path,
    using left_columns and right_columns.

    The result is written to the writer object.

    Returns the metadata for the result.
    """

    if isinstance(original_data, pd.DataFrame):
        pass
    elif hasattr(original_data, 'read'):
        original_data = pd.read_csv(
            original_data,
            error_bad_lines=False,
            dtype=str,
        )
    else:
        raise TypeError(
            "join() argument 1 should be a file or a DataFrame, got "
            "%r" % type(original_data))

    augment_data_columns = [col['name'] for col in augment_metadata['columns']]

    # only converting data types for columns involved in augmentation
    original_join_columns_idx = []
    augment_join_columns_idx = []
    augment_columns_transform = []
    for left, right in zip(left_columns, right_columns):
        if len(left) == 2 and len(right) == 2:
            # Spatial augmentation
            # Get those columns
            points = original_data.iloc[:, left]
            # De-duplicate
            points = pd.DataFrame(list(set(tuple(p) for p in points.values)))
            # Convert to numeric numpy array
            points = pd.DataFrame({
                'x':
                pd.to_numeric(
                    points.iloc[:, 0],
                    errors='coerce',
                    downcast='float',
                ),
                'y':
                pd.to_numeric(
                    points.iloc[:, 1],
                    errors='coerce',
                    downcast='float',
                ),
            }).values
            # Build KDTree
            tree = KDTree(points)
            # Compute max distance for nearest join
            max_dist = 2 * median_smallest_distance(points, tree)
            logger.info("Using nearest spatial join, max=%r", max_dist)
            # Store transformation
            augment_columns_transform.append((
                right,
                _tree_nearest(tree, max_dist),
            ))

            original_join_columns_idx.extend(left)
            augment_join_columns_idx.extend(right)
        elif len(left) > 1 or len(right) > 1:
            raise AugmentationError("Datamart currently does not support "
                                    "combination of columns for augmentation.")
        else:
            original_join_columns_idx.append(left[0])
            augment_join_columns_idx.append(right[0])

    original_data = set_data_index(
        original_data,
        original_join_columns_idx,
        original_metadata['columns'],
        drop=False,  # Keep the values of join columns from this side
    )

    # Add a column of unique indices which will be used to aggregate
    original_data[UNIQUE_INDEX_KEY] = pd.RangeIndex(len(original_data))

    logger.info("Performing join...")

    # Stream the data in
    augment_data_chunks = pd.read_csv(
        augment_data_path,
        error_bad_lines=False,
        chunksize=CHUNK_SIZE_ROWS,
    )
    try:
        first_augment_data = next(augment_data_chunks)
    except StopIteration:
        raise AugmentationError("Empty augmentation data")

    # Columns to drop
    drop_columns = None
    if columns:
        drop_columns = list(
            # Drop all the columns in augment_data
            set(augment_data_columns[c] for c in columns)
            # except
            - (
                # the requested columns
                set(columns)
                # and the join columns
                | {col[0]
                   for col in right_columns}))

    # Defer temporal alignment until reading the first block from companion
    # (and converting it to the right data types!)
    update_idx = None
    original_data_res = None

    # Streaming join
    start = time.perf_counter()
    join_ = []
    # Iterate over chunks of augment data
    for augment_data in itertools.chain([first_augment_data],
                                        augment_data_chunks):
        # Run transforms
        for cols, transform in augment_columns_transform:
            augment_data.iloc[:, cols] = transform(augment_data.iloc[:, cols])

        # Convert data types
        augment_data = set_data_index(
            augment_data,
            augment_join_columns_idx,
            augment_metadata['columns'],
            drop=True,  # Drop the join columns on that side (avoid duplicates)
        )

        if update_idx is None:
            # Guess temporal resolutions (on first chunk)
            update_idx = match_temporal_resolutions(
                original_data,
                augment_data,
                temporal_resolution,
            )
            original_data_res = original_data.set_index(
                update_idx(original_data.index))

        # Match temporal resolutions
        augment_data.index = update_idx(augment_data.index)

        # Filter columns
        if drop_columns:
            augment_data = augment_data.drop(drop_columns, axis=1)

        # Join
        joined_chunk = original_data_res.join(augment_data,
                                              how=how,
                                              rsuffix='_r')

        # Drop the join columns we set as index
        joined_chunk.reset_index(drop=True, inplace=True)

        join_.append(joined_chunk)

    join_ = pd.concat(join_)
    logger.info("Join completed in %.4fs", time.perf_counter() - start)

    intersection = set(original_data.columns).intersection(
        set(first_augment_data.columns))

    # qualities
    qualities_list = []

    # map column names for the augmentation data
    augment_columns_map = {
        name: name + '_r' if name in intersection else name
        for name in first_augment_data.columns
    }

    # aggregations
    join_ = perform_aggregations(
        join_,
        list(original_data.columns),
        agg_functions,
        augment_columns_map,
    )

    # drop unique index
    join_.drop([UNIQUE_INDEX_KEY], axis=1, inplace=True)

    original_columns_set = set(original_data.columns)
    new_columns = [
        col for col in join_.columns if col not in original_columns_set
    ]
    qualities_list.append(
        dict(qualName='augmentation_info',
             qualValue=dict(new_columns=new_columns,
                            removed_columns=[],
                            nb_rows_before=original_data.shape[0],
                            nb_rows_after=join_.shape[0],
                            augmentation_type='join'),
             qualValueType='dict'))

    with WriteCounter(writer.open_file('w')) as fout:
        join_.to_csv(fout, index=False, line_terminator='\r\n')
        size = fout.size

    # Build a dict of information about all columns
    columns_metadata = dict()
    for column in augment_metadata['columns']:
        for agg in itertools.chain(
            (None, ),
                AGGREGATION_FUNCTIONS,
        ):
            for name in (column['name'], column['name'] + '_r'):
                column_metadata = {
                    k: v
                    for k, v in column.items() if k in KEEP_COLUMN_FIELDS
                }
                if agg is not None:
                    name = agg + ' ' + name
                column_metadata['name'] = name
                if agg in {'sum', 'mean'}:
                    column_metadata['structural_type'] = types.FLOAT
                    column_metadata['semantic_types'] = []
                elif agg == 'count':
                    column_metadata['structural_type'] = types.INTEGER
                    column_metadata['semantic_types'] = []
                columns_metadata[name] = column_metadata
    for column in original_metadata['columns']:
        columns_metadata[column['name']] = column

    # Then construct column metadata by looking them up in the dict
    columns_metadata = [columns_metadata[name] for name in join_.columns]

    return {
        'columns': columns_metadata,
        'size': size,
        'qualities': qualities_list,
    }
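The _tree_nearest(tree, max_dist) helper used for the spatial join above is not shown in this example; a hypothetical sketch of such a transform, built only on the KDTree.query API, might look like the following (the snapping behaviour is an assumption, not the actual implementation):

import numpy as np
import pandas as pd

def _tree_nearest(tree, max_dist):
    # Hypothetical sketch: return a transform that snaps each coordinate pair
    # to the nearest point indexed in `tree` when it lies within max_dist, and
    # leaves it unchanged otherwise. Assumes the two columns are numeric.
    reference = np.asarray(tree.data)

    def transform(coords):
        values = coords.to_numpy(dtype=float)
        dist, idx = tree.query(values, k=1)
        snapped = reference[idx[:, 0]]
        too_far = dist[:, 0] > max_dist
        snapped[too_far] = values[too_far]
        return pd.DataFrame(snapped, index=coords.index, columns=coords.columns)

    return transform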
Example #14
    def fit(self, X, y):
        if Version(sklearn_version) >= Version("1.0"):
            self._check_feature_names(X, reset=True)
        if self.metric_params is not None and 'p' in self.metric_params:
            if self.p is not None:
                warnings.warn(
                    "Parameter p is found in metric_params. "
                    "The corresponding parameter from __init__ "
                    "is ignored.",
                    SyntaxWarning,
                    stacklevel=2)
            self.effective_metric_params_ = self.metric_params.copy()
            effective_p = self.metric_params["p"]
        else:
            self.effective_metric_params_ = {}
            effective_p = self.p

        if self.metric in ["minkowski"]:
            if effective_p < 1:
                raise ValueError(
                    "p must be greater or equal to one for minkowski metric")
            self.effective_metric_params_["p"] = effective_p

        self.effective_metric_ = self.metric
        # For minkowski distance, use more efficient methods where available
        if self.metric == "minkowski":
            p = self.effective_metric_params_.pop("p", 2)
            if p < 1:
                raise ValueError(
                    "p must be greater or equal to one for minkowski metric")
            if p == 1:
                self.effective_metric_ = "manhattan"
            elif p == 2:
                self.effective_metric_ = "euclidean"
            elif p == np.inf:
                self.effective_metric_ = "chebyshev"
            else:
                self.effective_metric_params_["p"] = p

        if self.metric == "manhattan":
            self.p = 1

        if not isinstance(X, (KDTree, BallTree, sklearn_NeighborsBase)):
            self._fit_X = _check_array(X,
                                       dtype=[np.float64, np.float32],
                                       accept_sparse=True)
            self.n_samples_fit_ = _num_samples(self._fit_X)
            self.n_features_in_ = _num_features(self._fit_X)

            if self.algorithm == "auto":
                # A tree approach is better for a small number of neighbors or a
                # small number of features, with KDTree generally faster when available
                is_n_neighbors_valid_for_brute = self.n_neighbors is not None and \
                    self.n_neighbors >= self._fit_X.shape[0] // 2
                if self._fit_X.shape[1] > 15 or is_n_neighbors_valid_for_brute:
                    self._fit_method = "brute"
                else:
                    if self.effective_metric_ in VALID_METRICS["kd_tree"]:
                        self._fit_method = "kd_tree"
                    elif callable(self.effective_metric_) or \
                        self.effective_metric_ in \
                            VALID_METRICS["ball_tree"]:
                        self._fit_method = "ball_tree"
                    else:
                        self._fit_method = "brute"
            else:
                self._fit_method = self.algorithm

        if hasattr(self, '_onedal_estimator'):
            delattr(self, '_onedal_estimator')
        # To cover the test case where a patched
        # estimator is passed as input to another estimator
        if isinstance(X, sklearn_NeighborsBase):
            self._fit_X = X._fit_X
            self._tree = X._tree
            self._fit_method = X._fit_method
            self.n_samples_fit_ = X.n_samples_fit_
            self.n_features_in_ = X.n_features_in_
            if hasattr(X, '_onedal_estimator'):
                if self._fit_method == "ball_tree":
                    X._tree = BallTree(
                        X._fit_X,
                        self.leaf_size,
                        metric=self.effective_metric_,
                        **self.effective_metric_params_,
                    )
                elif self._fit_method == "kd_tree":
                    X._tree = KDTree(
                        X._fit_X,
                        self.leaf_size,
                        metric=self.effective_metric_,
                        **self.effective_metric_params_,
                    )
                elif self._fit_method == "brute":
                    X._tree = None
                else:
                    raise ValueError("algorithm = '%s' not recognized" %
                                     self.algorithm)

        elif isinstance(X, BallTree):
            self._fit_X = X.data
            self._tree = X
            self._fit_method = 'ball_tree'
            self.n_samples_fit_ = X.data.shape[0]
            self.n_features_in_ = X.data.shape[1]

        elif isinstance(X, KDTree):
            self._fit_X = X.data
            self._tree = X
            self._fit_method = 'kd_tree'
            self.n_samples_fit_ = X.data.shape[0]
            self.n_features_in_ = X.data.shape[1]

        dispatch(
            self, 'neighbors.KNeighborsClassifier.fit', {
                'onedal': self.__class__._onedal_fit,
                'sklearn': sklearn_KNeighborsClassifier.fit,
            }, X, y)
        return self
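As the isinstance branches above show, a prebuilt KDTree (or BallTree) can be passed to fit() in place of the raw training data; a minimal sketch of that pattern with scikit-learn's stock NearestNeighbors (illustrative data):

import numpy as np
from sklearn.neighbors import KDTree, NearestNeighbors

rng = np.random.RandomState(0)
X = rng.random_sample((100, 3))

# Build the index once and hand it to the estimator, which then reuses it
# (the branch above sets self._tree = X and self._fit_method = 'kd_tree').
tree = KDTree(X, leaf_size=30)
nn = NearestNeighbors(n_neighbors=5).fit(tree)
dist, ind = nn.kneighbors(X[:3])
print(ind.shape)    # (3, 5)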