Example #1
    def fit(self,
            descriptors: Iterable[DescriptorElement],
            use_multiprocessing: bool = True) -> np.ndarray:
        """
        Fit the ITQ model given the input set of descriptors

        :param descriptors: Iterable of ``DescriptorElement`` vectors to fit
            the model to.
        :param use_multiprocessing: If multiprocessing should be used, as
            opposed to threading, for collecting descriptor vectors from the
            provided iterable.

        :raises RuntimeError: There is already a model loaded

        :return: Matrix hash code vectors (boolean-typed) for provided
            descriptors in order.

        """
        if self.has_model():
            raise RuntimeError("Model components have already been loaded.")

        dbg_report_interval = None
        pr = None
        if LOG.getEffectiveLevel() <= logging.DEBUG:
            dbg_report_interval = 1.0  # seconds
            pr = ProgressReporter(LOG.debug, dbg_report_interval).start()
        if not hasattr(descriptors, "__len__"):
            LOG.info("Creating sequence from iterable")
            descriptors_l = []
            for d in descriptors:
                descriptors_l.append(d)
                dbg_report_interval and pr.increment_report()  # type: ignore
            dbg_report_interval and pr.report()  # type: ignore
            descriptors = descriptors_l
        LOG.info("Creating matrix of descriptors for fitting")
        x = np.asarray(
            list(
                parallel_map(lambda d_: d_.vector(),
                             descriptors,
                             use_multiprocessing=use_multiprocessing)))
        LOG.debug("descriptor matrix shape: %s", x.shape)
        n, dim = x.shape

        LOG.debug("Generating random projections")
        np.random.seed(self.random_seed)
        self.rps = np.random.randn(dim, self.bit_length)

        LOG.debug(
            f"Normalizing descriptors with norm type: {self.normalize}")
        return self.get_hash(x)
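
The trailing ``get_hash`` call is not shown in this example. As a rough, hypothetical sketch only (the real implementation may differ), a random-projection hash of this kind typically projects the descriptor matrix onto the random basis and thresholds at zero to produce boolean codes:

def get_hash_sketch(x: np.ndarray, rps: np.ndarray) -> np.ndarray:
    # Hypothetical illustration of a sign-of-random-projection hash.
    # x: [N x dim] descriptor matrix; rps: [dim x bit_length] random basis.
    # Positive projections become True bits, the rest False.
    return np.dot(x, rps) >= 0.0
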
Example #2
    def perturb(self, ref_image: np.ndarray) -> np.ndarray:
        input_size = np.shape(ref_image)[:2]
        num_masks = self.n
        grid = self.grid
        s = self.s
        shift_rng = np.random.default_rng(self.seed)
        # Shape format: [H x W], Inherits from `input_size`
        cell_size = np.ceil(np.array(input_size) / s)
        up_size = (s + 1) * cell_size

        masks = np.empty((num_masks, *input_size), dtype=grid.dtype)

        # Pre-extract values to avoid repeated indexing in the per-mask loop.
        cell_h, cell_w = cell_size[:2]
        input_h, input_w = input_size[:2]

        def work_func(i_: int) -> np.ndarray:
            # Random shifts
            y = shift_rng.integers(0, cell_h)
            x = shift_rng.integers(0, cell_w)
            mask = resize(grid[i_],
                          up_size,
                          order=1,
                          mode='reflect',
                          anti_aliasing=False)[y:y + input_h, x:x + input_w]
            return mask

        threads = self.threads
        if threads is None or threads < 1:
            for i in range(num_masks):
                masks[i, ...] = work_func(i)
        else:
            for i, m in enumerate(
                    parallel_map(
                        work_func,
                        range(num_masks),
                        cores=threads,
                        use_multiprocessing=False,
                    )):
                masks[i, ...] = m

        return masks
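
The per-mask step above can be reproduced standalone. This sketch assumes ``resize`` is ``skimage.transform.resize`` (an assumption; the example does not show the import) and uses a small random grid in place of ``self.grid``:

import numpy as np
from skimage.transform import resize  # assumed source of ``resize``

rng = np.random.default_rng(0)
s = 8                                    # cells per side of the low-res grid
input_size = (224, 224)                  # (H, W) of the reference image
cell_size = np.ceil(np.array(input_size) / s)
up_size = ((s + 1) * cell_size).astype(int)

grid_cell = rng.integers(0, 2, (s, s)).astype(np.float32)  # one low-res mask
# Upsample with bilinear interpolation, then crop with a random sub-cell shift
# so the cell boundaries do not always land on the same pixels.
y = rng.integers(0, int(cell_size[0]))
x = rng.integers(0, int(cell_size[1]))
mask = resize(grid_cell, up_size, order=1, mode='reflect',
              anti_aliasing=False)[y:y + input_size[0], x:x + input_size[1]]
assert mask.shape == input_size
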
Example #3
    def perturb(self, ref_image: PIL.Image.Image) -> np.ndarray:
        input_size = (ref_image.height, ref_image.width)
        num_masks = self.n
        grid = self.grid
        s = self.s
        shift_rng = np.random.default_rng(self.seed)
        cell_size = np.ceil(np.array(input_size) / s)
        up_size = (s + 1) * cell_size

        masks = np.empty((num_masks, *input_size), dtype=grid.dtype)

        def work_func(i_: int) -> np.ndarray:
            # Random shifts
            x = shift_rng.integers(0, cell_size[0])
            y = shift_rng.integers(0, cell_size[1])
            mask = resize(grid[i_],
                          up_size,
                          order=1,
                          mode='reflect',
                          anti_aliasing=False)[x:x + input_size[0],
                                               y:y + input_size[1]]
            return mask

        threads = self.threads
        if threads is None or threads < 1:
            for i in range(num_masks):
                masks[i, ...] = work_func(i)
        else:
            for i, m in enumerate(
                    parallel_map(
                        work_func,
                        range(num_masks),
                        cores=threads,
                        use_multiprocessing=False,
                    )):
                masks[i, ...] = m

        return masks
Example #4
    def _generate_arrays(
            self, data_iter: Iterable[DataElement]) -> Iterable[numpy.ndarray]:
        """
        Inner template method that defines the generation of descriptor vectors
        for a given iterable of data elements.

        Pre-conditions:
          - Data elements input to this method have been validated to be of at
            least one of this class's reported ``valid_content_types``.

        :param collections.abc.Iterable[DataElement] data_iter:
            Iterable of data element instances to be described.

        :raises RuntimeError: Descriptor extraction failure of some kind.

        :return: Iterable of numpy arrays in parallel association with the
            input data elements.
        :rtype: collections.abc.Iterable[numpy.ndarray]
        """
        assert self.network is not None, (
            "A network should be initialized by now.")

        self._set_caffe_mode()
        log_debug = LOG.debug

        # Start parallel operation to pre-process imagery before aggregating
        # for network execution.
        # TODO: update ``buffer_factor`` param to account for batch size?
        img_array_iter = \
            parallel_map(_process_load_img_array,
                         zip(
                             data_iter, itertools.repeat(self.transformer),
                             itertools.repeat(self.data_layer),
                             itertools.repeat(self.load_truncated_images),
                             itertools.repeat(self.pixel_rescale),
                         ),
                         ordered=True, cores=self.threads)

        # Aggregate and process batches of input data elements
        #: :type: list[numpy.ndarray]
        batch_img_arrays = \
            list(itertools.islice(img_array_iter, self.batch_size))
        batch_i = 0
        while len(batch_img_arrays) > 0:
            cur_batch_size = len(batch_img_arrays)
            log_debug("Batch {} - size {}".format(batch_i, cur_batch_size))

            log_debug("Updating network data layer shape ({} images)".format(
                cur_batch_size))
            self.network.blobs[self.data_layer].reshape(
                cur_batch_size, *self.net_data_shape[1:4])
            log_debug(
                "Loading image matrices into network layer '{:s}'".format(
                    self.data_layer))
            self.network.blobs[self.data_layer].data[...] = batch_img_arrays
            log_debug("Moving network forward")
            self.network.forward()
            descriptor_list = self.network.blobs[self.return_layer].data
            log_debug("extracting return layer '{:s}' into vectors".format(
                self.return_layer))
            for v in descriptor_list:
                if v.ndim > 1:
                    # In case caffe generates multidimensional array
                    # like (rows, 1, 1)
                    log_debug("- Raveling output array of shape {}".format(
                        v.shape))
                    yield numpy.ravel(v)
                else:
                    yield v

            # Slice out the next batch
            #: :type: list[(collections.abc.Hashable, numpy.ndarray)]
            batch_img_arrays = \
                list(itertools.islice(img_array_iter, self.batch_size))
            batch_i += 1
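
The batching idiom here, repeatedly taking fixed-size slices from a (possibly lazy) iterator with ``itertools.islice``, is independent of Caffe; a minimal standalone sketch:

import itertools

def iter_batches(iterable, batch_size):
    """Yield lists of up to ``batch_size`` items until the iterable is exhausted."""
    it = iter(iterable)
    batch = list(itertools.islice(it, batch_size))
    while batch:
        yield batch
        batch = list(itertools.islice(it, batch_size))

# list(iter_batches(range(7), 3)) == [[0, 1, 2], [3, 4, 5], [6]]
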
Example #5
    def fit(self,
            descriptors: Iterable[DescriptorElement],
            use_multiprocessing: bool = True) -> np.ndarray:
        """
        Fit the ITQ model given the input set of descriptors.

        :param descriptors: Iterable of ``DescriptorElement`` vectors to fit
            the model to.
        :type descriptors:
            collections.abc.Iterable[smqtk.representation.DescriptorElement]

        :param use_multiprocessing: If multiprocessing should be used, as
            opposed to threading, when collecting descriptor elements from the
            given iterable.
        :type use_multiprocessing: bool

        :raises RuntimeError: There is already a model loaded

        :return: Matrix hash codes (boolean-valued) for provided descriptors in
            parallel order to input descriptors.
        :rtype: numpy.ndarray[bool]

        """
        if self.has_model():
            raise RuntimeError("Model components have already been loaded.")

        dbg_report_interval = 1.0
        dbg_report = LOG.getEffectiveLevel() <= logging.DEBUG
        if not isinstance(descriptors, Sequence):
            LOG.info("Creating sequence from iterable")
            descriptors_l = []
            pr = ProgressReporter(LOG.debug, dbg_report_interval).start()
            for d in descriptors:
                descriptors_l.append(d)
                dbg_report and pr.increment_report()  # type: ignore
            dbg_report and pr.report()  # type: ignore
            descriptors = descriptors_l
        if len(descriptors[0].vector()) < self.bit_length:
            raise ValueError("Input descriptors have fewer features than "
                             "requested bit encoding. Hash codes will be "
                             "smaller than requested due to PCA decomposition "
                             "result being bound by number of features.")

        LOG.info("Creating matrix of descriptors for fitting")
        x = np.asarray(
            list(
                parallel_map(lambda d_: d_.vector(),
                             descriptors,
                             use_multiprocessing=use_multiprocessing)))
        LOG.debug("descriptor matrix shape: %s", x.shape)

        LOG.debug(f"Info normalizing descriptors by factor: {self.normalize}")
        x = self._norm_vector(x)

        LOG.info("Centering data")
        self.mean_vec = np.mean(x, axis=0)
        x -= self.mean_vec

        LOG.info("Computing PCA transformation")
        LOG.debug("-- computing covariance")
        # ``cov`` wants each row to be a feature and each column an observation
        # of those features. Thus, each column should be a descriptor vector,
        # thus we need the transpose here.
        c = np.cov(x.transpose())

        # Direct translation from UNC matlab code
        # - eigen vectors are the columns of ``pc``
        LOG.debug('-- computing linalg.eig')
        l, pc = np.linalg.eig(c)
        LOG.debug('-- ordering eigen vectors by descending eigen value')

        # # Harry translation of original matlab code
        # # - Uses singular values / vectors, not eigen
        # # - singular vectors are the columns of pc
        # LOG.debug('-- computing linalg.svd')
        # pc, l, _ = np.linalg.svd(c)
        # LOG.debug('-- ordering singular vectors by descending '
        #                 'singular value')

        # Same ordering method for both eig/svd sources.
        l_pc_ordered = sorted(zip(l, pc.transpose()),
                              key=lambda _p: _p[0],
                              reverse=True)

        LOG.debug("-- top vector extraction")
        # Only keep the top ``bit_length`` vectors after ordering by descending
        # value magnitude.
        # - Transposing vectors back to column-vectors.
        pc_top = np.array([p[1] for p in l_pc_ordered[:self.bit_length]])\
            .transpose()
        LOG.debug("-- project centered data by PC matrix")
        v = np.dot(x, pc_top)

        LOG.info("Performing ITQ to find optimal rotation")
        c, self.rotation = self._find_itq_rotation(v, self.itq_iterations)
        # De-adjust rotation with PC vector
        self.rotation = np.dot(pc_top, self.rotation)

        self.save_model()

        return c
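
The PCA portion of this routine (covariance, eigendecomposition, ordering by descending eigenvalue, keeping the top ``bit_length`` components as column vectors) can be condensed into a small standalone sketch:

import numpy as np

def top_principal_components(x: np.ndarray, k: int) -> np.ndarray:
    """Top-k principal components of row-vector data ``x`` as a [dim x k]
    column-vector matrix, mirroring the steps above."""
    x_c = x - x.mean(axis=0)                # center the data
    c = np.cov(x_c.T)                       # cov() expects observations as columns
    eig_vals, eig_vecs = np.linalg.eig(c)   # eigenvectors are the columns
    order = np.argsort(eig_vals)[::-1]      # descending eigenvalue order
    # np.linalg.eigh is an alternative here since the covariance is symmetric.
    return eig_vecs[:, order[:k]]
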
Example #6
    def _classify_arrays(self, array_iter):
        if not self.has_model():
            raise RuntimeError("No SVM model present for classification")
        assert self.svm_model is not None, (
            "Should have an SVM model at this point.")

        # Dump descriptors into a matrix for normalization and use in
        # prediction.
        vec_mat = numpy.array(list(array_iter))
        vec_mat = self._norm_vector(vec_mat)
        n_jobs = self.n_jobs
        if n_jobs is not None:
            n_jobs = min(len(vec_mat), n_jobs)
        # Else: `n_jobs` is `None`, which is OK as it's the default value for
        # parallel_map.

        svm_label_map = self.svm_label_map
        c_base = dict((la, 0.) for la in svm_label_map.values())

        # Effectively reproducing the body of svmutil.svm_predict in order to
        # simplify and get around excessive prints
        svm_type = self.svm_model.get_svm_type()
        nr_class = self.svm_model.get_nr_class()
        # Model internal labels. Parallel to ``prob_estimates`` array.
        svm_model_labels = self.svm_model.get_labels()

        # TODO: Normalize input arrays in batch(es). TEST if current norm
        #       function can just take a matrix?

        if self.svm_model.is_probability_model():
            # noinspection PyUnresolvedReferences
            if svm_type in [svm.NU_SVR, svm.EPSILON_SVR]:
                nr_class = 0

            def single_pred(v):
                prob_estimates = (ctypes.c_double * nr_class)()
                v, idx = svm.gen_svm_nodearray(v.tolist())
                svm.libsvm.svm_predict_probability(self.svm_model, v,
                                                   prob_estimates)
                c = dict(c_base)  # Shallow copy
                c.update({
                    svm_label_map[label]: prob
                    for label, prob in zip(svm_model_labels,
                                           prob_estimates[:nr_class])
                })
                return c

            # If n_jobs == 1, just be serial
            if n_jobs == 1:
                return (single_pred(v) for v in vec_mat)
            else:
                return parallel_map(single_pred,
                                    vec_mat,
                                    cores=n_jobs,
                                    use_multiprocessing=True)

        else:
            # noinspection PyUnresolvedReferences
            if svm_type in (svm.ONE_CLASS, svm.EPSILON_SVR, svm.NU_SVC):
                nr_classifier = 1
            else:
                nr_classifier = nr_class * (nr_class - 1) // 2

            def single_label(v):
                dec_values = (ctypes.c_double * nr_classifier)()
                v, idx = svm.gen_svm_nodearray(v.tolist())
                label = svm.libsvm.svm_predict_values(self.svm_model, v,
                                                      dec_values)
                c = dict(c_base)  # Shallow copy
                c[svm_label_map[label]] = 1.
                return c

            # If n_jobs == 1, just be serial
            if n_jobs == 1:
                return (single_label(v) for v in vec_mat)
            else:
                return parallel_map(single_label,
                                    vec_mat,
                                    cores=n_jobs,
                                    use_multiprocessing=True)
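
In the decision-value branch, ``nr_classifier`` counts the one-vs-one classifiers of a multi-class SVM, which is what sizes the ``dec_values`` buffer; a worked example:

# Pairwise (one-vs-one) decision value count used above:
nr_class = 4
nr_classifier = nr_class * (nr_class - 1) // 2   # 4 * 3 // 2 == 6
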
Example #7
    def _build_index(self, descriptors: Iterable[DescriptorElement]) -> None:
        """
        Internal method to be implemented by sub-classes to build the index
        with the given descriptor data elements.

        Subsequent calls to this method should rebuild the current index.  This
        method shall not add to the existing index nor raise an exception, so
        as to protect the current index.

        Implementation Notes:
            - We keep a cache file serialization around for our index in case
                sub-processing occurs so as to be able to recover from the
                underlying C data not being there. This could cause issues if
                a main or child process rebuilds the index, as we clear the
                old cache away.

        :param descriptors: Iterable of descriptor elements to build index
            over.
        :type descriptors:
            collections.abc.Iterable[smqtk.representation.DescriptorElement]

        """
        with self._model_lock:
            # Not caring about restoring the index because we're just making a
            # new one.
            LOG.info("Building new FLANN index")

            LOG.debug("Caching descriptor elements")
            self._descr_cache = list(descriptors)
            # Cache descriptors if we have an element
            if self._descr_cache_elem and self._descr_cache_elem.writable():
                LOG.debug(f"Caching descriptors: {self._descr_cache_elem}")
                self._descr_cache_elem.set_bytes(
                    pickle.dumps(self._descr_cache, -1))

            params = {
                "target_precision":
                self._build_target_precision,
                "sample_fraction":
                self._build_sample_frac,
                "log_level":
                ("info"
                 if LOG.getEffectiveLevel() <= logging.DEBUG else "warning")
            }
            if self._build_autotune:
                params['algorithm'] = "autotuned"
            if self._rand_seed is not None:
                params['random_seed'] = self._rand_seed
            pyflann.set_distance_type(self._distance_method)

            LOG.debug("Accumulating descriptor vectors into matrix for FLANN")
            pts_array = numpy.asarray(
                list(parallel_map(lambda d_: d_.vector(), self._descr_cache)))

            LOG.debug('Building FLANN index')
            self._flann = pyflann.FLANN()
            self._flann_build_params = self._flann.build_index(
                pts_array, **params)
            del pts_array

            if self._index_elem and self._index_elem.writable():
                LOG.debug("Caching index: %s", self._index_elem)
                # FLANN wants to write to a file, so make a temp file, then
                # read it in, putting bytes into element.
                fd, fp = tempfile.mkstemp()
                try:
                    self._flann.save_index(fp)
                    # Use the file descriptor to create the file object.
                    # This avoids reopening the file and will automatically
                    # close the file descriptor on exiting the with block.
                    # fdopen() is required because in Python 2 open() does
                    # not accept a file descriptor.
                    with os.fdopen(fd, 'rb') as f:
                        self._index_elem.set_bytes(f.read())
                finally:
                    os.remove(fp)
            if self._index_param_elem and self._index_param_elem.writable():
                LOG.debug(f"Caching index params: {self._index_param_elem}")
                state = {
                    'b_autotune': self._build_autotune,
                    'b_target_precision': self._build_target_precision,
                    'b_sample_frac': self._build_sample_frac,
                    'distance_method': self._distance_method,
                    'flann_build_params': self._flann_build_params,
                }
                self._index_param_elem.set_bytes(pickle.dumps(state, -1))

            self._pid = multiprocessing.current_process().pid
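
The temp-file handling used when caching the FLANN index (let the library write to a path, read the bytes back through the still-open descriptor, always remove the file) is a general pattern for libraries that only save to filenames; a minimal sketch independent of FLANN:

import os
import tempfile

def bytes_via_temp_file(save_to_path) -> bytes:
    """Call ``save_to_path(path)`` for a library that can only write to a file
    path and return the bytes it wrote, cleaning up the temp file afterwards."""
    fd, fp = tempfile.mkstemp()
    try:
        save_to_path(fp)
        # Reuse the descriptor from mkstemp instead of reopening the path; the
        # with-block closes it automatically.
        with os.fdopen(fd, 'rb') as f:
            return f.read()
    finally:
        os.remove(fp)
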
Example #8
def occlude_image_streaming(
    ref_image: np.ndarray,
    masks: Iterable[np.ndarray],
    fill: Optional[Union[int, Sequence[int], np.ndarray]] = None,
    threads: Optional[int] = None,
) -> Generator[np.ndarray, None, None]:
    """
    Apply a number of input occlusion masks to the given reference image,
    producing a generator of images equal in number, and parallel in order, to
    the input masks.

    We expect the "mask" matrices and the image to be the same height and
    width, and for the mask matrix values to be in the [0, 1] range.
    In the mask matrix, values closer to 1 correspond to regions of the image
    that should *NOT* be occluded.
    E.g. a 0 in the mask will translate to *fully* occluding the corresponding
    location in the source image.

    We optionally take in a "fill" that is alpha-blended into masked regions of
    the input `ref_image`.
    `fill` may be either a scalar, sequence of scalars, or another image matrix
    congruent in shape to the `ref_image`.
    When `fill` is a scalar or a sequence of scalars, the scalars should be in
    the same data-type and value range as the input image.
    A sequence of scalars should be the same length as there are channels in
    the `ref_image`.
    When `fill` is an image matrix it should follow the format of `[H x W]` or
    `[H x W x C]`, should be in the same dtype and value range as `ref_image`
    and should match the same number of channels if channels are provided.
    When no fill is passed, black is used (default absence of color).

    Images output will mirror the input image format. As such, the `fill` value
    passed must be compatible with the input image channels for broadcasting.
    For example, a single channel input will not be able to be broadcast
    against a multi-channel `fill` input. A ValueError will be raised by the
    underlying numpy call in such cases.

    Assumptions:
      * Mask input is per-pixel. Does not accept per-channel masks.
      * Fill value input is in an applicable value range supported by the input
        image format, which is mirrored in output images.

    :param ref_image: Original base image
    :param masks: Mask images in the [N, Height, Width] shape format.
    :param fill: Optional fill for alpha-blending based on the input masks for
        the occluded regions as a scalar value, a per-channel sequence or a
        shape-matched image.
    :param threads: Optional number of threads to use for parallelism when set
        to a positive integer. If 0, a negative value, or `None`, work will be
        performed on the main-thread in-line.

    :raises ValueError: One or more input masks in the input iterable did not
        match shape of the input reference image.

    :return: A generator of numpy array masked images.
    """
    # Just the [H x W] component.
    img_shape = ref_image.shape[:2]
    s: Tuple = (..., )
    if ref_image.ndim > 2:
        s = (..., None)  # add channel axis for multiplication

    def work_func(i_: int, m: np.ndarray) -> np.ndarray:
        m_shape = m.shape
        if m_shape != img_shape:
            raise ValueError(
                f"Input mask (position {i_}) did not match the shape of the "
                f"input image: {m_shape} != {img_shape}")
        img_m = np.empty_like(ref_image)
        if fill is not None:
            np.add((m[s] * ref_image), ((UINT8_ONE - m[s]) * fill),
                   out=img_m,
                   casting="unsafe")
        else:
            np.multiply(m[s], ref_image, out=img_m, casting="unsafe")
        return img_m

    if threads is None or threads < 1:
        for i, mask in enumerate(masks):
            yield work_func(i, mask)
    else:
        for img in parallel_map(
                work_func,
                itertools.count(),
                masks,
                cores=threads,
                use_multiprocessing=False,
        ):
            yield img
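
A minimal usage sketch for the function above, assuming it is importable together with its module-level dependencies (``parallel_map``, ``UINT8_ONE``); the image and masks here are synthetic:

import numpy as np

ref_image = np.random.default_rng(0).integers(0, 256, (64, 64, 3), dtype=np.uint8)
masks = np.random.default_rng(1).random((10, 64, 64)).astype(np.float32)

# Default fill is black; a scalar fill alpha-blends mid-gray into occluded areas.
occluded_black = list(occlude_image_streaming(ref_image, masks))
occluded_gray = list(occlude_image_streaming(ref_image, masks, fill=127, threads=4))
assert len(occluded_black) == len(masks)
assert occluded_black[0].shape == ref_image.shape
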
Example #9
    def _nn(
        self,
        d: DescriptorElement,
        n: int = 1
    ) -> Tuple[Tuple[DescriptorElement, ...], Tuple[float, ...]]:
        """
        Internal method to be implemented by sub-classes to return the nearest
        `N` neighbors to the given descriptor element.

        When this internal method is called, we have already checked that there
        is a vector in ``d`` and our index is not empty.

        :param d: Descriptor element to compute the neighbors of.
        :param n: Number of nearest neighbors to find.

        :return: Tuple of nearest N DescriptorElement instances, and a tuple of
            the distance values to those neighbors.

        """
        LOG.debug("generating hash for descriptor")
        d_v = d.vector()
        d_h = self.lsh_functor.get_hash(d_v)

        def comp_descr_dist(d2_v: numpy.ndarray) -> float:
            return self._distance_function(d_v, d2_v)

        with self._model_lock:
            LOG.debug("getting near hashes")
            hi = self.hash_index
            if hi is None:
                # Make on-the-fly linear index
                hi = LinearHashIndex()
                # not calling ``build_index`` because we already have the int
                # hashes.
                hi.index = set(cast(Iterator[int], self.hash2uuids_kvstore.keys()))
            near_hashes, _ = hi.nn(d_h, n)

            LOG.debug("getting UUIDs of descriptors for nearby hashes")
            neighbor_uuids: List[Hashable] = []
            for h_int in map(bit_vector_to_int_large, near_hashes):
                # If descriptor hash not in our map, we effectively skip it.
                # Get set of descriptor UUIDs for a hash code.
                near_uuids: Set[Hashable] = self.hash2uuids_kvstore.get(h_int, set())
                # Accumulate matching descriptor UUIDs to a list.
                neighbor_uuids.extend(near_uuids)
            LOG.debug("-- matched %d UUIDs", len(neighbor_uuids))

            LOG.debug("getting descriptors for neighbor_uuids")
            neighbors = \
                list(self.descriptor_set.get_many_descriptors(neighbor_uuids))

        # Done with model parts at this point, so releasing lock.

        LOG.debug(f"ordering descriptors via distance method {self.distance_method}")
        LOG.debug('-- getting element vectors')
        neighbor_vectors = numpy.asarray(list(
            parallel_map(lambda d_: d_.vector(), neighbors)
        ))
        LOG.debug('-- calculating distances')
        distances = list(map(comp_descr_dist, neighbor_vectors))
        LOG.debug('-- ordering')
        ordered = sorted(zip(neighbors, distances),
                         key=lambda p: p[1])
        LOG.debug(f'-- slicing top n={n}')
        r_descrs: Tuple[DescriptorElement, ...]
        r_dists: Tuple[float, ...]
        r_descrs, r_dists = zip(*(ordered[:n]))
        return r_descrs, r_dists
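
The final ordering step, pairing each neighbor with its distance, sorting by distance, slicing the top ``n`` and unzipping back into two tuples, is plain Python; a tiny self-contained sketch:

neighbors = ['a', 'b', 'c', 'd']
distances = [0.9, 0.1, 0.5, 0.3]
n = 2

ordered = sorted(zip(neighbors, distances), key=lambda p: p[1])
r_descrs, r_dists = zip(*ordered[:n])
# r_descrs == ('b', 'd'); r_dists == (0.1, 0.3)
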
Example #10
    def _build_multiple_trees(self, chunk_size: int = CHUNK_SIZE) -> None:
        """
        Build an MRPT structure
        """
        sample = next(self._descriptor_set.iterdescriptors())
        sample_v = sample.vector()
        if sample_v is None:
            raise RuntimeError(
                "Sample descriptor element from the current set had no vector "
                "content!")
        n = self.count()
        d = sample_v.size
        leaf_size = n / (1 << self._depth)

        nt = self._num_trees
        depth = self._depth
        LOG.debug(f"Building {nt} trees (T) of depth {depth} (l) "
                  f"from {n:g} descriptors (N) of length {d:g}")
        LOG.debug(
            f"Leaf size             (L = N/2^l)  ~ {n:g}/2^{depth:d} = {leaf_size:g}"
        )
        LOG.debug(
            f"UUIDs stored                (T*N)  = {nt:g} * {n:g} = {nt*n:g}")
        LOG.debug(
            f"Examined UUIDs              (T*L)  ~ {nt:g} * {leaf_size:g} = {nt*leaf_size:g}"
        )
        LOG.debug(
            f"Examined/DB size  (T*L/N = T/2^l)  ~ {nt*leaf_size}/{n} = {nt*leaf_size/n:.3f}"
        )

        if (1 << self._depth) > n:
            LOG.warning(
                f"There are insufficient elements ({n:d} < 2^{depth:d}) to "
                f"populate all the leaves of the tree. Consider lowering the "
                f"depth parameter.")

        LOG.debug("Projecting onto random bases")
        # Build all the random bases and the projections at the same time
        # (_num_trees * _depth shouldn't really be that high -- if it is,
        # you're a monster)
        rs = np.random.RandomState()
        if self._rand_seed is not None:
            rs.seed(self._rand_seed)
        random_bases = rs.randn(self._num_trees, d, self._depth)
        projs = np.empty((n, self._num_trees, self._depth), dtype=np.float64)
        # Load the data in chunks (because n * d IS high)
        pts_array = np.empty((chunk_size, d), sample_v.dtype)
        # Enumerate the descriptors and div the index by the chunk size
        # (causes each loop to only deal with at most chunk_size descriptors at
        # a time).
        for k, g in groupby(enumerate(self._descriptor_set.iterdescriptors()),
                            lambda pair: pair[0] // chunk_size):
            # Items are still paired so extract the descriptors
            chunk = list(desc for (i, desc) in g)
            # Take care of dangling end piece
            k_beg = k * chunk_size
            k_end = min((k + 1) * chunk_size, n)
            k_len = k_end - k_beg
            # Run the descriptors through elements_to_matrix
            # - Using slicing on pts_array due to g being <= chunk-size on the
            #   last chunk.
            pts_array[:len(chunk)] = list(
                parallel_map(lambda d_: d_.vector(),
                             chunk,
                             use_multiprocessing=self._use_multiprocessing))
            # Insert into projection matrix
            projs[k_beg:k_end] = pts_array[:k_len].dot(random_bases)
        del pts_array

        LOG.debug("Constructing trees")
        desc_ids = list(self._descriptor_set.keys())
        # Start with no trees
        self._trees = []
        for t in range(self._num_trees):
            # Array of splits is a packed tree
            splits = np.empty(((1 << self._depth) - 1, ), np.float64)

            LOG.debug(f"Constructing tree #{t+1}")

            # Build the tree & store it
            leaves = self._build_single_tree(projs[:, t], splits)
            leaves_ids = [[desc_ids[idx] for idx in cast(Iterable[int], leaf)]
                          for leaf in leaves]
            self._trees.append(
                TreeElement(
                    **{
                        'random_basis': (random_bases[t]),
                        'splits': splits,
                        'leaves': leaves_ids,
                    }))
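
The chunking idiom in the projection loop, ``groupby`` over ``enumerate`` keyed by integer division of the index, splits any iterable into consecutive fixed-size chunks without materializing it all at once; a standalone sketch:

from itertools import groupby

def iter_chunks(iterable, chunk_size):
    """Yield consecutive lists of up to ``chunk_size`` items."""
    for _, group in groupby(enumerate(iterable),
                            lambda pair: pair[0] // chunk_size):
        yield [item for _, item in group]

# list(iter_chunks(range(7), 3)) == [[0, 1, 2], [3, 4, 5], [6]]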