def fit(self, descriptors: Iterable[DescriptorElement], use_multiprocessing: bool = True) -> np.ndarray:
    """
    Fit the random-projection model given the input set of descriptors.

    :param descriptors: Iterable of ``DescriptorElement`` vectors to fit
        the model to.
    :param use_multiprocessing: If multiprocessing should be used, as
        opposed to threading, for collecting descriptor vectors from the
        provided iterable.

    :raises RuntimeError: There is already a model loaded.

    :return: Matrix of hash code vectors (boolean-typed) for the provided
        descriptors, in order.
    """
    if self.has_model():
        raise RuntimeError("Model components have already been loaded.")

    dbg_report_interval = None
    pr = None
    if LOG.getEffectiveLevel() <= logging.DEBUG:
        dbg_report_interval = 1.0  # seconds
        pr = ProgressReporter(LOG.debug, dbg_report_interval).start()
    if not hasattr(descriptors, "__len__"):
        LOG.info("Creating sequence from iterable")
        descriptors_l = []
        for d in descriptors:
            descriptors_l.append(d)
            dbg_report_interval and pr.increment_report()  # type: ignore
        dbg_report_interval and pr.report()  # type: ignore
        descriptors = descriptors_l

    LOG.info("Creating matrix of descriptors for fitting")
    x = np.asarray(list(
        parallel_map(lambda d_: d_.vector(), descriptors,
                     use_multiprocessing=use_multiprocessing)
    ))
    LOG.debug("descriptor matrix shape: %s", x.shape)
    n, dim = x.shape

    LOG.debug("Generating random projections")
    np.random.seed(self.random_seed)
    self.rps = np.random.randn(dim, self.bit_length)

    LOG.debug(f"Normalizing descriptors with norm type: {self.normalize}")
    return self.get_hash(x)
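# A minimal, self-contained sketch of the random-projection hashing that the
# ``fit`` above appears to set up: project descriptors onto a random Gaussian
# basis (``rps``) and threshold at zero to get boolean hash codes. This is an
# illustration only; the class's actual ``get_hash`` / normalization behavior
# may differ, and ``_rp_hash_sketch`` is a hypothetical helper, not part of
# the class.
import numpy as np


def _rp_hash_sketch(x: np.ndarray, bit_length: int, seed: int = 0) -> np.ndarray:
    """Hash row-vectors in ``x`` into ``bit_length``-bit boolean codes."""
    rng = np.random.RandomState(seed)
    rps = rng.randn(x.shape[1], bit_length)  # random projection basis
    return (x @ rps) >= 0.0                  # boolean hash codes


if __name__ == "__main__":
    demo = np.random.RandomState(1).rand(4, 128)
    print(_rp_hash_sketch(demo, bit_length=32).shape)  # (4, 32)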
def perturb(self, ref_image: np.ndarray) -> np.ndarray:
    input_size = np.shape(ref_image)[:2]
    num_masks = self.n
    grid = self.grid
    s = self.s
    shift_rng = np.random.default_rng(self.seed)

    # Shape format: [H x W], inherits from `input_size`.
    cell_size = np.ceil(np.array(input_size) / s)
    up_size = (s + 1) * cell_size

    masks = np.empty((num_masks, *input_size), dtype=grid.dtype)

    # Expanding index accesses for repetition efficiency.
    cell_h, cell_w = cell_size[:2]
    input_h, input_w = input_size[:2]

    def work_func(i_: int) -> np.ndarray:
        # Random shifts
        y = shift_rng.integers(0, cell_h)
        x = shift_rng.integers(0, cell_w)
        mask = resize(
            grid[i_], up_size, order=1, mode='reflect',
            anti_aliasing=False
        )[y:y + input_h, x:x + input_w]
        return mask

    threads = self.threads
    if threads is None or threads < 1:
        for i in range(num_masks):
            masks[i, ...] = work_func(i)
    else:
        for i, m in enumerate(parallel_map(
            work_func, range(num_masks),
            cores=threads,
            use_multiprocessing=False,
        )):
            masks[i, ...] = m

    return masks
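# Minimal standalone sketch of the RISE-style mask generation used above: a
# coarse binary grid is bilinearly upsampled and randomly cropped to the image
# size, yielding a smooth occlusion mask in [0, 1]. The names ``s`` and ``p1``
# are illustrative assumptions here, not class attributes of the method above.
import numpy as np
from skimage.transform import resize


def _rise_mask_sketch(input_size=(224, 224), s=8, p1=0.5, seed=0) -> np.ndarray:
    rng = np.random.default_rng(seed)
    # Coarse binary occlusion grid.
    grid = (rng.random((s, s)) < p1).astype(np.float32)
    cell_h, cell_w = np.ceil(np.array(input_size) / s).astype(int)
    up_size = ((s + 1) * cell_h, (s + 1) * cell_w)
    # Random sub-cell shift, then crop back to the image size.
    y = rng.integers(0, cell_h)
    x = rng.integers(0, cell_w)
    up = resize(grid, up_size, order=1, mode='reflect', anti_aliasing=False)
    return up[y:y + input_size[0], x:x + input_size[1]]


if __name__ == "__main__":
    m = _rise_mask_sketch()
    print(m.shape, float(m.min()), float(m.max()))  # (224, 224), values within [0, 1]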
def perturb(self, ref_image: PIL.Image.Image) -> np.ndarray:
    input_size = (ref_image.height, ref_image.width)
    num_masks = self.n
    grid = self.grid
    s = self.s
    shift_rng = np.random.default_rng(self.seed)

    cell_size = np.ceil(np.array(input_size) / s)
    up_size = (s + 1) * cell_size

    masks = np.empty((num_masks, *input_size), dtype=grid.dtype)

    def work_func(i_: int) -> np.ndarray:
        # Random shifts (row first, then column)
        y = shift_rng.integers(0, cell_size[0])
        x = shift_rng.integers(0, cell_size[1])
        mask = resize(
            grid[i_], up_size, order=1, mode='reflect',
            anti_aliasing=False
        )[y:y + input_size[0], x:x + input_size[1]]
        return mask

    threads = self.threads
    if threads is None or threads < 1:
        for i in range(num_masks):
            masks[i, ...] = work_func(i)
    else:
        for i, m in enumerate(parallel_map(
            work_func, range(num_masks),
            cores=threads,
            use_multiprocessing=False,
        )):
            masks[i, ...] = m

    return masks
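# The only behavioral difference from the ndarray variant above is how the
# (height, width) tuple is derived from a PIL image before mask generation.
# The image constructed here is a stand-in for a real input.
import numpy as np
import PIL.Image

if __name__ == "__main__":
    img = PIL.Image.fromarray(np.zeros((240, 320, 3), dtype=np.uint8))
    input_size = (img.height, img.width)
    print(input_size)  # (240, 320)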
def _generate_arrays(
        self, data_iter: Iterable[DataElement]) -> Iterable[numpy.ndarray]:
    """
    Inner template method that defines the generation of descriptor vectors
    for a given iterable of data elements.

    Pre-conditions:
      - Data elements input to this method have been validated to be of at
        least one of this class's reported ``valid_content_types``.

    :param collections.abc.Iterable[DataElement] data_iter:
        Iterable of data element instances to be described.

    :raises RuntimeError: Descriptor extraction failure of some kind.

    :return: Iterable of numpy arrays in parallel association with the
        input data elements.
    :rtype: collections.abc.Iterable[numpy.ndarray]
    """
    assert self.network is not None, (
        "A network should be initialized by now."
    )

    self._set_caffe_mode()
    log_debug = LOG.debug

    # Start parallel operation to pre-process imagery before aggregating
    # for network execution.
    # TODO: update ``buffer_factor`` param to account for batch size?
    img_array_iter = \
        parallel_map(_process_load_img_array,
                     zip(
                         data_iter, itertools.repeat(self.transformer),
                         itertools.repeat(self.data_layer),
                         itertools.repeat(self.load_truncated_images),
                         itertools.repeat(self.pixel_rescale),
                     ),
                     ordered=True, cores=self.threads)

    # Aggregate and process batches of input data elements
    #: :type: list[numpy.ndarray]
    batch_img_arrays = \
        list(itertools.islice(img_array_iter, self.batch_size))
    batch_i = 0
    while len(batch_img_arrays) > 0:
        cur_batch_size = len(batch_img_arrays)
        log_debug("Batch {} - size {}".format(batch_i, cur_batch_size))

        log_debug("Updating network data layer shape ({} images)".format(
            cur_batch_size))
        self.network.blobs[self.data_layer].reshape(
            cur_batch_size, *self.net_data_shape[1:4])
        log_debug("Loading image matrices into network layer '{:s}'".format(
            self.data_layer))
        self.network.blobs[self.data_layer].data[...] = batch_img_arrays
        log_debug("Moving network forward")
        self.network.forward()
        descriptor_list = self.network.blobs[self.return_layer].data
        log_debug("Extracting return layer '{:s}' into vectors".format(
            self.return_layer))
        for v in descriptor_list:
            if v.ndim > 1:
                # In case caffe generates a multidimensional array
                # like (rows, 1, 1).
                log_debug("- Raveling output array of shape {}".format(
                    v.shape))
                yield numpy.ravel(v)
            else:
                yield v

        # Slice out the next batch
        #: :type: list[(collections.abc.Hashable, numpy.ndarray)]
        batch_img_arrays = \
            list(itertools.islice(img_array_iter, self.batch_size))
        batch_i += 1
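# Self-contained sketch of the batching pattern used above: repeatedly slice
# ``batch_size`` items off a (possibly lazy) iterator with
# ``itertools.islice`` until it is exhausted. The Caffe-specific network calls
# are intentionally omitted so the example stays runnable; ``batched`` is an
# illustrative helper, not part of the class above.
import itertools
from typing import Iterable, Iterator, List


def batched(it: Iterable[int], batch_size: int) -> Iterator[List[int]]:
    it = iter(it)
    batch = list(itertools.islice(it, batch_size))
    while batch:
        yield batch
        batch = list(itertools.islice(it, batch_size))


if __name__ == "__main__":
    for i, b in enumerate(batched(range(10), 4)):
        print(f"batch {i} - size {len(b)}: {b}")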
def fit(self, descriptors: Iterable[DescriptorElement], use_multiprocessing: bool = True) -> np.ndarray:
    """
    Fit the ITQ model given the input set of descriptors.

    :param descriptors: Iterable of ``DescriptorElement`` vectors to fit
        the model to.
    :type descriptors:
        collections.abc.Iterable[smqtk.representation.DescriptorElement]

    :param use_multiprocessing: If multiprocessing should be used, as
        opposed to threading, when collecting descriptor elements from the
        given iterable.
    :type use_multiprocessing: bool

    :raises RuntimeError: There is already a model loaded.

    :return: Matrix of hash codes (boolean-valued) for the provided
        descriptors, in parallel order to the input descriptors.
    :rtype: numpy.ndarray[bool]
    """
    if self.has_model():
        raise RuntimeError("Model components have already been loaded.")

    dbg_report_interval = 1.0
    dbg_report = LOG.getEffectiveLevel() <= logging.DEBUG
    if not isinstance(descriptors, Sequence):
        LOG.info("Creating sequence from iterable")
        descriptors_l = []
        pr = ProgressReporter(LOG.debug, dbg_report_interval).start()
        for d in descriptors:
            descriptors_l.append(d)
            dbg_report and pr.increment_report()  # type: ignore
        dbg_report and pr.report()  # type: ignore
        descriptors = descriptors_l
    if len(descriptors[0].vector()) < self.bit_length:
        raise ValueError("Input descriptors have fewer features than "
                         "requested bit encoding. Hash codes will be "
                         "smaller than requested due to the PCA "
                         "decomposition result being bound by the number "
                         "of features.")

    LOG.info("Creating matrix of descriptors for fitting")
    x = np.asarray(list(
        parallel_map(lambda d_: d_.vector(), descriptors,
                     use_multiprocessing=use_multiprocessing)
    ))
    LOG.debug("descriptor matrix shape: %s", x.shape)

    LOG.debug(f"Normalizing descriptors by factor: {self.normalize}")
    x = self._norm_vector(x)

    LOG.info("Centering data")
    self.mean_vec = np.mean(x, axis=0)
    x -= self.mean_vec

    LOG.info("Computing PCA transformation")
    LOG.debug("-- computing covariance")
    # ``cov`` wants each row to be a feature and each column an observation
    # of those features, so each column should be a descriptor vector,
    # hence the transpose here.
    c = np.cov(x.transpose())

    # Direct translation from UNC matlab code
    # - eigen vectors are the columns of ``pc``
    LOG.debug('-- computing linalg.eig')
    l, pc = np.linalg.eig(c)
    LOG.debug('-- ordering eigen vectors by descending eigen value')

    # # Harry translation of original matlab code
    # # - Uses singular values / vectors, not eigen
    # # - singular vectors are the columns of pc
    # LOG.debug('-- computing linalg.svd')
    # pc, l, _ = np.linalg.svd(c)
    # LOG.debug('-- ordering singular vectors by descending '
    #           'singular value')

    # Same ordering method for both eig/svd sources.
    l_pc_ordered = sorted(zip(l, pc.transpose()), key=lambda _p: _p[0],
                          reverse=True)

    LOG.debug("-- top vector extraction")
    # Only keep the top ``bit_length`` vectors after ordering by descending
    # value magnitude.
    # - Transposing vectors back to column-vectors.
    pc_top = np.array([p[1] for p in l_pc_ordered[:self.bit_length]])\
        .transpose()
    LOG.debug("-- project centered data by PC matrix")
    v = np.dot(x, pc_top)

    LOG.info("Performing ITQ to find optimal rotation")
    c, self.rotation = self._find_itq_rotation(v, self.itq_iterations)
    # De-adjust rotation with PC vector
    self.rotation = np.dot(pc_top, self.rotation)

    self.save_model()

    return c
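# Standalone sketch of the PCA step performed in ``fit`` above: center the
# data, eigendecompose the covariance, keep the top ``bit_length``
# eigenvectors, and project. The ITQ rotation refinement itself
# (``_find_itq_rotation``) is not reproduced here; ``_pca_project_sketch`` is
# a hypothetical helper for illustration only.
import numpy as np


def _pca_project_sketch(x: np.ndarray, bit_length: int) -> np.ndarray:
    x = x - x.mean(axis=0)
    c = np.cov(x.T)                   # rows = features, columns = observations
    l, pc = np.linalg.eig(c)          # eigenvectors are the columns of ``pc``
    # Covariance is symmetric; drop any numerical imaginary parts.
    l, pc = np.real(l), np.real(pc)
    order = np.argsort(l)[::-1][:bit_length]
    pc_top = pc[:, order]             # top-k eigenvectors as columns
    return x.dot(pc_top)


if __name__ == "__main__":
    rng = np.random.RandomState(0)
    v = _pca_project_sketch(rng.rand(100, 64), bit_length=16)
    print(v.shape)  # (100, 16)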
def _classify_arrays(self, array_iter):
    if not self.has_model():
        raise RuntimeError("No SVM model present for classification")
    assert self.svm_model is not None, (
        "Should have an SVM model at this point."
    )

    # Dump descriptors into a matrix for normalization and use in
    # prediction.
    vec_mat = numpy.array(list(array_iter))
    vec_mat = self._norm_vector(vec_mat)
    n_jobs = self.n_jobs
    if n_jobs is not None:
        n_jobs = min(len(vec_mat), n_jobs)
    # Else: `n_jobs` is `None`, which is OK as it's the default value for
    # parallel_map.

    svm_label_map = self.svm_label_map
    c_base = dict((la, 0.) for la in svm_label_map.values())

    # Effectively reproducing the body of svmutil.svm_predict in order to
    # simplify and get around excessive prints
    svm_type = self.svm_model.get_svm_type()
    nr_class = self.svm_model.get_nr_class()
    # Model internal labels. Parallel to ``prob_estimates`` array.
    svm_model_labels = self.svm_model.get_labels()

    # TODO: Normalize input arrays in batch(es). TEST if current norm
    #       function can just take a matrix?

    if self.svm_model.is_probability_model():
        # noinspection PyUnresolvedReferences
        if svm_type in [svm.NU_SVR, svm.EPSILON_SVR]:
            nr_class = 0

        def single_pred(v):
            prob_estimates = (ctypes.c_double * nr_class)()
            v, idx = svm.gen_svm_nodearray(v.tolist())
            svm.libsvm.svm_predict_probability(self.svm_model, v,
                                               prob_estimates)
            c = dict(c_base)  # Shallow copy
            c.update({
                svm_label_map[label]: prob
                for label, prob in zip(svm_model_labels,
                                       prob_estimates[:nr_class])
            })
            return c

        # If n_jobs == 1, just be serial
        if n_jobs == 1:
            return (single_pred(v) for v in vec_mat)
        else:
            return parallel_map(single_pred, vec_mat,
                                cores=n_jobs,
                                use_multiprocessing=True)

    else:
        # noinspection PyUnresolvedReferences
        if svm_type in (svm.ONE_CLASS, svm.EPSILON_SVR, svm.NU_SVC):
            nr_classifier = 1
        else:
            nr_classifier = nr_class * (nr_class - 1) // 2

        def single_label(v):
            dec_values = (ctypes.c_double * nr_classifier)()
            v, idx = svm.gen_svm_nodearray(v.tolist())
            label = svm.libsvm.svm_predict_values(self.svm_model, v,
                                                  dec_values)
            c = dict(c_base)  # Shallow copy
            c[svm_label_map[label]] = 1.
            return c

        # If n_jobs == 1, just be serial
        if n_jobs == 1:
            return (single_label(v) for v in vec_mat)
        else:
            return parallel_map(single_label, vec_mat,
                                cores=n_jobs,
                                use_multiprocessing=True)
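# Minimal sketch of the result format the method above produces: one mapping
# of class label -> confidence per input vector, built from a base dict of
# zeros so every known label appears in every result. ``score_fn`` is a
# stand-in for the libsvm probability/decision calls and is an assumption of
# this sketch, not part of the class.
from typing import Callable, Dict, Hashable, List, Sequence, Tuple

import numpy as np


def label_scores_sketch(
    vecs: np.ndarray,
    label_map: Dict[int, Hashable],
    score_fn: Callable[[np.ndarray], Sequence[Tuple[int, float]]],
) -> List[Dict[Hashable, float]]:
    c_base = {lbl: 0.0 for lbl in label_map.values()}
    results = []
    for v in vecs:
        c = dict(c_base)  # shallow copy per input vector
        for model_label, score in score_fn(v):
            c[label_map[model_label]] = float(score)
        results.append(c)
    return results


if __name__ == "__main__":
    def fake_score(_v: np.ndarray) -> Sequence[Tuple[int, float]]:
        return [(0, 0.25), (1, 0.75)]

    print(label_scores_sketch(np.zeros((2, 4)), {0: "cat", 1: "dog"}, fake_score))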
def _build_index(self, descriptors: Iterable[DescriptorElement]) -> None:
    """
    Internal method to be implemented by sub-classes to build the index
    with the given descriptor data elements.

    Subsequent calls to this method should rebuild the current index. This
    method shall not add to the existing index nor raise an exception so
    as to protect the current index.

    Implementation Notes:
        - We keep a cache file serialization around for our index in case
          sub-processing occurs so as to be able to recover from the
          underlying C data not being there. This could cause issues if a
          main or child process rebuilds the index, as we clear the old
          cache away.

    :param descriptors: Iterable of descriptor elements to build the index
        over.
    :type descriptors:
        collections.abc.Iterable[smqtk.representation.DescriptorElement]
    """
    with self._model_lock:
        # Not caring about restoring the index because we're just making a
        # new one.
        LOG.info("Building new FLANN index")

        LOG.debug("Caching descriptor elements")
        self._descr_cache = list(descriptors)
        # Cache descriptors if we have an element
        if self._descr_cache_elem and self._descr_cache_elem.writable():
            LOG.debug(f"Caching descriptors: {self._descr_cache_elem}")
            self._descr_cache_elem.set_bytes(
                pickle.dumps(self._descr_cache, -1))

        params = {
            "target_precision": self._build_target_precision,
            "sample_fraction": self._build_sample_frac,
            "log_level": ("info"
                          if LOG.getEffectiveLevel() <= logging.DEBUG
                          else "warning")
        }
        if self._build_autotune:
            params['algorithm'] = "autotuned"
        if self._rand_seed is not None:
            params['random_seed'] = self._rand_seed
        pyflann.set_distance_type(self._distance_method)

        LOG.debug("Accumulating descriptor vectors into matrix for FLANN")
        pts_array = numpy.asarray(list(
            parallel_map(lambda d_: d_.vector(), self._descr_cache)))

        LOG.debug('Building FLANN index')
        self._flann = pyflann.FLANN()
        self._flann_build_params = self._flann.build_index(
            pts_array, **params)
        del pts_array

        if self._index_elem and self._index_elem.writable():
            LOG.debug("Caching index: %s", self._index_elem)
            # FLANN wants to write to a file, so make a temp file, then
            # read it in, putting the bytes into the element.
            fd, fp = tempfile.mkstemp()
            try:
                self._flann.save_index(fp)
                # Use the file descriptor to create the file object. This
                # avoids reopening the file and automatically closes the
                # file descriptor on exiting the with block.
                with os.fdopen(fd, 'rb') as f:
                    self._index_elem.set_bytes(f.read())
            finally:
                os.remove(fp)
        if self._index_param_elem and self._index_param_elem.writable():
            LOG.debug(f"Caching index params: {self._index_param_elem}")
            state = {
                'b_autotune': self._build_autotune,
                'b_target_precision': self._build_target_precision,
                'b_sample_frac': self._build_sample_frac,
                'distance_method': self._distance_method,
                'flann_build_params': self._flann_build_params,
            }
            self._index_param_elem.set_bytes(pickle.dumps(state, -1))

        self._pid = multiprocessing.current_process().pid
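# Self-contained sketch of the temp-file round trip used above to capture a
# library's "save to file" output as in-memory bytes. ``save_fn`` stands in
# for ``pyflann.FLANN.save_index``; this helper is illustrative, not part of
# the class above.
import os
import tempfile
from typing import Callable


def file_output_to_bytes(save_fn: Callable[[str], None]) -> bytes:
    fd, fp = tempfile.mkstemp()
    try:
        save_fn(fp)
        # Wrap the existing descriptor; it is closed when the block exits.
        with os.fdopen(fd, 'rb') as f:
            return f.read()
    finally:
        os.remove(fp)


if __name__ == "__main__":
    def _demo_save(path: str) -> None:
        with open(path, 'wb') as f:
            f.write(b"index-bytes")

    print(file_output_to_bytes(_demo_save))  # b'index-bytes'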
def occlude_image_streaming(
    ref_image: np.ndarray,
    masks: Iterable[np.ndarray],
    fill: Optional[Union[int, Sequence[int], np.ndarray]] = None,
    threads: Optional[int] = None,
) -> Generator[np.ndarray, None, None]:
    """
    Apply a number of input occlusion masks to the given reference image,
    producing images equivalent in number, and parallel in order, to the
    input masks.

    We expect the "mask" matrices and the image to be the same height and
    width, and for the mask matrix values to be in the [0, 1] range. In the
    mask matrix, values closer to 1 correspond to regions of the image that
    should *NOT* be occluded. E.g. a 0 in the mask will translate to
    *fully* occluding the corresponding location in the source image.

    We optionally take in a "fill" to alpha-blend into the masked regions
    of the input `ref_image`. `fill` may be either a scalar, a sequence of
    scalars, or another image matrix congruent in shape to the `ref_image`.
    When `fill` is a scalar or a sequence of scalars, the scalars should be
    in the same data-type and value range as the input image. A sequence of
    scalars should be the same length as there are channels in the
    `ref_image`. When `fill` is an image matrix it should follow the format
    of `[H x W]` or `[H x W x C]`, should be in the same dtype and value
    range as `ref_image`, and should match the same number of channels if
    channels are provided.

    When no fill is passed, black is used (default absence of color).

    Output images will mirror the input image format. As such, the `fill`
    value passed must be compatible with the input image channels for
    broadcasting. For example, a single-channel input cannot be broadcast
    against a multi-channel `fill` input. A ValueError will be raised by
    the underlying numpy call in such cases.

    Assumptions:
      * Mask input is per-pixel. Does not accept per-channel masks.
      * Fill value input is in an applicable value range supported by the
        input image format, which is mirrored in output images.

    :param ref_image: Original base image.
    :param masks: Mask images in the [N, Height, Width] shape format.
    :param fill: Optional fill for alpha-blending based on the input masks
        for the occluded regions, as a scalar value, a per-channel
        sequence, or a shape-matched image.
    :param threads: Optional number of threads to use for parallelism when
        set to a positive integer. If 0, a negative value, or `None`, work
        will be performed on the main-thread in-line.

    :raises ValueError: One or more input masks in the input iterable did
        not match the shape of the input reference image.

    :return: A generator of numpy array masked images.
    """
    # Just the [H x W] component.
    img_shape = ref_image.shape[:2]
    s: Tuple = (..., )
    if ref_image.ndim > 2:
        s = (..., None)  # add channel axis for multiplication

    def work_func(i_: int, m: np.ndarray) -> np.ndarray:
        m_shape = m.shape
        if m_shape != img_shape:
            raise ValueError(
                f"Input mask (position {i_}) did not match the shape of "
                f"the input image: {m_shape} != {img_shape}"
            )
        img_m = np.empty_like(ref_image)
        if fill is not None:
            np.add((m[s] * ref_image), ((UINT8_ONE - m[s]) * fill),
                   out=img_m, casting="unsafe")
        else:
            np.multiply(m[s], ref_image, out=img_m, casting="unsafe")
        return img_m

    if threads is None or threads < 1:
        for i, mask in enumerate(masks):
            yield work_func(i, mask)
    else:
        for img in parallel_map(
            work_func, itertools.count(), masks,
            cores=threads,
            use_multiprocessing=False,
        ):
            yield img
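# Example usage of ``occlude_image_streaming`` defined above, with random
# float masks in [0, 1] and a mid-gray fill. Shapes and values are
# illustrative; this assumes it runs in the same module where the function
# and its module-level constants (e.g. ``UINT8_ONE``) are defined.
import numpy as np

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    ref_image = rng.integers(0, 256, size=(64, 64, 3), dtype=np.uint8)
    masks = rng.random((5, 64, 64), dtype=np.float64)
    occluded = list(occlude_image_streaming(ref_image, masks, fill=127))
    print(len(occluded), occluded[0].shape, occluded[0].dtype)  # 5 (64, 64, 3) uint8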
def _nn(
    self,
    d: DescriptorElement,
    n: int = 1
) -> Tuple[Tuple[DescriptorElement, ...], Tuple[float, ...]]:
    """
    Internal method to be implemented by sub-classes to return the nearest
    `N` neighbors to the given descriptor element.

    When this internal method is called, we have already checked that
    there is a vector in ``d`` and our index is not empty.

    :param d: Descriptor element to compute the neighbors of.
    :param n: Number of nearest neighbors to find.

    :return: Tuple of nearest N DescriptorElement instances, and a tuple
        of the distance values to those neighbors.
    """
    LOG.debug("generating hash for descriptor")
    d_v = d.vector()
    d_h = self.lsh_functor.get_hash(d_v)

    def comp_descr_dist(d2_v: numpy.ndarray) -> float:
        return self._distance_function(d_v, d2_v)

    with self._model_lock:
        LOG.debug("getting near hashes")
        hi = self.hash_index
        if hi is None:
            # Make an on-the-fly linear index
            hi = LinearHashIndex()
            # not calling ``build_index`` because we already have the int
            # hashes.
            hi.index = set(cast(Iterator[int],
                                self.hash2uuids_kvstore.keys()))
        near_hashes, _ = hi.nn(d_h, n)

        LOG.debug("getting UUIDs of descriptors for nearby hashes")
        neighbor_uuids: List[Hashable] = []
        for h_int in map(bit_vector_to_int_large, near_hashes):
            # If descriptor hash not in our map, we effectively skip it.
            # Get set of descriptor UUIDs for a hash code.
            near_uuids: Set[Hashable] = self.hash2uuids_kvstore.get(h_int,
                                                                    set())
            # Accumulate matching descriptor UUIDs to a list.
            neighbor_uuids.extend(near_uuids)
        LOG.debug("-- matched %d UUIDs", len(neighbor_uuids))

        LOG.debug("getting descriptors for neighbor_uuids")
        neighbors = \
            list(self.descriptor_set.get_many_descriptors(neighbor_uuids))

    # Done with model parts at this point, so releasing the lock.

    LOG.debug(f"ordering descriptors via distance method "
              f"{self.distance_method}")
    LOG.debug('-- getting element vectors')
    neighbor_vectors = numpy.asarray(list(
        parallel_map(lambda d_: d_.vector(), neighbors)
    ))
    LOG.debug('-- calculating distances')
    distances = list(map(comp_descr_dist, neighbor_vectors))
    LOG.debug('-- ordering')
    ordered = sorted(zip(neighbors, distances), key=lambda p: p[1])
    LOG.debug(f'-- slicing top n={n}')
    r_descrs: Tuple[DescriptorElement, ...]
    r_dists: Tuple[float, ...]
    r_descrs, r_dists = zip(*(ordered[:n]))
    return r_descrs, r_dists
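# Standalone sketch of the final ranking step in ``_nn`` above: pair
# candidates with their distances, sort ascending, and keep the top ``n``.
# Euclidean distance stands in for the configured ``_distance_function``;
# ``rank_top_n`` is a hypothetical helper for illustration only.
from typing import Sequence, Tuple

import numpy as np


def rank_top_n(
    query: np.ndarray, candidates: np.ndarray, n: int = 1
) -> Tuple[Sequence[int], Sequence[float]]:
    dists = [float(np.linalg.norm(query - c)) for c in candidates]
    ordered = sorted(zip(range(len(candidates)), dists), key=lambda p: p[1])
    idxs, top_dists = zip(*ordered[:n])
    return idxs, top_dists


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    q = rng.random(8)
    cands = rng.random((10, 8))
    print(rank_top_n(q, cands, n=3))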
def _build_multiple_trees(self, chunk_size: int = CHUNK_SIZE) -> None:
    """
    Build an MRPT structure.
    """
    sample = next(self._descriptor_set.iterdescriptors())
    sample_v = sample.vector()
    if sample_v is None:
        raise RuntimeError(
            "Sample descriptor element from the current set had no vector "
            "content!"
        )
    n = self.count()
    d = sample_v.size
    leaf_size = n / (1 << self._depth)
    nt = self._num_trees
    depth = self._depth
    LOG.debug(f"Building {nt} trees (T) of depth {depth} (l) "
              f"from {n:g} descriptors (N) of length {d:g}")
    LOG.debug(f"Leaf size (L = N/2^l) ~ {n:g}/2^{depth:d} = {leaf_size:g}")
    LOG.debug(f"UUIDs stored (T*N) = {nt:g} * {n:g} = {nt * n:g}")
    LOG.debug(f"Examined UUIDs (T*L) ~ {nt:g} * {leaf_size:g} = "
              f"{nt * leaf_size:g}")
    LOG.debug(f"Examined/DB size (T*L/N = T/2^l) ~ "
              f"{nt * leaf_size}/{n} = {nt * leaf_size / n:.3f}")

    if (1 << self._depth) > n:
        LOG.warning(
            f"There are insufficient elements ({n:d} < 2^{depth:d}) to "
            f"populate all the leaves of the tree. Consider lowering the "
            f"depth parameter.")

    LOG.debug("Projecting onto random bases")
    # Build all the random bases and the projections at the same time
    # (_num_trees * _depth shouldn't really be that high -- if it is,
    # you're a monster)
    rs = np.random.RandomState()
    if self._rand_seed is not None:
        rs.seed(self._rand_seed)
    random_bases = rs.randn(self._num_trees, d, self._depth)
    projs = np.empty((n, self._num_trees, self._depth), dtype=np.float64)
    # Load the data in chunks (because n * d IS high)
    pts_array = np.empty((chunk_size, d), sample_v.dtype)
    # Enumerate the descriptors and div the index by the chunk size
    # (causes each loop to only deal with at most chunk_size descriptors
    # at a time).
    for k, g in groupby(enumerate(self._descriptor_set.iterdescriptors()),
                        lambda pair: pair[0] // chunk_size):
        # Items are still paired, so extract the descriptors
        chunk = list(desc for (i, desc) in g)
        # Take care of the dangling end piece
        k_beg = k * chunk_size
        k_end = min((k + 1) * chunk_size, n)
        k_len = k_end - k_beg
        # Run the descriptors through elements_to_matrix
        # - Using slicing on pts_array due to g being <= chunk-size on the
        #   last chunk.
        pts_array[:len(chunk)] = list(
            parallel_map(lambda d_: d_.vector(), chunk,
                         use_multiprocessing=self._use_multiprocessing))
        # Insert into projection matrix
        projs[k_beg:k_end] = pts_array[:k_len].dot(random_bases)
    del pts_array

    LOG.debug("Constructing trees")
    desc_ids = list(self._descriptor_set.keys())
    # Start with no trees
    self._trees = []
    for t in range(self._num_trees):
        # Array of splits is a packed tree
        splits = np.empty(((1 << self._depth) - 1, ), np.float64)

        LOG.debug(f"Constructing tree #{t + 1}")

        # Build the tree & store it
        leaves = self._build_single_tree(projs[:, t], splits)
        leaves_ids = [[desc_ids[idx] for idx in cast(Iterable[int], leaf)]
                      for leaf in leaves]
        self._trees.append(TreeElement(**{
            'random_basis': (random_bases[t]),
            'splits': splits,
            'leaves': leaves_ids,
        }))
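# Self-contained sketch of the chunked random-projection step above: stream
# descriptor vectors in fixed-size chunks and accumulate their projections
# onto per-tree random bases, avoiding one giant (N x d) matrix multiply in
# memory all at once. ``chunked_projections`` is an illustrative helper under
# assumed shapes, not part of the class above.
import numpy as np


def chunked_projections(
    vectors,                 # iterable of 1-D arrays, each of length ``d``
    n: int, d: int, num_trees: int, depth: int,
    chunk_size: int = 256, seed: int = 0,
) -> np.ndarray:
    rs = np.random.RandomState(seed)
    random_bases = rs.randn(num_trees, d, depth)
    projs = np.empty((n, num_trees, depth), dtype=np.float64)
    buf = np.empty((chunk_size, d), dtype=np.float64)
    i = 0
    for v in vectors:
        buf[i % chunk_size] = v
        i += 1
        if i % chunk_size == 0:
            # (chunk_size, d) . (num_trees, d, depth) -> (chunk_size, num_trees, depth)
            projs[i - chunk_size:i] = buf.dot(random_bases)
    rem = i % chunk_size
    if rem:
        # Handle the dangling end piece.
        projs[i - rem:i] = buf[:rem].dot(random_bases)
    return projs


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    data = rng.random((1000, 32))
    p = chunked_projections(iter(data), n=1000, d=32, num_trees=4, depth=5)
    print(p.shape)  # (1000, 4, 5)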