def add_descriptor(self, descriptor: DescriptorElement) -> None:
    """
    Add a descriptor to this set.

    Adding the same descriptor multiple times should not add multiple
    copies of the descriptor in the set (based on UUID). Added descriptors
    overwrite set descriptors based on UUID.

    :param descriptor: Descriptor to set.
    """
    if self.read_only:
        raise ReadOnlyError("Cannot add to a read-only set.")

    q = self.UPSERT_TMPL.format(
        table_name=self.table_name,
        uuid_col=self.uuid_col,
        element_col=self.element_col,
    )
    v = {
        'uuid_val': str(descriptor.uuid()),
        'element_val': psycopg2.Binary(
            pickle.dumps(descriptor, self.pickle_protocol))
    }

    def exec_hook(cur: psycopg2.extensions.cursor) -> None:
        cur.execute(q, v)

    list(self.psql_helper.single_execute(exec_hook))
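# --- Illustrative sketch (not part of the implementation above) -------------
# ``UPSERT_TMPL`` itself is not shown in this excerpt.  Assuming a
# PostgreSQL 9.5+ backend, an equivalent upsert template could look like the
# string below: the ``format`` placeholders match the ``add_descriptor`` call
# and ``uuid_val`` / ``element_val`` are bound by psycopg2 at execution time.
# This is a hypothetical stand-in, not the library's actual template.
HYPOTHETICAL_UPSERT_TMPL = """
INSERT INTO {table_name} ({uuid_col}, {element_col})
VALUES (%(uuid_val)s, %(element_val)s)
ON CONFLICT ({uuid_col})
    DO UPDATE SET {element_col} = EXCLUDED.{element_col}
"""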
def _nn(self, d: DescriptorElement, n: int = 1
        ) -> Tuple[Tuple[DescriptorElement, ...], Tuple[float, ...]]:
    """
    Internal method to be implemented by sub-classes to return the nearest
    `N` neighbors to the given descriptor element.

    When this internal method is called, we have already checked that
    there is a vector in ``d`` and our index is not empty.

    :param d: Descriptor element to compute the neighbors of.
    :param n: Number of nearest neighbors to find.

    :return: Tuple of nearest N DescriptorElement instances, and a tuple
        of the distance values to those neighbors.
    """
    with self._model_lock:
        self._restore_index()
        assert self._flann is not None, (
            "We should have an index after restoration."
        )
        vec = d.vector()

        # If the distance method is HIK, we need to treat it specially
        # since that method produces a similarity score, not a distance
        # score.
        #
        # FLANN asserts that we query for <= index size, thus the use of
        # min().
        idxs: numpy.ndarray
        dists: numpy.ndarray
        if self._distance_method == 'hik':
            # This call differs from the else branch in that k is the
            # size of the full data set, so that we can reverse the
            # distances.
            idxs, dists = self._flann.nn_index(
                vec, len(self._descr_cache), **self._flann_build_params)
        else:
            idxs, dists = self._flann.nn_index(
                vec, min(n, len(self._descr_cache)),
                **self._flann_build_params)

        # When N>1, the return value is a 2D array. Since this method
        # limits the query to a single descriptor, we reduce to 1D arrays.
        if len(idxs.shape) > 1:
            idxs = idxs[0]
            dists = dists[0]
        if self._distance_method == 'hik':
            # Invert values to stay consistent with other distance value
            # norms. This also means that we reverse the "nearest" order
            # and reintroduce the `n` size limit.
            # - This is intentionally happening *after* the "squeeze" op
            #   above.
            dists = (1.0 - dists)[::-1][:n]
            idxs = idxs[::-1][:n]

        return tuple(self._descr_cache[i] for i in idxs), tuple(dists)
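# --- Illustrative sketch (not part of the implementation above) -------------
# Self-contained toy example of the HIK post-processing in ``_nn``: FLANN
# orders results by its score ascending, so with histogram intersection
# (a similarity) the best matches come last.  Converting with ``1 - s`` and
# reversing yields ascending distances, after which the ``n`` limit is
# re-applied.  The values below are made up for demonstration.
import numpy

toy_sims = numpy.array([0.1, 0.4, 0.7, 0.9])   # ascending FLANN scores
toy_idxs = numpy.array([3, 2, 0, 1])           # matching cache indices
toy_n = 2
toy_dists = (1.0 - toy_sims)[::-1][:toy_n]     # -> [0.1, 0.3], nearest first
toy_near_idxs = toy_idxs[::-1][:toy_n]         # -> [1, 0]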
def iter_tocompute_arrays():
    """
    Yield descriptor vectors for classification elements that still need
    computing.

    :rtype: typing.Generator[numpy.ndarray]
    """
    # Force into an iterator.
    descr_iterator = iter(descr_iter)
    # Running variable for the index of the final data element in the
    # input iterator. This will be -1 or the value of the final index in
    # the parallel lists.
    last_i = -1
    # Make successive islices into the iterator of descriptor elements to
    # produce batches. We end when the iterator returns nothing more.
    de_batch_list = \
        list(itertools.islice(descr_iterator, d_elem_batch))
    # Fully qualified path to this classifier implementation type. This
    # should be unique among concrete classifier implementations.
    self_name = f"{self.__module__}.{self.__class__.__name__}"
    while de_batch_list:
        # Get vectors from batch using implementation-level batch
        # aggregation methods where applicable.
        de_batch_vecs = \
            DescriptorElement.get_many_vectors(de_batch_list)

        for d_elem, d_vec in zip(de_batch_list, de_batch_vecs):
            d_uid = d_elem.uuid()
            if d_vec is None:
                raise ValueError(
                    "Encountered DescriptorElement with "
                    "no vector stored! (UID=`{}`)".format(d_uid))
            c_elem_ = factory.new_classification(self_name, d_uid)
            already_computed = \
                not overwrite and c_elem_.has_classifications()
            elem_and_status_q.append((c_elem_, already_computed))
            if not already_computed:
                # Classifications should be computed for this descriptor.
                log_debug(
                    "Yielding descriptor array with UID `{}` "
                    "for classification generation.".format(d_uid))
                yield d_vec
            else:
                log_debug("Classification already generated for UID "
                          "`{}`.".format(d_uid))

        last_i += len(de_batch_vecs)

        # Slice out the next batch of descriptor elements. This will be
        # empty if the iterator has been exhausted.
        de_batch_list = list(
            itertools.islice(descr_iterator, d_elem_batch))

    end_of_iter[0] = last_i
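# --- Illustrative sketch (not part of the implementation above) -------------
# Minimal example of the ``itertools.islice`` batching pattern used in
# ``iter_tocompute_arrays``: repeated islices pull fixed-size chunks off a
# single iterator until it is exhausted.  Toy data with a batch size of 3.
import itertools

_toy_iter = iter(range(8))
_toy_batch = list(itertools.islice(_toy_iter, 3))
while _toy_batch:
    print(_toy_batch)            # [0, 1, 2], then [3, 4, 5], then [6, 7]
    _toy_batch = list(itertools.islice(_toy_iter, 3))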
def _doc_for_code_descr(self, d: DescriptorElement) -> Dict[str, Any]:
    """
    Generate standard identifying document base for the given descriptor
    element.
    """
    uuid = d.uuid()
    return {
        'id': '-'.join([self.set_uuid, str(uuid)]),
        self.set_uuid_field: self.set_uuid,
        self.d_uid_field: uuid,
    }
def _inner_add_descriptor(
    self,
    descriptor: DescriptorElement,
    no_cache: bool = False
) -> None:
    """
    Internal adder with the additional option to trigger caching or not.

    :param descriptor: Descriptor to index.
    :param no_cache: Do not cache the internal table if a file cache was
        provided. This would be used if adding many descriptors at a time,
        preventing a file write for every individual descriptor added.
    """
    self._table[descriptor.uuid()] = descriptor
    if not no_cache:
        self.cache_table()
def get_many_vectors(
    self, uuids: Iterable[Hashable]
) -> List[Optional[np.ndarray]]:
    """
    Get underlying vectors of descriptors associated with given uuids.

    :param uuids: Iterable of descriptor UUIDs to query for.

    :raises KeyError: When there is not a descriptor in this set for one
        or more input UIDs.

    :return: List of vectors for descriptors associated with given uuid
        values.
    """
    return DescriptorElement.get_many_vectors(
        self.get_many_descriptors(uuids)
    )
def _train(self, class_examples, **extra_params):
    # Convert descriptor elements into a combined ndarray with an
    # associated label vector.
    vec_list = []
    label_list = []
    for label, examples in class_examples.items():
        label_vectors = \
            DescriptorElement.get_many_vectors(examples)
        # ``is``-based or ``list.count`` checks misbehave when elements
        # are numpy arrays, so count Nones explicitly.
        none_count = len([e for e in label_vectors if e is None])
        assert none_count == 0, \
            "Some descriptor elements for label {} did not contain " \
            "vectors! (n={})".format(label, none_count)
        vec_list.extend(label_vectors)
        label_list.extend([label] * len(label_vectors))
    vec_list = np.vstack(vec_list)
    self.fit(vec_list, label_list)
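# --- Illustrative sketch (not part of the implementation above) -------------
# Toy example of how the per-class vectors are flattened into one training
# matrix with a parallel label list in ``_train`` (made-up class names and
# 3-dimensional vectors).
import numpy as np

_class_vecs = {"cat": [np.ones(3), np.ones(3) * 2], "dog": [np.zeros(3)]}
_vec_list, _label_list = [], []
for _label, _vecs in _class_vecs.items():
    _vec_list.extend(_vecs)
    _label_list.extend([_label] * len(_vecs))
_X = np.vstack(_vec_list)        # shape (3, 3)
# _label_list == ["cat", "cat", "dog"]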
def build_index(self, descriptors: Iterable[DescriptorElement]) -> None:
    # Cache given descriptor element vectors into a matrix for use during
    # ``rank``.
    descr_elem_list = list(descriptors)
    if len(descr_elem_list) == 0:
        raise ValueError("No descriptor elements passed.")
    # NOTE: This fails if multiple descriptor elements with the same UID
    #       are included. There will be Nones present.
    descr_matrix = np.asarray(
        DescriptorElement.get_many_vectors(descr_elem_list))
    # If the result matrix is of dtype(object), then either some elements
    # did not have vectors or some vectors were not of congruent
    # dimensionality.
    if descr_matrix.dtype == np.dtype(object):
        raise ValueError("One or more descriptor elements did not have a "
                         "vector set or were not of congruent "
                         "dimensionality.")
    self._descr_elem_list = descr_elem_list
    self._descr_matrix = descr_matrix
def _descriptors_to_matrix(
    self, descriptors: List[DescriptorElement]
) -> Tuple[np.ndarray, Sequence[Hashable]]:
    """
    Extract an (n, d) array with the descriptor vectors in each row, and a
    corresponding list of uuids from the list of descriptors.

    :param descriptors: List of descriptor elements to add to this index.

    :return: An (n, d) array of descriptors (d-dim descriptors in n rows),
        and the corresponding list of descriptor uuids.
    """
    new_uuids = [desc.uuid() for desc in descriptors]
    data = np.vstack(
        DescriptorElement.get_many_vectors(descriptors)).astype(np.float32)
    LOG.info(f"data shape, type: {data.shape}, {data.dtype}")
    LOG.info(f"# uuids: {len(new_uuids)}")
    return data, new_uuids
def build_index(self, descriptors: Iterable[DescriptorElement]) -> None:
    """
    Build the index based on the given iterable of descriptor elements.

    Subsequent calls to this method should rebuild the index, not add to
    it.

    :raises ValueError: No data available in the given iterable.

    :param descriptors: Iterable of descriptor elements to build index
        over.
    :type descriptors:
        collections.abc.Iterable[smqtk.representation.DescriptorElement]
    """
    # ordered cache of descriptors in our index.
    self._descr_cache = []
    # Reverse mapping of a descriptor's vector to its index in the cache
    # and subsequently in the distance kernel.
    self._descr2index = {}
    descriptors = list(descriptors)
    # matrix for creating distance kernel
    self._descr_matrix = numpy.array(
        DescriptorElement.get_many_vectors(descriptors))
    vector_iter = zip(descriptors, self._descr_matrix)
    for i, (d, v) in enumerate(vector_iter):
        self._descr_cache.append(d)
        self._descr2index[tuple(v)] = i
    # TODO: (?) For when we optimize SVM SV kernel computation
    # self._dist_kernel = \
    #     compute_distance_kernel(self._descr_matrix,
    #                             histogram_intersection_distance2,
    #                             row_wise=True)
    if self.descr_cache_fp:
        with open(self.descr_cache_fp, 'wb') as f:
            pickle.dump(self._descr_cache, f, -1)
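# --- Illustrative sketch (not part of the implementation above) -------------
# The reverse vector -> index mapping built in ``build_index`` keys a dict on
# ``tuple(v)`` because ndarrays are not hashable.  Toy matrix below.
import numpy

_toy_matrix = numpy.array([[0.0, 1.0], [1.0, 0.0]])
_toy_descr2index = {tuple(v): i for i, v in enumerate(_toy_matrix)}
# _toy_descr2index[(1.0, 0.0)] == 1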
def nn(self, d: DescriptorElement, n: int = 1
       ) -> Tuple[Tuple[DescriptorElement, ...], Tuple[float, ...]]:
    """
    Return the nearest `N` neighbors to the given descriptor element.

    :raises ValueError: Input query descriptor ``d`` has no vector set.
    :raises ValueError: Current index is empty.

    :param d: Descriptor element to compute the neighbors of.
    :param n: Number of nearest neighbors to find.

    :return: Tuple of nearest N DescriptorElement instances, and a tuple
        of the distance values to those neighbors.
    """
    if not d.has_vector():
        raise ValueError("Query descriptor did not have a vector set!")
    elif not self.count():
        raise ValueError("No index currently set to query from!")
    return self._nn(d, n)
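# --- Illustrative sketch (not part of the implementation above) -------------
# The ``nn`` / ``_nn`` pair above is a template-method split: the public
# method validates the query and index state, the protected method does the
# actual search.  The toy class below mirrors that structure with plain numpy
# arrays instead of SMQTK types; all names here are hypothetical.
from typing import List, Tuple
import numpy as np

class ToyBruteForceIndex:
    def __init__(self, vectors: List[np.ndarray]) -> None:
        self._m = np.vstack(vectors) if vectors else np.empty((0, 0))

    def count(self) -> int:
        return self._m.shape[0]

    def nn(self, q: np.ndarray, n: int = 1) -> Tuple[np.ndarray, np.ndarray]:
        # Public method: validate, then delegate to the internal search.
        if q is None:
            raise ValueError("Query descriptor did not have a vector set!")
        if not self.count():
            raise ValueError("No index currently set to query from!")
        return self._nn(q, n)

    def _nn(self, q: np.ndarray, n: int) -> Tuple[np.ndarray, np.ndarray]:
        # Internal method: brute-force euclidean search; inputs are already
        # validated by ``nn``.
        dists = np.linalg.norm(self._m - q, axis=1)
        order = dists.argsort()[:n]
        return self._m[order], dists[order]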
def _nn(self, d: DescriptorElement, n: int = 1
        ) -> Tuple[Tuple[DescriptorElement, ...], Tuple[float, ...]]:
    """
    Internal method to be implemented by sub-classes to return the nearest
    `N` neighbors to the given descriptor element.

    When this internal method is called, we have already checked that
    there is a vector in ``d`` and our index is not empty.

    :param d: Descriptor element to compute the neighbors of.
    :param n: Number of nearest neighbors to find.

    :return: Tuple of nearest N DescriptorElement instances, and a tuple
        of the distance values to those neighbors.
    """
    # Parent template method already assures there is a vector stored in
    # the input.
    d_vector = d.vector()
    assert d_vector is not None
    # Reshape into a 1xD vector with float32 type, which is required for
    # use with FAISS search.
    q = d_vector[np.newaxis, :].astype(np.float32)

    LOG.debug("Received query for %d nearest neighbors", n)

    with self._model_lock:
        if self._faiss_index is None:
            raise RuntimeError("No index currently available to query "
                               "from.")

        # Attempt to set n-probe of an IVF index.
        self._set_index_nprobe()

        s_dists: np.ndarray
        s_ids: np.ndarray
        s_dists, s_ids = self._faiss_index.search(
            q, k=min(n, self._faiss_index.ntotal))
        s_dists, s_ids = np.sqrt(s_dists[0, :]), s_ids[0, :]
        # Convert numpy.int64 type values into python integer values.
        # This is for compatibility when comparing values in some KVS
        # impls (postgres...).
        s_ids = s_ids.astype(object)
        # s_id (the FAISS index indices) can equal -1 if fewer than the
        # requested number of nearest neighbors is returned. In this case,
        # eliminate the -1 entries.
        LOG.debug("Getting descriptor UIDs from idx2uid mapping.")
        uuids = list(
            self._idx2uid_kvs.get_many(
                cast(Iterator[Hashable],
                     filter(lambda s_id_: s_id_ >= 0, s_ids))))
        if len(uuids) < n:
            warnings.warn(
                f"Fewer than n={n} neighbors were retrieved from "
                "the FAISS index instance. Maybe increase "
                "nprobe if this is an IVF index?",
                RuntimeWarning)

        descriptors = tuple(
            self._descriptor_set.get_many_descriptors(uuids))

    LOG.debug("Min and max FAISS distances: %g, %g",
              min(s_dists), max(s_dists))

    d_vectors = np.vstack(DescriptorElement.get_many_vectors(descriptors))
    d_dists = metrics.euclidean_distance(d_vectors, q)

    LOG.debug("Min and max descriptor distances: %g, %g",
              min(d_dists), max(d_dists))

    order = d_dists.argsort()
    # Order descriptors and distances together so the two returned tuples
    # stay aligned by ascending distance.
    descriptors, d_dists = zip(*((descriptors[oidx], d_dists[oidx])
                                 for oidx in order))

    LOG.debug("Returning query result of size %d", len(descriptors))

    return descriptors, tuple(d_dists)
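# --- Illustrative sketch (not part of the implementation above) -------------
# Raw FAISS call pattern that ``_nn`` wraps, with a flat L2 index standing in
# for whatever index type the real class is configured with (assumes the
# ``faiss`` package is installed; data is random toy data).  ``search``
# returns *squared* L2 distances, hence the ``np.sqrt`` above, and ids of -1
# mark missing neighbors, which get filtered out.
import faiss
import numpy as np

_d = 16
_xb = np.random.rand(100, _d).astype(np.float32)   # database vectors
_xq = np.random.rand(1, _d).astype(np.float32)     # single query, shape (1, d)

_index = faiss.IndexFlatL2(_d)
_index.add(_xb)
_sq_dists, _ids = _index.search(_xq, 5)
_dists = np.sqrt(_sq_dists[0])
_valid_ids = _ids[0][_ids[0] >= 0]                 # drop any -1 placeholders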
def __contains__(self, item: DescriptorElement) -> bool:
    if isinstance(item, DescriptorElement):
        # Testing for UUID inclusion since element hash is based on UUID
        # value.
        return self.has_descriptor(item.uuid())
    return False
def _train(self, class_examples, **extra_params):
    # Offset from 0 for positive class labels to use.
    # - Not using a label of 0 because we think libSVM wants positive
    #   labels.
    CLASS_LABEL_OFFSET = 1

    # Stuff for debug reporting.
    param_debug = {'-q': ''}
    if LOG.getEffectiveLevel() <= logging.DEBUG:
        param_debug = {}

    # Form libSVM problem input values.
    LOG.debug("Formatting problem input")
    train_labels = []
    train_vectors = []
    train_group_sizes = []  # number of examples per class
    self.svm_label_map = {}
    # Making SVM label assignment deterministic to alphabetic order.
    for i, l in enumerate(sorted(class_examples), CLASS_LABEL_OFFSET):
        # Map integer SVM label to semantic label.
        self.svm_label_map[i] = l
        LOG.debug('-- class %d (%s)', i, l)
        # requires a sequence, so making the iterable ``g`` a tuple
        g = class_examples[l]
        if not isinstance(g, collections.abc.Sequence):
            LOG.debug(' (expanding iterable into sequence)')
            g = tuple(g)
        train_group_sizes.append(float(len(g)))
        x = numpy.array(DescriptorElement.get_many_vectors(g))
        x = self._norm_vector(x)
        train_labels.extend([i] * x.shape[0])
        train_vectors.extend(x.tolist())
        del g, x

    assert len(train_labels) == len(train_vectors), \
        "Count mismatch between parallel labels and descriptor vectors " \
        "being sent to libSVM (%d != %d)" \
        % (len(train_labels), len(train_vectors))

    LOG.debug("Forming train params")
    #: :type: dict
    params = deepcopy(self.train_params)
    params.update(param_debug)
    # Calculating class weights if set to C-SVC type SVM.
    if '-s' not in params or int(params['-s']) == 0:
        # (john.moeller): The weighting should probably be the geometric
        # mean of the number of examples over the classes divided by the
        # number of examples for the current class.
        gmean = scipy.stats.gmean(train_group_sizes)
        for i, n in enumerate(train_group_sizes, CLASS_LABEL_OFFSET):
            w = gmean / n
            params['-w' + str(i)] = w
            LOG.debug("-- class '%s' weight: %s", self.svm_label_map[i], w)

    LOG.debug("Making parameters obj")
    svm_params = svmutil.svm_parameter(self._gen_param_string(params))
    LOG.debug("Creating SVM problem")
    svm_problem = svm.svm_problem(train_labels, train_vectors)
    del train_vectors
    LOG.debug("Training SVM model")
    self.svm_model = svmutil.svm_train(svm_problem, svm_params)
    LOG.debug("Training SVM model -- Done")

    if self.svm_label_map_elem and self.svm_label_map_elem.writable():
        LOG.debug("Saving labels to element (%s)", self.svm_label_map_elem)
        self.svm_label_map_elem.set_bytes(
            pickle.dumps(self.svm_label_map, -1))
    if self.svm_model_elem and self.svm_model_elem.writable():
        LOG.debug("Saving model to element (%s)", self.svm_model_elem)
        # LibSVM I/O only works with file paths, thus the need for an
        # intermediate temporary file.
        fd, fp = tempfile.mkstemp()
        try:
            svmutil.svm_save_model(fp, self.svm_model)
            # Use the existing file descriptor to create the file object.
            # This avoids reopening the file and automatically closes the
            # descriptor when the with block exits.
            with os.fdopen(fd, 'rb') as f:
                self.svm_model_elem.set_bytes(f.read())
        finally:
            os.remove(fp)
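# --- Illustrative sketch (not part of the implementation above) -------------
# Worked example of the C-SVC class-weight heuristic in ``_train``: each class
# weight is the geometric mean of all class sizes divided by that class's own
# size, so under-represented classes get proportionally larger weights.
# Class sizes below are made up.
import scipy.stats

_toy_group_sizes = [10.0, 40.0, 160.0]
_gmean = scipy.stats.gmean(_toy_group_sizes)          # 40.0 for these sizes
_toy_weights = {
    i: _gmean / size                                  # keyed by SVM label
    for i, size in enumerate(_toy_group_sizes, 1)     # CLASS_LABEL_OFFSET = 1
}
# _toy_weights == {1: 4.0, 2: 1.0, 3: 0.25}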
def _nn(
    self, d: DescriptorElement, n: int = 1
) -> Tuple[Tuple[DescriptorElement, ...], Tuple[float, ...]]:
    """
    Internal method to be implemented by sub-classes to return the nearest
    `N` neighbors to the given descriptor element.

    When this internal method is called, we have already checked that
    there is a vector in ``d`` and our index is not empty.

    :param d: Descriptor element to compute the neighbors of.
    :param n: Number of nearest neighbors to find.

    :return: Tuple of nearest N DescriptorElement instances, and a tuple
        of the distance values to those neighbors.
    """
    LOG.debug("generating hash for descriptor")
    d_v = d.vector()
    d_h = self.lsh_functor.get_hash(d_v)

    def comp_descr_dist(d2_v: numpy.ndarray) -> float:
        return self._distance_function(d_v, d2_v)

    with self._model_lock:
        LOG.debug("getting near hashes")
        hi = self.hash_index
        if hi is None:
            # Make on-the-fly linear index
            hi = LinearHashIndex()
            # not calling ``build_index`` because we already have the int
            # hashes.
            hi.index = set(
                cast(Iterator[int], self.hash2uuids_kvstore.keys()))
        near_hashes, _ = hi.nn(d_h, n)

        LOG.debug("getting UUIDs of descriptors for nearby hashes")
        neighbor_uuids: List[Hashable] = []
        for h_int in map(bit_vector_to_int_large, near_hashes):
            # If descriptor hash not in our map, we effectively skip it.
            # Get set of descriptor UUIDs for a hash code.
            near_uuids: Set[Hashable] = \
                self.hash2uuids_kvstore.get(h_int, set())
            # Accumulate matching descriptor UUIDs to a list.
            neighbor_uuids.extend(near_uuids)
        LOG.debug("-- matched %d UUIDs", len(neighbor_uuids))

        LOG.debug("getting descriptors for neighbor_uuids")
        neighbors = \
            list(self.descriptor_set.get_many_descriptors(neighbor_uuids))

    # Done with model parts at this point, so releasing lock.

    LOG.debug(
        f"ordering descriptors via distance method {self.distance_method}")
    LOG.debug('-- getting element vectors')
    neighbor_vectors = numpy.asarray(list(
        parallel_map(lambda d_: d_.vector(), neighbors)
    ))
    LOG.debug('-- calculating distances')
    distances = list(map(comp_descr_dist, neighbor_vectors))
    LOG.debug('-- ordering')
    ordered = sorted(zip(neighbors, distances), key=lambda p: p[1])
    LOG.debug(f'-- slicing top n={n}')
    r_descrs: Tuple[DescriptorElement, ...]
    r_dists: Tuple[float, ...]
    r_descrs, r_dists = zip(*(ordered[:n]))
    return r_descrs, r_dists
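# --- Illustrative sketch (not part of the implementation above) -------------
# Compact version of the candidate-then-rerank flow in the LSH ``_nn`` above,
# with plain dicts standing in for the key-value store and descriptor set
# (all data here is hypothetical toy data).
import numpy

_hash2uuids = {0b1010: {"a", "b"}, 0b1011: {"c"}}
_uuid2vec = {
    "a": numpy.array([0.0, 1.0]),
    "b": numpy.array([1.0, 1.0]),
    "c": numpy.array([1.0, 0.0]),
}
_query = numpy.array([0.9, 0.9])

_near_hashes = [0b1010, 0b1011]                       # from the hash index
_candidates = set().union(*(_hash2uuids.get(h, set()) for h in _near_hashes))
_ranked = sorted(_candidates,
                 key=lambda u: numpy.linalg.norm(_uuid2vec[u] - _query))
_top_n = _ranked[:2]                                  # "b" is nearest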
def _nn(self, d: DescriptorElement, n: int = 1
        ) -> Tuple[Tuple[DescriptorElement, ...], Tuple[float, ...]]:
    # Parent template method already checks that `d` has a non-None vector
    d_v = d.vector()

    def _query_single(tree: TreeElement) -> List[Hashable]:
        # Search a single tree for the leaf that matches the query
        # NB: random_basis has shape (levels, N)
        random_basis = tree.random_basis
        assert d_v is not None
        proj_query = d_v.dot(random_basis)
        splits = tree.splits
        idx = 0
        for level in range(depth):
            split_point = splits[idx]
            # Look at the level'th coordinate of proj_query
            if proj_query[level] < split_point:
                idx = 2 * idx + 1
            else:
                idx = 2 * idx + 2
        # idx will be `2^depth - 1` greater than the position of the leaf
        # in the list
        idx -= ((1 << depth) - 1)
        return tree.leaves[idx]

    def _exact_query(
        _uuids: Sequence[Hashable]
    ) -> Tuple[Sequence[Hashable], np.ndarray]:
        set_size = len(_uuids)
        LOG.debug(f"Exact query requested with {set_size} descriptors")

        # Assemble the array to query from the descriptors that match
        assert d_v is not None
        pts_array = np.empty((set_size, d_v.size), dtype=d_v.dtype)
        descriptors = self._descriptor_set.get_many_descriptors(_uuids)
        for i, desc in enumerate(descriptors):
            pts_array[i, :] = desc.vector()

        dists: np.ndarray = ((pts_array - d_v) ** 2).sum(axis=1)

        if n > dists.shape[0]:
            LOG.warning(
                f"There were fewer descriptors ({dists.shape[0]}) in the "
                f"set than requested in the query ({n}). Returning entire "
                f"set.")
        if n >= dists.shape[0]:
            return _uuids, dists

        near_indices = np.argpartition(dists, n - 1)[:n]
        return ([_uuids[idx] for idx in near_indices],
                dists[near_indices])

    with self._model_lock:
        LOG.debug(f"Received query for {n} nearest neighbors")

        depth, ntrees, db_size = self._depth, self._num_trees, self.count()
        leaf_size = db_size // (1 << depth)
        if leaf_size * ntrees < n:
            LOG.warning(
                f"The number of descriptors in a leaf ({leaf_size}) times "
                f"the number of trees ({ntrees}) is less than the number "
                f"of descriptors requested by the query ({n}). The query "
                f"result will be deficient.")

        # Take union of all tree hits
        tree_hits: Set[Hashable] = set()
        for t in self._trees:
            tree_hits.update(_query_single(t))

        hit_union = len(tree_hits)
        LOG.debug(
            f"Query (k): {n}, Hit union (h): {hit_union}, "
            f"DB (N): {db_size}, Leaf size (L = N/2^l): {leaf_size}, "
            f"Examined (T*L): {leaf_size * ntrees}")
        LOG.debug(f"k/L = {n / leaf_size:.3f}")
        LOG.debug(f"h/N = {hit_union / db_size:.3f}")
        LOG.debug(f"h/L = {hit_union / leaf_size:.3f}")
        LOG.debug(f"h/(T*L) = {hit_union / (leaf_size * ntrees):.3f}")

        uuids, distances = _exact_query(list(tree_hits))
        order = distances.argsort()
        uuids, distances = zip(*((uuids[oidx], distances[oidx])
                                 for oidx in order))
        LOG.debug(f"Returning query result of size {len(uuids)}")

        return (tuple(self._descriptor_set.get_many_descriptors(uuids)),
                tuple(distances))
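# --- Illustrative sketch (not part of the implementation above) -------------
# The implicit binary-tree indexing used by ``_query_single``: a node at array
# index ``i`` has children at ``2*i + 1`` and ``2*i + 2``, and after
# descending ``depth`` levels the reached index is offset by ``2**depth - 1``
# to land in the flat leaf list.  Projection values and split points below are
# made up.
_depth = 2
_proj_query = [0.7, 0.1]                        # one projected value per level
_splits = [0.5, 0.2, 0.3, 0.0, 0.0, 0.0, 0.0]   # heap-ordered split points

_idx = 0
for _level in range(_depth):
    if _proj_query[_level] < _splits[_idx]:
        _idx = 2 * _idx + 1                     # go left
    else:
        _idx = 2 * _idx + 2                     # go right
_leaf_position = _idx - ((1 << _depth) - 1)
# 0.7 >= 0.5 -> _idx = 2; 0.1 < 0.3 -> _idx = 5; leaf position = 5 - 3 = 2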