def iter_tocompute_arrays(): """ Yield descriptor vectors for classification elements that need computing yet. :rtype: typing.Generator[numpy.ndarray] """ # Force into an iterator. descr_iterator = iter(descr_iter) # Running var for the index of final data element in input # iterator. This will be -1 or the value of the final index in the # parallel lists. last_i = -1 # Make successive islices into iterator of descriptor elements to # produces batches. We end when there is nothing left being # returned by the iterator de_batch_list = \ list(itertools.islice(descr_iterator, d_elem_batch)) # Fully qualified path to this classifier implementation type. This # should be unique among concrete classifier implementations. self_name = f"{self.__module__}.{self.__class__.__name__}" while de_batch_list: # Get vectors from batch using implementation-level batch # aggregation methods where applicable. de_batch_vecs = \ DescriptorElement.get_many_vectors(de_batch_list) for d_elem, d_vec in zip(de_batch_list, de_batch_vecs): d_uid = d_elem.uuid() if d_vec is None: raise ValueError( "Encountered DescriptorElement with " "no vector stored! (UID=`{}`)".format(d_uid)) c_elem_ = factory.new_classification(self_name, d_uid) already_computed = \ not overwrite and c_elem_.has_classifications() elem_and_status_q.append((c_elem_, already_computed)) if not already_computed: # Classifications should be computed for this # descriptor log_debug( "Yielding descriptor array with UID `{}` " "for classification generation.".format(d_uid)) yield d_vec else: log_debug("Classification already generated for UID " "`{}`.".format(d_uid)) last_i += len(de_batch_vecs) # Slice out the next batch of descriptor elements. This will be # empty if the iterator has been exhausted. de_batch_list = list( itertools.islice(descr_iterator, d_elem_batch)) end_of_iter[0] = last_i
def get_many_vectors(self, uuids: Iterable[Hashable]) -> List[Optional[np.ndarray]]:
    """
    Get underlying vectors of descriptors associated with given uuids.

    :param uuids: Iterable of descriptor UUIDs to query for.

    :raises: KeyError: When there is not a descriptor in this set for
        one or more input UIDs.

    :return: List of vectors for descriptors associated with given uuid
        values.
    """
    # Resolve UUIDs to descriptor elements first, then delegate to the
    # batch vector-retrieval class method.
    matching_descriptors = self.get_many_descriptors(uuids)
    return DescriptorElement.get_many_vectors(matching_descriptors)
def _train(self, class_examples, **extra_params):
    """
    Fit the underlying model from per-class descriptor examples.

    :param class_examples: Mapping of class label to an iterable of
        DescriptorElement instances to use as training examples for
        that label.
    :param extra_params: Additional parameters (currently unused here).

    :raises ValueError: One or more descriptor elements for a label had
        no vector stored.
    """
    # Convert descriptor elements into a combined ndarray with an
    # associated label vector.
    vec_list = []
    label_list = []
    for label, examples in class_examples.items():
        label_vectors = \
            DescriptorElement.get_many_vectors(examples)
        # Count ``None`` entries explicitly -- ``is``-based or ``count``
        # methods misbehave when elements are numpy arrays.
        none_count = sum(1 for e in label_vectors if e is None)
        # Was previously an ``assert``, which is stripped under ``-O``;
        # raise explicitly so invalid input always fails.
        if none_count != 0:
            raise ValueError(
                "Some descriptor elements for label {} did not contain "
                "vectors! (n={})".format(label, none_count)
            )
        vec_list.extend(label_vectors)
        label_list.extend([label] * len(label_vectors))
    vec_list = np.vstack(vec_list)
    self.fit(vec_list, label_list)
def build_index(self, descriptors: Iterable[DescriptorElement]) -> None:
    """
    Cache given descriptor element vectors into a matrix for use during
    ``rank``.

    :param descriptors: Iterable of descriptor elements to index.

    :raises ValueError: No descriptor elements given, or one or more
        elements lacked a vector / vectors were of incongruent
        dimensionality.
    """
    descr_elem_list = list(descriptors)
    if len(descr_elem_list) == 0:
        raise ValueError("No descriptor elements passed.")
    # NOTE: this fails if multiple descriptor elements with the same UID
    # are included. There will be None's present.
    descr_matrix = np.asarray(
        DescriptorElement.get_many_vectors(descr_elem_list))
    # If the result matrix is of dtype(object), then either some elements
    # did not have vectors or some vectors were not of congruent
    # dimensionality.
    if descr_matrix.dtype == np.dtype(object):
        # Fixed error message: previously said "were of congruent
        # dimensionality", which inverted the actual failure condition.
        raise ValueError("One or more descriptor elements did not have a "
                         "vector set or were not of congruent "
                         "dimensionality.")
    self._descr_elem_list = descr_elem_list
    self._descr_matrix = descr_matrix
def _descriptors_to_matrix(
    self, descriptors: List[DescriptorElement]
) -> Tuple[np.ndarray, Sequence[Hashable]]:
    """
    Extract an (n,d) array with the descriptor vectors in each row,
    and a corresponding list of uuids from the list of descriptors.

    :param descriptors: List descriptor elements to add to this index.

    :return: An (n,d) array of descriptors (d-dim descriptors in n
        rows), and the corresponding list of descriptor uuids.
    """
    # Collect UUIDs in the same order as the input descriptor list.
    new_uuids = [elem.uuid() for elem in descriptors]
    # Stack all element vectors into one float32 matrix, one row each.
    vectors = DescriptorElement.get_many_vectors(descriptors)
    data = np.vstack(vectors).astype(np.float32)
    LOG.info(f"data shape, type: {data.shape}, {data.dtype}")
    LOG.info(f"# uuids: {len(new_uuids)}")
    return data, new_uuids
def build_index(self, descriptors: Iterable[DescriptorElement]) -> None:
    """
    Build the index based on the given iterable of descriptor elements.

    Subsequent calls to this method should rebuild the index, not add to
    it.

    :raises ValueError: No data available in the given iterable.

    :param descriptors:
        Iterable of descriptor elements to build index over.
    :type descriptors:
        collections.abc.Iterable[smqtk.representation.DescriptorElement]
    """
    # Reset the ordered descriptor cache and the reverse vector->index
    # mapping (index positions also apply to the distance kernel).
    self._descr_cache = []
    self._descr2index = {}
    descriptors = list(descriptors)
    # Matrix of element vectors used for creating the distance kernel.
    self._descr_matrix = numpy.array(
        DescriptorElement.get_many_vectors(descriptors))
    for idx, elem in enumerate(descriptors):
        self._descr_cache.append(elem)
        row = self._descr_matrix[idx]
        self._descr2index[tuple(row)] = idx
    # TODO: (?) For when we optimize SVM SV kernel computation
    # self._dist_kernel = \
    #    compute_distance_kernel(self._descr_matrix,
    #                            histogram_intersection_distance2,
    #                            row_wise=True)
    if self.descr_cache_fp:
        # Persist the descriptor cache to disk when a path is configured.
        with open(self.descr_cache_fp, 'wb') as f:
            pickle.dump(self._descr_cache, f, -1)
def _nn(self, d: DescriptorElement, n: int = 1
        ) -> Tuple[Tuple[DescriptorElement, ...], Tuple[float, ...]]:
    """
    Internal method to be implemented by sub-classes to return the
    nearest `N` neighbors to the given descriptor element.

    When this internal method is called, we have already checked that
    there is a vector in ``d`` and our index is not empty.

    :param d: Descriptor element to compute the neighbors of.
    :param n: Number of nearest neighbors to find.

    :return: Tuple of nearest N DescriptorElement instances, and a
        tuple of the distance values to those neighbors, both ordered
        by ascending distance.
    """
    # Parent template method already assures there is a vector stored in
    # the input.
    d_vector = d.vector()
    assert d_vector is not None
    # Reshape into a 1xD vector with float32 type, which is required for
    # use with FAISS search.
    q = d_vector[np.newaxis, :].astype(np.float32)
    LOG.debug("Received query for %d nearest neighbors", n)
    with self._model_lock:
        if self._faiss_index is None:
            # Fixed error message: previously said "to remove from",
            # copy-pasted from the removal method.
            raise RuntimeError("No index currently available to "
                               "search.")
        # Attempt to set n-probe of an IVF index
        self._set_index_nprobe()
        s_dists: np.ndarray
        s_ids: np.ndarray
        s_dists, s_ids = self._faiss_index.search(
            q, k=min(n, self._faiss_index.ntotal))
        s_dists, s_ids = np.sqrt(s_dists[0, :]), s_ids[0, :]
        # Convert numpy.int64 type values into python integer values.
        # This is for compatibility when comparing values in some KVS
        # impls (postgres...).
        s_ids = s_ids.astype(object)
        # s_id (the FAISS index indices) can equal -1 if fewer than the
        # requested number of nearest neighbors is returned. In this
        # case, eliminate the -1 entries.
        LOG.debug("Getting descriptor UIDs from idx2uid mapping.")
        uuids = list(
            self._idx2uid_kvs.get_many(
                cast(
                    Iterator[Hashable],
                    filter(lambda s_id_: s_id_ >= 0, s_ids))))
        if len(uuids) < n:
            warnings.warn(
                f"Less than n={n} neighbors were retrieved from "
                "the FAISS index instance. Maybe increase "
                "nprobe if this is an IVF index?",
                RuntimeWarning)
        descriptors = tuple(
            self._descriptor_set.get_many_descriptors(uuids))
        LOG.debug("Min and max FAISS distances: %g, %g",
                  min(s_dists), max(s_dists))
        # Recompute exact euclidean distances against the query for the
        # returned descriptors.
        d_vectors = np.vstack(DescriptorElement.get_many_vectors(descriptors))
        d_dists = metrics.euclidean_distance(d_vectors, q)
        LOG.debug("Min and max descriptor distances: %g, %g",
                  min(d_dists), max(d_dists))
        order = d_dists.argsort()
        uuids, d_dists = zip(*((uuids[oidx], d_dists[oidx])
                               for oidx in order))
        # BUG FIX: previously ``descriptors`` was returned in FAISS
        # result order while the distances were sorted, misaligning
        # each neighbor with its reported distance. Apply the same
        # ordering to the descriptor tuple.
        descriptors = tuple(descriptors[oidx] for oidx in order)
        LOG.debug("Returning query result of size %g", len(uuids))
        return descriptors, tuple(d_dists)
def _train(self, class_examples, **extra_params):
    """
    Train a libSVM model from per-class descriptor examples.

    Builds parallel label/vector lists, optionally computes per-class
    weights for C-SVC, trains via ``svmutil.svm_train``, and persists
    the label map and model to their data elements when writable.

    :param class_examples: Mapping of class label to an iterable of
        DescriptorElement training examples for that label.
    :param extra_params: Additional parameters (unused in this body).
    """
    # Offset from 0 for positive class labels to use
    # - not using label of 0 because we think libSVM wants positive labels
    CLASS_LABEL_OFFSET = 1

    # Stuff for debug reporting: pass '-q' (quiet) to libSVM unless we
    # are at DEBUG log level.
    param_debug = {'-q': ''}
    if LOG.getEffectiveLevel() <= logging.DEBUG:
        param_debug = {}

    # Form libSVM problem input values
    LOG.debug("Formatting problem input")
    train_labels = []
    train_vectors = []
    train_group_sizes = []  # number of examples per class
    self.svm_label_map = {}
    # Making SVM label assignment deterministic to alphabetic order
    for i, l in enumerate(sorted(class_examples), CLASS_LABEL_OFFSET):
        # Map integer SVM label to semantic label
        self.svm_label_map[i] = l
        LOG.debug('-- class %d (%s)', i, l)
        # requires a sequence, so making the iterable ``g`` a tuple
        g = class_examples[l]
        if not isinstance(g, collections.abc.Sequence):
            LOG.debug(' (expanding iterable into sequence)')
            g = tuple(g)
        train_group_sizes.append(float(len(g)))
        x = numpy.array(DescriptorElement.get_many_vectors(g))
        x = self._norm_vector(x)
        train_labels.extend([i] * x.shape[0])
        train_vectors.extend(x.tolist())
        # Release per-class intermediates before the next iteration.
        del g, x

    assert len(train_labels) == len(train_vectors), \
        "Count mismatch between parallel labels and descriptor vectors" \
        "being sent to libSVM (%d != %d)" \
        % (len(train_labels), len(train_vectors))

    LOG.debug("Forming train params")
    #: :type: dict
    params = deepcopy(self.train_params)
    params.update(param_debug)
    # Calculating class weights if set to C-SVC type SVM
    if '-s' not in params or int(params['-s']) == 0:
        # (john.moeller): The weighting should probably be the geometric
        # mean of the number of examples over the classes divided by the
        # number of examples for the current class.
        gmean = scipy.stats.gmean(train_group_sizes)
        for i, n in enumerate(train_group_sizes, CLASS_LABEL_OFFSET):
            w = gmean / n
            params['-w' + str(i)] = w
            LOG.debug("-- class '%s' weight: %s",
                      self.svm_label_map[i], w)

    LOG.debug("Making parameters obj")
    svm_params = svmutil.svm_parameter(self._gen_param_string(params))
    LOG.debug("Creating SVM problem")
    svm_problem = svm.svm_problem(train_labels, train_vectors)
    del train_vectors
    LOG.debug("Training SVM model")
    self.svm_model = svmutil.svm_train(svm_problem, svm_params)
    LOG.debug("Training SVM model -- Done")

    if self.svm_label_map_elem and self.svm_label_map_elem.writable():
        LOG.debug("saving labels to element (%s)",
                  self.svm_label_map_elem)
        self.svm_label_map_elem.set_bytes(
            pickle.dumps(self.svm_label_map, -1))
    if self.svm_model_elem and self.svm_model_elem.writable():
        LOG.debug("saving model to element (%s)",
                  self.svm_model_elem)
        # LibSvm I/O only works with filepaths, thus the need for an
        # intermediate temporary file.
        fd, fp = tempfile.mkstemp()
        try:
            svmutil.svm_save_model(fp, self.svm_model)
            # Use the file descriptor to create the file object.
            # This avoids reopening the file and will automatically
            # close the file descriptor on exiting the with block.
            # fdopen() is required because in Python 2 open() does
            # not accept a file descriptor.
            with os.fdopen(fd, 'rb') as f:
                self.svm_model_elem.set_bytes(f.read())
        finally:
            os.remove(fp)