def _descriptors_to_matrix(self, descriptors):
    """
    Pack the vectors of the given descriptor elements into one float32
    matrix (one descriptor per row) and collect their UUIDs.

    :param descriptors: Non-empty list of descriptor elements. The vector
        of the first element determines the number of matrix columns.
    :type descriptors: list[smqtk.representation.DescriptorElement]

    :return: An (n,d) array of descriptors (d-dim descriptors in n rows),
        and the list of descriptor UUIDs in the same order as the input.
    :rtype: (np.ndarray, list[collections.Hashable])

    """
    uuid_list = [element.uuid() for element in descriptors]
    num_rows = len(uuid_list)
    num_cols = descriptors[0].vector().size

    # Pre-allocate the output. This method relies on
    # ``elements_to_matrix`` writing into the array passed as ``mat``.
    matrix = np.empty((num_rows, num_cols), dtype=np.float32)
    elements_to_matrix(
        descriptors,
        mat=matrix,
        use_multiprocessing=self.use_multiprocessing,
        report_interval=1.0,
    )

    self._log.info("data shape, type: %s, %s", matrix.shape, matrix.dtype)
    self._log.info("# uuids: %d", num_rows)
    return matrix, uuid_list
def _build_faiss_model(self):
    """
    Build ``self._faiss_index`` from every descriptor currently held in
    ``self._descriptor_set``.

    When ``self.exhaustive`` is set, a flat L2 index wrapped in an ID map
    is used. Otherwise an IVF-flat index (10000 cells, 5000 probes) is
    trained on the data. In both cases the vectors are added with the
    descriptor-set keys as their FAISS ids.
    """
    first_element = next(self._descriptor_set.iterdescriptors())
    dim = first_element.vector().size
    num_vectors = self.count()

    # Pre-allocated float32 matrix filled via the ``mat`` argument.
    vectors = np.empty((num_vectors, dim), dtype=np.float32)
    elements_to_matrix(
        self._descriptor_set,
        mat=vectors,
        use_multiprocessing=self.use_multiprocessing,
        report_interval=1.0,
    )

    self._uuids = np.array(list(self._descriptor_set.keys()))

    self.faiss_flat = faiss.IndexFlatL2(dim)
    if not self.exhaustive:
        num_cells = 10000
        self._faiss_index = faiss.IndexIVFFlat(
            self.faiss_flat, dim, num_cells, faiss.METRIC_L2
        )
        # The IVF index is trained on the data before vectors are added.
        self._faiss_index.train(vectors)
        self._faiss_index.nprobe = 5000
    else:
        self._faiss_index = faiss.IndexIDMap(self.faiss_flat)

    self._log.info("data shape, type: %s, %s",
                   vectors.shape, vectors.dtype)
    self._log.info("uuid shape, type: %s, %s",
                   self._uuids.shape, self._uuids.dtype)

    self._faiss_index.add_with_ids(vectors, self._uuids)

    self._log.info("FAISS index has been constructed with %d vectors",
                   self._faiss_index.ntotal)
def nn(self, d, n=1):
    """
    Return the nearest `N` neighbors to the given descriptor element.

    The FAISS index supplies the candidate ids. Exact Euclidean distances
    are then recomputed from the stored descriptor vectors, and both the
    descriptors and the distances are returned in ascending order of that
    recomputed distance.

    :param d: Descriptor element to compute the neighbors of.
    :type d: smqtk.representation.DescriptorElement

    :param n: Number of nearest neighbors to find.
    :type n: int

    :return: Tuple of nearest N DescriptorElement instances, and a tuple
        of the distance values to those neighbors (parallel sequences).
    :rtype: (tuple[smqtk.representation.DescriptorElement], tuple[float])

    """
    super(FaissNearestNeighborsIndex, self).nn(d, n)

    q = d.vector().reshape(1, -1).astype(np.float32)
    self._log.debug("Received query for %d nearest neighbors", n)

    dists, ids = self._faiss_index.search(q, n)
    # Select the single query row explicitly. ``squeeze()`` would collapse
    # the (1, n) results to 0-d arrays when ``n == 1`` (the default), and
    # those can neither be indexed nor iterated.
    faiss_dists, uuids = np.sqrt(dists[0, :]), ids[0, :]

    descriptors = tuple(self._descriptor_set.get_many_descriptors(uuids))
    d_vectors = elements_to_matrix(descriptors)
    d_dists = np.sqrt(((d_vectors - q) ** 2).sum(axis=1))

    # Log the FAISS distances while they are still available, separately
    # from the recomputed descriptor distances.
    self._log.debug("Min and max FAISS distances: %g, %g",
                    min(faiss_dists), max(faiss_dists))
    self._log.debug("Min and max descriptor distances: %g, %g",
                    min(d_dists), max(d_dists))

    # Re-order descriptors and distances together so the returned
    # sequences stay parallel.
    order = d_dists.argsort()
    descriptors = tuple(descriptors[oidx] for oidx in order)
    d_dists = d_dists[order]

    self._log.debug("Returning query result of size %g", len(descriptors))
    return descriptors, tuple(d_dists)
def nn(self, d, n=1):
    """
    Return the nearest `N` neighbors to the given descriptor element.

    Candidates are gathered from the code-index bins whose small codes
    are closest (by hamming distance) to the query's small code, and are
    then ranked by ``self._dist_func`` against the query vector.

    :param d: Descriptor element to compute the neighbors of.
    :type d: smqtk.representation.DescriptorElement

    :param n: Number of nearest neighbors to find.
    :type n: int

    :return: Tuple of nearest N DescriptorElement instances, and a tuple
        of the distance values to those neighbors.
    :rtype: (tuple[smqtk.representation.DescriptorElement], tuple[float])

    """
    query_vec, _, query_code = self.get_small_code(d)

    # Keep the ``n`` codes nearest to the query code. A code may hold
    # several descriptors, and codes are assumed to exist only when at
    # least one element is associated with them, so ``n`` codes suffice.
    self._log.debug("fetching nearest %d codes", n)
    known_codes = self._code_index.codes()

    def code_distance(code):
        return distance_functions.hamming_distance(query_code, code)

    # TODO: Optimize this step
    #: :type: list[int]
    closest_codes = heapq.nsmallest(n, known_codes, code_distance)

    # Pull descriptors from progressively farther bins until at least
    # ``n`` (or as many as are indexed) have been collected.
    self._log.debug("Collecting descriptors from near codes")
    #: :type: list[smqtk.representation.DescriptorElement]
    candidates = []
    enough = min(n, self.count())
    for code in closest_codes:
        candidates.extend(self._code_index.get_descriptors(code))
        # Farther codes are unlikely to contribute closer descriptors.
        if len(candidates) >= enough:
            break

    # Fine-grained distance computation over the collected candidates.
    self._log.debug("elements to numpy")
    candidate_vectors = elements_to_matrix(candidates,
                                           use_multiprocessing=False,
                                           report_interval=1)

    self._log.debug("Sorting descriptors: %d", len(candidates))
    candidate_dists = [self._dist_func(query_vec, vec)
                       for vec in candidate_vectors]

    # Rank by distance and keep the top ``n``.
    self._log.debug("Forming output")
    ranked = sorted(zip(candidates, candidate_dists),
                    key=lambda pair: pair[1])
    neighbors, distances = zip(*ranked[:n])
    return neighbors, distances
def nn(self, d, n=1):
    """
    Return the nearest `N` neighbors to the given descriptor element.

    The query vector is hashed, nearby hashes are found through the hash
    index (or an on-the-fly linear index over the known hashes), and the
    descriptors mapped to those hashes are ranked by
    ``self._distance_function``.

    :param d: Descriptor element to compute the neighbors of.
    :type d: smqtk.representation.DescriptorElement

    :param n: Number of nearest neighbors to find.
    :type n: int

    :return: Tuple of nearest N DescriptorElement instances, and a tuple
        of the distance values to those neighbors.
    :rtype: (tuple[smqtk.representation.DescriptorElement], tuple[float])

    """
    super(LSHNearestNeighborIndex, self).nn(d, n)

    self._log.debug("generating hash for descriptor")
    d_v = d.vector()
    d_h = self.lsh_functor.get_hash(d_v)

    def comp_descr_dist(d2_v):
        return self._distance_function(d_v, d2_v)

    self._log.debug("getting near hashes")
    hi = self.hash_index
    # Make on-the-fly linear index if we weren't originally set with one
    if hi is None:
        hi = LinearHashIndex()
        # not calling ``build_index`` because we already have the int
        # hashes.
        with self._hash2uuid_lock:
            # ``list(...)`` is required so that a key *view* (Python 3)
            # becomes a 1-d array instead of a 0-d object array.
            hi.index = numpy.array(list(self._hash2uuid.keys()))
    hashes, _ = hi.nn(d_h, n)

    self._log.debug("getting UUIDs of descriptors for nearby hashes")
    neighbor_uuids = []
    with self._hash2uuid_lock:
        for h_int in map(bit_vector_to_int_large, hashes):
            # If descriptor hash not in our map, we effectively skip it
            neighbor_uuids.extend(self._hash2uuid.get(h_int, ()))
    self._log.debug("-- matched %d UUIDs", len(neighbor_uuids))

    self._log.debug("getting descriptors for neighbor_uuids")
    neighbors = \
        list(self.descriptor_index.get_many_descriptors(neighbor_uuids))

    self._log.debug("ordering descriptors via distance method '%s'",
                    self.distance_method)
    self._log.debug('-- getting element vectors')
    neighbor_vectors = elements_to_matrix(neighbors, report_interval=1.0)
    self._log.debug('-- calculating distances')
    distances = list(map(comp_descr_dist, neighbor_vectors))
    self._log.debug('-- ordering')
    ordered = sorted(zip(neighbors, distances), key=lambda p: p[1])
    self._log.debug('-- slicing top n=%d', n)
    # Materialize the result so callers get concrete sequences rather
    # than a lazy iterator on Python 3 (identical on Python 2).
    return list(zip(*(ordered[:n])))
def nn(self, d, n=1):
    """
    Return the nearest `N` neighbors to the given descriptor element.

    Nearby hashes of the query's LSH hash are looked up, the descriptor
    UUIDs stored for those hashes in ``self.hash2uuids_kvstore`` are
    resolved to descriptors, and those are ranked by
    ``self._distance_function``.

    :param d: Descriptor element to compute the neighbors of.
    :type d: smqtk.representation.DescriptorElement

    :param n: Number of nearest neighbors to find.
    :type n: int

    :return: Tuple of nearest N DescriptorElement instances, and a tuple
        of the distance values to those neighbors.
    :rtype: (tuple[smqtk.representation.DescriptorElement], tuple[float])

    """
    super(LSHNearestNeighborIndex, self).nn(d, n)

    self._log.debug("generating hash for descriptor")
    query_vec = d.vector()
    query_hash = self.lsh_functor.get_hash(query_vec)

    self._log.debug("getting near hashes")
    hash_idx = self.hash_index
    if hash_idx is None:
        # No hash index configured: build a throw-away linear index.
        # ``build_index`` is not called because the int hashes are
        # already known.
        hash_idx = LinearHashIndex()
        known_hashes = list(self.hash2uuids_kvstore.keys())
        hash_idx.index = numpy.array(known_hashes)
    near_hashes, _ = hash_idx.nn(query_hash, n)

    self._log.debug("getting UUIDs of descriptors for nearby hashes")
    # Hashes missing from the store contribute nothing.
    neighbor_uuids = [
        uid
        for h_int in map(bit_vector_to_int_large, near_hashes)
        for uid in self.hash2uuids_kvstore.get(h_int, ())
    ]
    self._log.debug("-- matched %d UUIDs", len(neighbor_uuids))

    self._log.debug("getting descriptors for neighbor_uuids")
    neighbors = list(
        self.descriptor_index.get_many_descriptors(neighbor_uuids)
    )

    self._log.debug("ordering descriptors via distance method '%s'",
                    self.distance_method)
    self._log.debug('-- getting element vectors')
    neighbor_vectors = elements_to_matrix(neighbors, report_interval=1.0)
    self._log.debug('-- calculating distances')
    distances = map(
        lambda other_vec: self._distance_function(query_vec, other_vec),
        neighbor_vectors
    )
    self._log.debug('-- ordering')
    ranked = sorted(zip(neighbors, distances), key=lambda pair: pair[1])
    self._log.debug('-- slicing top n=%d', n)
    top_n = ranked[:n]
    return zip(*top_n)
def fit(self, descriptors, use_multiprocessing=True):
    """
    Fit this functor's model to the given descriptors by generating a
    seeded random projection matrix (``self.rps``) whose row count matches
    the descriptor dimensionality and whose column count is
    ``self.bit_length``.

    :param descriptors: Iterable of ``DescriptorElement`` vectors to fit
        the model to.
    :type descriptors:
        collections.Iterable[smqtk.representation.DescriptorElement]

    :param use_multiprocessing: If multiprocessing should be used, as
        opposed to threading, for collecting descriptor vectors from the
        provided iterable.
    :type use_multiprocessing: bool

    :raises RuntimeError: There is already a model loaded

    :return: Matrix hash codes for provided descriptors in order.
    :rtype: numpy.ndarray[bool]

    """
    if self.has_model():
        raise RuntimeError("Model components have already been loaded.")

    # Progress reporting is only enabled when debug logging is active.
    debugging = self.get_logger().getEffectiveLevel() <= logging.DEBUG
    dbg_report_interval = 1.0 if debugging else None  # seconds

    if not hasattr(descriptors, "__len__"):
        # A sized sequence is needed, so drain the iterable into a list.
        self._log.info("Creating sequence from iterable")
        collected = []
        pr = ProgressReporter(self._log.debug, dbg_report_interval).start()
        for element in descriptors:
            collected.append(element)
            if debugging:
                pr.increment_report()
        if debugging:
            pr.report()
        descriptors = collected

    self._log.info("Creating matrix of descriptors for fitting")
    x = elements_to_matrix(
        descriptors,
        report_interval=dbg_report_interval,
        use_multiprocessing=use_multiprocessing,
    )
    self._log.debug("descriptor matrix shape: %s", x.shape)
    _, dim = x.shape

    self._log.debug("Generating random projections")
    # NOTE: this seeds numpy's global random state.
    np.random.seed(self.random_seed)
    self.rps = np.random.randn(dim, self.bit_length)

    self._log.debug("Info normalizing descriptors with norm type: %s",
                    self.normalize)
    return self.get_hash(x)
def fit(self, descriptors, use_multiprocessing=True):
    """
    Fit this functor's model to the given descriptors by generating a
    seeded random projection matrix (``self.rps``) of shape
    (descriptor dimension, ``self.bit_length``).

    :param descriptors: Iterable of ``DescriptorElement`` vectors to fit
        the model to.
    :type descriptors:
        collections.Iterable[smqtk.representation.DescriptorElement]

    :param use_multiprocessing: If multiprocessing should be used, as
        opposed to threading, for collecting descriptor vectors from the
        provided iterable.
    :type use_multiprocessing: bool

    :raises RuntimeError: There is already a model loaded

    :return: Matrix hash codes for provided descriptors in order.
    :rtype: numpy.ndarray[bool]

    """
    if self.has_model():
        raise RuntimeError("Model components have already been loaded.")

    # Report progress (every second) only when debug logging is active.
    if self.get_logger().getEffectiveLevel() <= logging.DEBUG:
        dbg_report_interval = 1.0
    else:
        dbg_report_interval = None

    if not hasattr(descriptors, "__len__"):
        # A sized sequence is needed, so drain the iterable into a list.
        self._log.info("Creating sequence from iterable")
        collected = []
        report_state = [0] * 7
        for element in descriptors:
            collected.append(element)
            report_progress(self._log.debug, report_state,
                            dbg_report_interval)
        descriptors = collected

    self._log.info("Creating matrix of descriptors for fitting")
    x = elements_to_matrix(
        descriptors,
        report_interval=dbg_report_interval,
        use_multiprocessing=use_multiprocessing,
    )
    self._log.debug("descriptor matrix shape: %s", x.shape)
    _, dim = x.shape

    self._log.debug("Generating random projections")
    # NOTE: this seeds numpy's global random state.
    np.random.seed(self.random_seed)
    self.rps = np.random.randn(dim, self.bit_length)

    self._log.debug("Info normalizing descriptors with norm type: %s",
                    self.normalize)
    return self.get_hash(x)
def _nn(self, d, n=1):
    """
    Internal method to be implemented by sub-classes to return the nearest
    `N` neighbors to the given descriptor element.

    When this internal method is called, we have already checked that
    there is a vector in ``d`` and our index is not empty.

    The FAISS index supplies candidate indices. Exact distances are then
    recomputed from the stored descriptor vectors, and descriptors and
    distances are returned together in ascending order of that distance.

    :param d: Descriptor element to compute the neighbors of.
    :type d: smqtk.representation.DescriptorElement

    :param n: Number of nearest neighbors to find.
    :type n: int

    :return: Tuple of nearest N DescriptorElement instances, and a tuple
        of the distance values to those neighbors (parallel sequences).
        Fewer than ``n`` results are returned if FAISS finds fewer.
    :rtype: (tuple[smqtk.representation.DescriptorElement], tuple[float])

    """
    q = d.vector()[np.newaxis, :].astype(np.float32)

    self._log.debug("Received query for %d nearest neighbors", n)

    with self._model_lock:
        s_dists, s_ids = self._faiss_index.search(q, n)
        s_dists, s_ids = np.sqrt(s_dists[0, :]), s_ids[0, :]
        # FAISS pads the result with id -1 when it finds fewer than ``n``
        # neighbors; those entries have no mapping and must be dropped.
        found = s_ids >= 0
        s_dists, s_ids = s_dists[found], s_ids[found]
        uuids = [self._idx2uid_kvs[s_id] for s_id in s_ids]
        # Materialize while holding the lock in case the descriptor set
        # returns a lazy iterator.
        descriptors = tuple(
            self._descriptor_set.get_many_descriptors(uuids)
        )

    if not descriptors:
        self._log.debug("Returning query result of size %g", 0)
        return (), ()

    self._log.debug("Min and max FAISS distances: %g, %g",
                    min(s_dists), max(s_dists))

    d_vectors = elements_to_matrix(descriptors)
    d_dists = metrics.euclidean_distance(d_vectors, q)

    self._log.debug("Min and max descriptor distances: %g, %g",
                    min(d_dists), max(d_dists))

    # Apply the same ordering to descriptors and distances so that the
    # two returned sequences stay parallel.
    order = d_dists.argsort()
    descriptors = tuple(descriptors[oidx] for oidx in order)
    d_dists = tuple(d_dists[oidx] for oidx in order)

    self._log.debug("Returning query result of size %g", len(descriptors))

    return descriptors, d_dists
def _nn(self, d, n=1):
    """
    Internal method to be implemented by sub-classes to return the nearest
    `N` neighbors to the given descriptor element.

    When this internal method is called, we have already checked that
    there is a vector in ``d`` and our index is not empty.

    Hash lookup and descriptor retrieval happen under
    ``self._model_lock``; distance computation and ordering happen after
    the lock is released.

    :param d: Descriptor element to compute the neighbors of.
    :type d: smqtk.representation.DescriptorElement

    :param n: Number of nearest neighbors to find.
    :type n: int

    :return: Tuple of nearest N DescriptorElement instances, and a tuple
        of the distance values to those neighbors.
    :rtype: (tuple[smqtk.representation.DescriptorElement], tuple[float])

    """
    self._log.debug("generating hash for descriptor")
    query_vec = d.vector()
    query_hash = self.lsh_functor.get_hash(query_vec)

    with self._model_lock:
        self._log.debug("getting near hashes")
        hash_idx = self.hash_index
        if hash_idx is None:
            # No hash index configured: build a throw-away linear index.
            # ``build_index`` is skipped because the int hashes are
            # already known.
            hash_idx = LinearHashIndex()
            known_hashes = list(self.hash2uuids_kvstore.keys())
            hash_idx.index = numpy.array(known_hashes)
        near_hashes, _ = hash_idx.nn(query_hash, n)

        self._log.debug("getting UUIDs of descriptors for nearby hashes")
        # Each hash maps to a set of descriptor UUIDs; hashes missing
        # from the store contribute nothing.
        neighbor_uuids = [
            uid
            for h_int in map(bit_vector_to_int_large, near_hashes)
            for uid in self.hash2uuids_kvstore.get(h_int, set())
        ]
        self._log.debug("-- matched %d UUIDs", len(neighbor_uuids))

        self._log.debug("getting descriptors for neighbor_uuids")
        neighbors = list(
            self.descriptor_index.get_many_descriptors(neighbor_uuids)
        )

    # Model components are no longer needed; the lock is released here.
    self._log.debug("ordering descriptors via distance method '%s'",
                    self.distance_method)
    self._log.debug('-- getting element vectors')
    neighbor_vectors = elements_to_matrix(neighbors, report_interval=1.0)
    self._log.debug('-- calculating distances')
    distances = [self._distance_function(query_vec, other_vec)
                 for other_vec in neighbor_vectors]
    self._log.debug('-- ordering')
    ranked = sorted(zip(neighbors, distances), key=lambda pair: pair[1])
    self._log.debug('-- slicing top n=%d', n)
    top_n = ranked[:n]
    return list(zip(*top_n))
def fit(self, descriptors):
    """
    Fit the ITQ model given the input set of descriptors.

    The descriptors are normalized and mean-centered, then projected onto
    the ``self.bit_length`` eigenvectors of their covariance matrix with
    the largest eigenvalues. An ITQ rotation is then computed. The
    resulting ``self.mean_vec`` and ``self.rotation`` are stored and the
    model is saved.

    :param descriptors: Iterable of ``DescriptorElement`` vectors to fit
        the model to.
    :type descriptors:
        collections.Iterable[smqtk.representation.DescriptorElement]

    :raises RuntimeError: There is already a model loaded

    :return: Matrix hash codes for provided descriptors in order.
    :rtype: numpy.ndarray[bool]

    """
    if self.has_model():
        raise RuntimeError("Model components have already been loaded.")

    # Report progress (every second) only when debug logging is active.
    if self.logger().getEffectiveLevel() <= logging.DEBUG:
        dbg_report_interval = 1.0
    else:
        dbg_report_interval = None

    if not hasattr(descriptors, "__len__"):
        # A sized sequence is needed, so drain the iterable into a list.
        self._log.info("Creating sequence from iterable")
        collected = []
        report_state = [0] * 7
        for element in descriptors:
            collected.append(element)
            report_progress(self._log.debug, report_state,
                            dbg_report_interval)
        descriptors = collected

    self._log.info("Creating matrix of descriptors for fitting")
    x = elements_to_matrix(descriptors,
                           report_interval=dbg_report_interval)
    self._log.debug("descriptor matrix shape: %s", x.shape)

    self._log.debug("Info normalizing descriptors by factor: %s",
                    self.normalize)
    x = self._norm_vector(x)

    self._log.info("Centering data")
    self.mean_vec = numpy.mean(x, axis=0)
    x -= self.mean_vec

    self._log.info("Computing PCA transformation")
    # numpy and matlab observation formats are flipped, hence the
    # transpose before computing the covariance.
    self._log.debug("-- computing covariance")
    covariance = numpy.cov(x.T)

    # Direct translation from the UNC matlab code.
    # - eigenvectors are the columns of ``eig_vecs``
    # (An SVD-based alternative on the covariance was considered here
    #  previously; this implementation uses the eigen decomposition.)
    self._log.debug('-- computing linalg.eig')
    eig_vals, eig_vecs = numpy.linalg.eig(covariance)

    # Pair each eigenvalue with its eigenvector (a row of the transpose),
    # order by descending eigenvalue and keep the top ``bit_length``.
    self._log.debug('-- computing top pairs')
    ranked_pairs = sorted(zip(eig_vals, eig_vecs.T),
                          key=lambda pair: pair[0],
                          reverse=True)
    top_pairs = ranked_pairs[:self.bit_length]

    # Stack the selected eigenvectors back into column-vector form.
    self._log.debug("-- top vector extraction")
    pc_top = numpy.array([vec for _, vec in top_pairs]).T
    self._log.debug("-- transform centered data by PC matrix")
    projected = numpy.dot(x, pc_top)

    self._log.info("Performing ITQ to find optimal rotation")
    codes, self.rotation = self._find_itq_rotation(projected,
                                                   self.itq_iterations)
    # De-adjust rotation with PC vector
    self.rotation = numpy.dot(pc_top, self.rotation)

    self.save_model()

    return codes
def build_index(self, descriptors):
    """
    Build the index over the descriptors data elements.

    Subsequent calls to this method should rebuild the index, not add to
    it.

    Implementation Notes:
        - We keep a cache file serialization around for our index in case
          sub-processing occurs so as to be able to recover from the
          underlying C data not being there. This could cause issues if a
          main or child process rebuilds the index, as we clear the old
          cache away.
        - All pickle caches are written in binary mode, as the highest
          pickle protocol (``-1``) produces binary data.

    :raises ValueError: No data available in the given iterable.

    :param descriptors: Iterable of descriptors elements to build index
        over.
    :type descriptors:
        collections.Iterable[smqtk.representation.DescriptorElement]

    """
    # Not caring about restoring the index because we're just making a new
    # one
    self._log.info("Building new FLANN index")

    self._log.debug("Storing descriptors")
    self._descr_cache = list(descriptors)
    if not self._descr_cache:
        raise ValueError("No data provided in given iterable.")
    # Cache descriptors if we have a path
    if self._descr_cache_filepath:
        self._log.debug("Caching descriptors: %s",
                        self._descr_cache_filepath)
        safe_create_dir(osp.dirname(self._descr_cache_filepath))
        with open(self._descr_cache_filepath, 'wb') as f:
            cPickle.dump(self._descr_cache, f, -1)

    params = {
        "target_precision": self._build_target_precision,
        "sample_fraction": self._build_sample_frac,
        # Only let FLANN be chatty when we are debugging ourselves.
        "log_level": ("info"
                      if self._log.getEffectiveLevel() <= logging.DEBUG
                      else "warning")
    }
    if self._build_autotune:
        params['algorithm'] = "autotuned"
    if self._rand_seed is not None:
        params['random_seed'] = self._rand_seed
    pyflann.set_distance_type(self._distance_method)

    self._log.debug("Accumulating descriptor vectors into matrix for FLANN")
    pts_array = elements_to_matrix(self._descr_cache, report_interval=1.0)

    self._log.debug('Building FLANN index')
    self._flann = pyflann.FLANN()
    self._flann_build_params = self._flann.build_index(pts_array, **params)
    # The point matrix is not needed after the index has been built.
    del pts_array

    self._log.debug("Caching index and state: %s, %s",
                    self._index_filepath, self._index_param_filepath)
    if self._index_filepath:
        self._log.debug("Caching index: %s", self._index_filepath)
        safe_create_dir(osp.dirname(self._index_filepath))
        self._flann.save_index(self._index_filepath)
    if self._index_param_filepath:
        self._log.debug("Caching index params: %s",
                        self._index_param_filepath)
        state = {
            'b_autotune': self._build_autotune,
            'b_target_precision': self._build_target_precision,
            'b_sample_frac': self._build_sample_frac,
            'distance_method': self._distance_method,
            'flann_build_params': self._flann_build_params,
        }
        safe_create_dir(osp.dirname(self._index_param_filepath))
        # Binary mode is required: protocol ``-1`` emits binary pickle
        # data, which a text-mode handle can corrupt (newline
        # translation) or reject outright.
        with open(self._index_param_filepath, 'wb') as f:
            cPickle.dump(state, f, -1)

    # Remember which process built the index (see implementation notes).
    self._pid = multiprocessing.current_process().pid
def train(self, positive_classes, negatives):
    """
    Train the supervised SVM classifier model.

    The class label ``negative`` is reserved for the negative class.

    If a model is already loaded, we will raise an exception in order to
    prevent accidental overwrite.

    Positive classes receive integer SVM labels starting at 1 in sorted
    label order; the negative class receives SVM label -1. The trained
    model and label map are written to their configured file paths when
    those are set.

    :param positive_classes: Dictionary mapping positive class labels to
        iterables of DescriptorElement training examples.
    :type positive_classes:
        dict[collections.Hashable,
             collections.Iterable[smqtk.representation.DescriptorElement]]

    :param negatives: Iterable of negative DescriptorElement examples.
    :type negatives:
        collections.Iterable[smqtk.representation.DescriptorElement]

    :raises ValueError: The ``negative`` label was found in the
        ``positive_classes`` dictionary. This is reserved for the negative
        example class.
    :raises ValueError: There were no positive or negative examples.
    :raises RuntimeError: A model already exists in this instance.
        Following through with training would overwrite this model.
        Throwing an exception for information protection.

    """
    super(LibSvmClassifier, self).train(positive_classes, negatives)

    # Offset from 0 for positive class labels to use
    # - not using label of 0 because we think libSVM wants positive labels
    CLASS_LABEL_OFFSET = 1

    # Debug reporting: quiet libSVM ("-q") unless debug logging is on, in
    # which case matrix construction also reports progress.
    if self._log.getEffectiveLevel() <= logging.DEBUG:
        etm_ri = 1.0
        param_debug = {}
    else:
        etm_ri = None
        param_debug = {"-q": ""}

    # Form libSVM problem input values
    self._log.debug("Formatting problem input")
    train_labels = []
    train_vectors = []
    train_group_sizes = []
    self.svm_label_map = {}

    # Making SVM label assignment deterministic to alphabetic order
    svm_label = CLASS_LABEL_OFFSET
    for class_label in sorted(positive_classes):
        # Map integer SVM label to semantic label
        self.svm_label_map[svm_label] = class_label
        self._log.debug("-- class %d (%s)", svm_label, class_label)

        # A sequence is required, so expand other iterables into a tuple
        examples = positive_classes[class_label]
        if not isinstance(examples, collections.Sequence):
            examples = tuple(examples)

        train_group_sizes.append(float(len(examples)))
        mat = self._norm_vector(
            elements_to_matrix(examples, report_interval=etm_ri)
        )
        train_labels.extend([svm_label] * mat.shape[0])
        train_vectors.extend(mat.tolist())
        del examples, mat
        svm_label += 1

    self._log.debug("-- negatives (-1)")
    # Map integer SVM label to semantic label
    self.svm_label_map[-1] = self.NEGATIVE_LABEL
    # A sequence is required, so expand other iterables into a tuple
    if not isinstance(negatives, collections.Sequence):
        negatives = tuple(negatives)
    negatives_size = float(len(negatives))
    mat = self._norm_vector(
        elements_to_matrix(negatives, report_interval=etm_ri)
    )
    train_labels.extend([-1] * mat.shape[0])
    train_vectors.extend(mat.tolist())
    del negatives, mat

    self._log.debug(
        "Training elements: %d labels, %d vectors "
        "(should be the same)",
        len(train_labels), len(train_vectors)
    )

    self._log.debug("Forming train params")
    #: :type: dict
    params = deepcopy(self.train_params)
    params.update(param_debug)
    # Positive class weights are only calculated for the C-SVC type,
    # which is also the type used when "-s" is not given.
    is_c_svc = "-s" not in params or int(params["-s"]) == 0
    if is_c_svc:
        for offset, group_size in enumerate(train_group_sizes):
            weight_key = "-w" + str(offset + CLASS_LABEL_OFFSET)
            params[weight_key] = \
                max(1.0, negatives_size / float(group_size))

    self._log.debug("Making parameters obj")
    svm_params = svmutil.svm_parameter(self._gen_param_string(params))
    self._log.debug("Creating SVM problem")
    svm_problem = svm.svm_problem(train_labels, train_vectors)
    self._log.debug("Training SVM model")
    self.svm_model = svmutil.svm_train(svm_problem, svm_params)
    self._log.debug("Training SVM model -- Done")

    if self.svm_label_map_fp:
        self._log.debug("saving file -- labels -- %s",
                        self.svm_label_map_fp)
        with open(self.svm_label_map_fp, "wb") as label_file:
            cPickle.dump(self.svm_label_map, label_file)
    if self.svm_model_fp:
        self._log.debug("saving file -- model -- %s", self.svm_model_fp)
        svmutil.svm_save_model(self.svm_model_fp, self.svm_model)
def train(self, class_examples=None, **kwds):
    """
    Train the supervised classifier model.

    If a model is already loaded, we will raise an exception in order to
    prevent accidental overwrite.

    If the same label is provided to both ``class_examples`` and ``kwds``,
    the examples given to the reference in ``kwds`` will prevail.

    Classes receive integer SVM labels starting at 1 in sorted label
    order. For C-SVC training, each class is weighted by the ratio of
    other-class examples to its own examples (never below 1.0).

    :param class_examples: Dictionary mapping class labels to iterables of
        DescriptorElement training examples.
    :type class_examples: dict[collections.Hashable,
             collections.Iterable[smqtk.representation.DescriptorElement]]

    :param kwds: Keyword assignment of labels to iterables of
        DescriptorElement training examples.
    :type kwds: dict[str,
             collections.Iterable[smqtk.representation.DescriptorElement]]

    :raises ValueError: There were no class examples provided.
    :raises ValueError: Less than 2 classes were given.
    :raises RuntimeError: A model already exists in this instance.
        Following through with training would overwrite this model.
        Throwing an exception for information protection.

    """
    # The parent implementation's return value is used as the class
    # example mapping from here on.
    class_examples = \
        super(LibSvmClassifier, self).train(class_examples, **kwds)

    # Offset from 0 for positive class labels to use
    # - not using label of 0 because we think libSVM wants positive labels
    CLASS_LABEL_OFFSET = 1

    # Stuff for debug reporting: libSVM is quieted ("-q") unless debug
    # logging is active.
    etm_ri = None
    param_debug = {'-q': ''}
    if self._log.getEffectiveLevel() <= logging.DEBUG:
        etm_ri = 1.0
        param_debug = {}

    # Form libSVM problem input values
    self._log.debug("Formatting problem input")
    train_labels = []
    train_vectors = []
    train_group_sizes = []  # number of examples per class
    self.svm_label_map = {}
    # Making SVM label assignment deterministic to alphabetic order
    for i, l in enumerate(sorted(class_examples), CLASS_LABEL_OFFSET):
        # Map integer SVM label to semantic label
        self.svm_label_map[i] = l

        self._log.debug('-- class %d (%s)', i, l)
        # requires a sequence, so making the iterable ``g`` a tuple
        g = class_examples[l]
        if not isinstance(g, collections.Sequence):
            g = tuple(g)

        train_group_sizes.append(float(len(g)))
        x = elements_to_matrix(g, report_interval=etm_ri)
        x = self._norm_vector(x)
        train_labels.extend([i] * x.shape[0])
        train_vectors.extend(x.tolist())
        # Release per-class data before processing the next class.
        del g, x

    # Internal invariant check. The message fragments need an explicit
    # joining space, otherwise the concatenation reads "vectorsbeing".
    assert len(train_labels) == len(train_vectors), \
        "Count mismatch between parallel labels and descriptor vectors " \
        "being sent to libSVM (%d != %d)" \
        % (len(train_labels), len(train_vectors))

    self._log.debug("Forming train params")
    #: :type: dict
    params = deepcopy(self.train_params)
    params.update(param_debug)
    # Calculating class weights for C-SVC SVM (the type used when "-s" is
    # not specified).
    if '-s' not in params or int(params['-s']) == 0:
        total_examples = sum(train_group_sizes)
        for i, n in enumerate(train_group_sizes, CLASS_LABEL_OFFSET):
            # weight is the ratio of between number of other-class
            # examples to the number of examples in this class.
            other_class_examples = total_examples - n
            w = max(1.0, other_class_examples / float(n))
            params['-w' + str(i)] = w
            self._log.debug("-- class '%s' weight: %s",
                            self.svm_label_map[i], w)

    self._log.debug("Making parameters obj")
    svm_params = svmutil.svm_parameter(self._gen_param_string(params))
    self._log.debug("Creating SVM problem")
    svm_problem = svm.svm_problem(train_labels, train_vectors)
    self._log.debug("Training SVM model")
    self.svm_model = svmutil.svm_train(svm_problem, svm_params)
    self._log.debug("Training SVM model -- Done")

    if self.svm_label_map_fp:
        self._log.debug("saving file -- labels -- %s",
                        self.svm_label_map_fp)
        with open(self.svm_label_map_fp, 'wb') as f:
            cPickle.dump(self.svm_label_map, f, -1)
    if self.svm_model_fp:
        self._log.debug("saving file -- model -- %s", self.svm_model_fp)
        svmutil.svm_save_model(self.svm_model_fp, self.svm_model)
def _train(self, class_examples, **extra_params):
    """
    Internal method that trains the classifier implementation.

    This method is called after checking that there is not already a model
    trained, thus it can be assumed that no model currently exists.

    The class labels will have already been checked before entering this
    method, so it can be assumed that the ``class_examples`` will contain
    at least two classes.

    Classes receive integer SVM labels starting at 1 in sorted label
    order. For C-SVC training, each class is weighted by the geometric
    mean of the class sizes divided by its own size. The label map and
    model are written to their data elements when those are set and
    writable.

    :param class_examples: Dictionary mapping class labels to iterables of
        DescriptorElement training examples.
    :type class_examples: dict[collections.Hashable,
             collections.Iterable[smqtk.representation.DescriptorElement]]

    :param extra_params: Dictionary with extra parameters for training.
        This is not used by this implementation.
    :type extra_params: None | dict[basestring, object]

    """
    # Offset from 0 for positive class labels to use
    # - not using label of 0 because we think libSVM wants positive labels
    CLASS_LABEL_OFFSET = 1

    # Stuff for debug reporting: libSVM is quieted ("-q") unless debug
    # logging is active.
    etm_ri = None
    param_debug = {'-q': ''}
    if self._log.getEffectiveLevel() <= logging.DEBUG:
        etm_ri = 1.0
        param_debug = {}

    # Form libSVM problem input values
    self._log.debug("Formatting problem input")
    train_labels = []
    train_vectors = []
    train_group_sizes = []  # number of examples per class
    self.svm_label_map = {}
    # Making SVM label assignment deterministic to alphabetic order
    for i, l in enumerate(sorted(class_examples), CLASS_LABEL_OFFSET):
        # Map integer SVM label to semantic label
        self.svm_label_map[i] = l

        self._log.debug('-- class %d (%s)', i, l)
        # requires a sequence, so making the iterable ``g`` a tuple
        g = class_examples[l]
        if not isinstance(g, collections.Sequence):
            self._log.debug('   (expanding iterable into sequence)')
            g = tuple(g)

        train_group_sizes.append(float(len(g)))
        x = elements_to_matrix(g, report_interval=etm_ri)
        x = self._norm_vector(x)
        train_labels.extend([i] * x.shape[0])
        train_vectors.extend(x.tolist())
        # Release per-class data before processing the next class.
        del g, x

    # Internal invariant check. The message fragments need an explicit
    # joining space, otherwise the concatenation reads "vectorsbeing".
    assert len(train_labels) == len(train_vectors), \
        "Count mismatch between parallel labels and descriptor vectors " \
        "being sent to libSVM (%d != %d)" \
        % (len(train_labels), len(train_vectors))

    self._log.debug("Forming train params")
    #: :type: dict
    params = deepcopy(self.train_params)
    params.update(param_debug)
    # Calculating class weights if set to C-SVC type SVM
    if '-s' not in params or int(params['-s']) == 0:
        # (john.moeller): The weighting should probably be the geometric
        # mean of the number of examples over the classes divided by the
        # number of examples for the current class.
        gmean = scipy.stats.gmean(train_group_sizes)

        for i, n in enumerate(train_group_sizes, CLASS_LABEL_OFFSET):
            w = gmean / n
            params['-w' + str(i)] = w
            self._log.debug("-- class '%s' weight: %s",
                            self.svm_label_map[i], w)

    self._log.debug("Making parameters obj")
    svm_params = svmutil.svm_parameter(self._gen_param_string(params))
    self._log.debug("Creating SVM problem")
    svm_problem = svm.svm_problem(train_labels, train_vectors)
    # The vector list is no longer needed once the problem is built.
    del train_vectors
    self._log.debug("Training SVM model")
    self.svm_model = svmutil.svm_train(svm_problem, svm_params)
    self._log.debug("Training SVM model -- Done")

    if self.svm_label_map_elem and self.svm_label_map_elem.writable():
        self._log.debug("saving labels to element (%s)",
                        self.svm_label_map_elem)
        self.svm_label_map_elem.set_bytes(
            cPickle.dumps(self.svm_label_map, -1)
        )
    if self.svm_model_elem and self.svm_model_elem.writable():
        self._log.debug("saving model to element (%s)",
                        self.svm_model_elem)
        # LibSvm I/O only works with filepaths, thus the need for an
        # intermediate temporary file.
        fd, fp = tempfile.mkstemp()
        # Only the path is needed. Closing the descriptor right away
        # guarantees it cannot leak if saving the model raises, and
        # avoids holding the file open while libSVM writes to it.
        os.close(fd)
        try:
            svmutil.svm_save_model(fp, self.svm_model)
            with open(fp, 'rb') as f:
                self.svm_model_elem.set_bytes(f.read())
        finally:
            os.remove(fp)
def fit(self, descriptors, use_multiprocessing=True):
    """
    Fit the ITQ model given the input set of descriptors.

    The descriptor vectors are normalized with ``self._norm_vector``,
    mean-centered, projected onto the ``self.bit_length`` eigenvectors of
    their covariance matrix that have the largest eigenvalues, and passed
    to ``self._find_itq_rotation``. ``self.mean_vec`` and
    ``self.rotation`` are set and ``self.save_model()`` is called before
    returning.

    :param descriptors: Iterable of ``DescriptorElement`` vectors to fit
        the model to.
    :type descriptors:
        collections.Iterable[smqtk.representation.DescriptorElement]

    :param use_multiprocessing: If multiprocessing should be used, as
        opposed to threading, when collecting descriptor elements from the
        given iterable.
    :type use_multiprocessing: bool

    :raises RuntimeError: There is already a model loaded

    :raises ValueError: No descriptors were given, or the descriptors have
        fewer features than ``self.bit_length``.

    :return: Matrix hash codes for provided descriptors in order.
    :rtype: numpy.ndarray[bool]

    """
    if self.has_model():
        raise RuntimeError("Model components have already been loaded.")

    dbg_report_interval = 1.0
    dbg_report = self.get_logger().getEffectiveLevel() <= logging.DEBUG
    if not isinstance(descriptors, Sequence):
        # Indexing and ``len`` are needed below, so expand the iterable.
        self._log.info("Creating sequence from iterable")
        descriptors_l = []
        pr = ProgressReporter(self._log.debug, dbg_report_interval).start()
        for d in descriptors:
            descriptors_l.append(d)
            if dbg_report:
                pr.increment_report()
        if dbg_report:
            pr.report()
        descriptors = descriptors_l

    # Fail with a clear message instead of an IndexError on the
    # ``descriptors[0]`` access below.
    if len(descriptors) == 0:
        raise ValueError("No descriptors were provided to fit the model "
                         "to.")
    if len(descriptors[0].vector()) < self.bit_length:
        raise ValueError("Input descriptors have fewer features than "
                         "requested bit encoding. Hash codes will be "
                         "smaller than requested due to PCA decomposition "
                         "result being bound by number of features.")

    self._log.info("Creating matrix of descriptors for fitting")
    x = elements_to_matrix(descriptors,
                           report_interval=dbg_report_interval,
                           use_multiprocessing=use_multiprocessing)
    self._log.debug("descriptor matrix shape: %s", x.shape)

    self._log.debug("Info normalizing descriptors by factor: %s",
                    self.normalize)
    x = self._norm_vector(x)

    self._log.info("Centering data")
    self.mean_vec = numpy.mean(x, axis=0)
    x -= self.mean_vec

    self._log.info("Computing PCA transformation")
    self._log.debug("-- computing covariance")
    # ``cov`` wants each row to be a feature and each column an observation
    # of those features. Thus, each column should be a descriptor vector,
    # thus we need the transpose here.
    c = numpy.cov(x.transpose())

    # Direct translation from UNC matlab code
    # - eigen vectors are the columns of ``pc``
    # NOTE: an SVD of ``c`` (``numpy.linalg.svd``) was the alternative kept
    #   in a previously unreachable branch here; it was never enabled.
    self._log.debug('-- computing linalg.eig')
    eig_vals, pc = numpy.linalg.eig(c)
    self._log.debug('-- ordering eigen vectors by descending eigen '
                    'value')
    l_pc_ordered = sorted(zip(eig_vals, pc.transpose()),
                          key=lambda _p: _p[0], reverse=True)

    self._log.debug("-- top vector extraction")
    # Only keep the top ``bit_length`` vectors after ordering by descending
    # value magnitude.
    # - Transposing vectors back to column-vectors.
    pc_top = numpy.array(
        [p[1] for p in l_pc_ordered[:self.bit_length]]
    ).transpose()
    self._log.debug("-- project centered data by PC matrix")
    v = numpy.dot(x, pc_top)

    self._log.info("Performing ITQ to find optimal rotation")
    c, self.rotation = self._find_itq_rotation(v, self.itq_iterations)
    # De-adjust rotation with PC vector
    self.rotation = numpy.dot(pc_top, self.rotation)

    self.save_model()

    return c
def _build_index(self, descriptors):
    """
    Internal method to be implemented by sub-classes to build the index
    with the given descriptor data elements.

    Subsequent calls to this method should rebuild the current index. This
    method shall not add to the existing index nor raise an exception to as
    to protect the current index.

    Implementation Notes:
        - We keep a cache file serialization around for our index in case
          sub-processing occurs so as to be able to recover from the
          underlying C data not being there. This could cause issues if a
          main or child process rebuild's the index, as we clear the old
          cache away.
        - The descriptor cache, the FLANN index and the build state are
          each written to their element only when that element is set and
          reports itself writable.

    :param descriptors: Iterable of descriptor elements to build index
        over.
    :type descriptors:
        collections.Iterable[smqtk.representation.DescriptorElement]

    """
    with self._model_lock:
        # Not caring about restoring the index because we're just making a
        # new one.
        self._log.info("Building new FLANN index")

        self._log.debug("Caching descriptor elements")
        self._descr_cache = list(descriptors)
        # Cache descriptors if we have an element
        if self._descr_cache_elem and self._descr_cache_elem.writable():
            self._log.debug("Caching descriptors: %s",
                            self._descr_cache_elem)
            self._descr_cache_elem.set_bytes(
                cPickle.dumps(self._descr_cache, -1))

        params = {
            "target_precision": self._build_target_precision,
            "sample_fraction": self._build_sample_frac,
            # Let FLANN be chatty only when we are debugging ourselves.
            "log_level": ("info"
                          if self._log.getEffectiveLevel() <= logging.DEBUG
                          else "warning")
        }
        if self._build_autotune:
            params['algorithm'] = "autotuned"
        if self._rand_seed is not None:
            params['random_seed'] = self._rand_seed
        pyflann.set_distance_type(self._distance_method)

        self._log.debug("Accumulating descriptor vectors into matrix for "
                        "FLANN")
        pts_array = elements_to_matrix(self._descr_cache,
                                       report_interval=1.0)
        self._log.debug('Building FLANN index')
        self._flann = pyflann.FLANN()
        self._flann_build_params = self._flann.build_index(
            pts_array, **params)
        del pts_array

        if self._index_elem and self._index_elem.writable():
            self._log.debug("Caching index: %s", self._index_elem)
            # FLANN wants to write to a file, so make a temp file, then
            # read it in, putting bytes into element.
            fd, fp = tempfile.mkstemp()
            # Only the path is needed. Closing the OS-level descriptor
            # right away guarantees it cannot leak if ``save_index``
            # raises.
            os.close(fd)
            try:
                self._flann.save_index(fp)
                with open(fp, 'rb') as f:
                    self._index_elem.set_bytes(f.read())
            finally:
                os.remove(fp)
        if self._index_param_elem and self._index_param_elem.writable():
            self._log.debug("Caching index params: %s",
                            self._index_param_elem)
            state = {
                'b_autotune': self._build_autotune,
                'b_target_precision': self._build_target_precision,
                'b_sample_frac': self._build_sample_frac,
                'distance_method': self._distance_method,
                'flann_build_params': self._flann_build_params,
            }
            self._index_param_elem.set_bytes(cPickle.dumps(state, -1))

        # Record which process built the in-memory index.
        self._pid = multiprocessing.current_process().pid
def fit(self, descriptors):
    """
    Fit the ITQ model to the given descriptors.

    The descriptor vectors are normalized with ``self._norm_vector``,
    mean-centered, projected onto the ``self.bit_length`` eigenvectors of
    their covariance matrix that have the largest eigenvalues, and passed
    to ``self._find_itq_rotation``. ``self.mean_vec`` and
    ``self.rotation`` are set and ``self.save_model()`` is called before
    returning.

    :param descriptors: Iterable of ``DescriptorElement`` vectors to fit
        the model to.
    :type descriptors:
        collections.Iterable[smqtk.representation.DescriptorElement]

    :raises RuntimeError: There is already a model loaded

    :return: Matrix hash codes for provided descriptors in order.
    :rtype: numpy.ndarray[bool]

    """
    if self.has_model():
        raise RuntimeError("Model components have already been loaded.")

    # Progress is only reported (every second) when debug logging is on.
    debug_on = self.logger().getEffectiveLevel() <= logging.DEBUG
    report_every = 1.0 if debug_on else None

    if not hasattr(descriptors, "__len__"):
        self._log.info("Creating sequence from iterable")
        collected = []
        # Mutable state list handed to ``report_progress`` on every call.
        progress_state = [0] * 7
        for elem in descriptors:
            collected.append(elem)
            report_progress(self._log.debug, progress_state, report_every)
        descriptors = collected

    self._log.info("Creating matrix of descriptors for fitting")
    x = elements_to_matrix(descriptors, report_interval=report_every)
    self._log.debug("descriptor matrix shape: %s", x.shape)

    self._log.debug("Info normalizing descriptors by factor: %s",
                    self.normalize)
    x = self._norm_vector(x)

    self._log.info("Centering data")
    self.mean_vec = numpy.mean(x, axis=0)
    x -= self.mean_vec

    self._log.info("Computing PCA transformation")
    # ``numpy.cov`` treats rows as variables, hence the transpose of the
    # row-per-descriptor matrix.
    self._log.debug("-- computing covariance")
    covariance = numpy.cov(x.transpose())

    # Eigen decomposition; eigenvectors are the columns of ``eig_vecs``.
    # (An SVD-based variant existed only as commented-out code.)
    self._log.debug('-- computing linalg.eig')
    eig_vals, eig_vecs = numpy.linalg.eig(covariance)

    # Pair each eigenvalue with its eigenvector, order by descending
    # eigenvalue and keep the first ``bit_length`` pairs.
    self._log.debug('-- computing top pairs')
    ranked = list(zip(eig_vals, eig_vecs.transpose()))
    ranked.sort(key=lambda pair: pair[0], reverse=True)
    top_pairs = ranked[:self.bit_length]

    # Stack the retained eigenvectors back into column-vector form.
    self._log.debug("-- top vector extraction")
    pc_top = numpy.array([vec for _, vec in top_pairs]).transpose()
    self._log.debug("-- transform centered data by PC matrix")
    projected = numpy.dot(x, pc_top)

    self._log.info("Performing ITQ to find optimal rotation")
    codes, self.rotation = self._find_itq_rotation(projected,
                                                   self.itq_iterations)
    # De-adjust rotation with PC vector
    self.rotation = numpy.dot(pc_top, self.rotation)

    self.save_model()

    return codes
def build_index(self, descriptors):
    """
    Build the index over the descriptors data elements.

    Subsequent calls to this method should rebuild the index, not add to
    it.

    Implementation Notes:
        - We keep a cache file serialization around for our index in case
          sub-processing occurs so as to be able to recover from the
          underlying C data not being there. This could cause issues if a
          main or child process rebuild's the index, as we clear the old
          cache away.
        - The descriptor cache, the FLANN index and the build state are
          each written to disk only when the corresponding file path
          attribute is set.

    :raises ValueError: No data available in the given iterable.

    :param descriptors: Iterable of descriptors elements to build index
        over.
    :type descriptors:
        collections.Iterable[smqtk.representation.DescriptorElement]

    """
    # Not caring about restoring the index because we're just making a new
    # one
    self._log.info("Building new FLANN index")

    self._log.debug("Storing descriptors")
    self._descr_cache = list(descriptors)
    if not self._descr_cache:
        raise ValueError("No data provided in given iterable.")
    # Cache descriptors if we have a path
    if self._descr_cache_filepath:
        self._log.debug("Caching descriptors: %s",
                        self._descr_cache_filepath)
        safe_create_dir(osp.dirname(self._descr_cache_filepath))
        with open(self._descr_cache_filepath, 'wb') as f:
            cPickle.dump(self._descr_cache, f, -1)

    params = {
        "target_precision": self._build_target_precision,
        "sample_fraction": self._build_sample_frac,
        # Let FLANN be chatty only when we are debugging ourselves.
        "log_level": ("info"
                      if self._log.getEffectiveLevel() <= logging.DEBUG
                      else "warning")
    }
    if self._build_autotune:
        params['algorithm'] = "autotuned"
    if self._rand_seed is not None:
        params['random_seed'] = self._rand_seed
    pyflann.set_distance_type(self._distance_method)

    self._log.debug(
        "Accumulating descriptor vectors into matrix for FLANN")
    pts_array = elements_to_matrix(self._descr_cache, report_interval=1.0)
    self._log.debug('Building FLANN index')
    self._flann = pyflann.FLANN()
    self._flann_build_params = self._flann.build_index(pts_array, **params)
    del pts_array

    self._log.debug("Caching index and state: %s, %s",
                    self._index_filepath, self._index_param_filepath)
    if self._index_filepath:
        self._log.debug("Caching index: %s", self._index_filepath)
        safe_create_dir(osp.dirname(self._index_filepath))
        self._flann.save_index(self._index_filepath)
    if self._index_param_filepath:
        self._log.debug("Caching index params: %s",
                        self._index_param_filepath)
        state = {
            'b_autotune': self._build_autotune,
            'b_target_precision': self._build_target_precision,
            'b_sample_frac': self._build_sample_frac,
            'distance_method': self._distance_method,
            'flann_build_params': self._flann_build_params,
        }
        safe_create_dir(osp.dirname(self._index_param_filepath))
        # Protocol -1 is a binary pickle format, so the file must be
        # opened in binary mode (as the descriptor cache above is).
        with open(self._index_param_filepath, 'wb') as f:
            cPickle.dump(state, f, -1)

    # Record which process built the in-memory index.
    self._pid = multiprocessing.current_process().pid
def fit(self, descriptors, use_multiprocessing=True):
    """
    Fit the ITQ model given the input set of descriptors.

    The descriptor vectors are normalized with ``self._norm_vector``,
    mean-centered, projected onto the ``self.bit_length`` eigenvectors of
    their covariance matrix that have the largest eigenvalues, and passed
    to ``self._find_itq_rotation``. ``self.mean_vec`` and
    ``self.rotation`` are set and ``self.save_model()`` is called before
    returning.

    :param descriptors: Iterable of ``DescriptorElement`` vectors to fit
        the model to.
    :type descriptors:
        collections.Iterable[smqtk.representation.DescriptorElement]

    :param use_multiprocessing: Forwarded unchanged to
        ``elements_to_matrix`` as its ``use_multiprocessing`` argument.
    :type use_multiprocessing: bool

    :raises RuntimeError: There is already a model loaded

    :raises ValueError: No descriptors were given, or the descriptors have
        fewer features than ``self.bit_length``.

    :return: Matrix hash codes for provided descriptors in order.
    :rtype: numpy.ndarray[bool]

    """
    if self.has_model():
        raise RuntimeError("Model components have already been loaded.")

    # Progress is only reported when debug logging is enabled.
    dbg_report_interval = None
    if self.get_logger().getEffectiveLevel() <= logging.DEBUG:
        dbg_report_interval = 1.0  # seconds

    if not isinstance(descriptors, Sequence):
        # Indexing and ``len`` are needed below, so expand the iterable.
        self._log.info("Creating sequence from iterable")
        descriptors_l = []
        # Mutable state list handed to ``report_progress`` on every call.
        rs = [0] * 7
        for d in descriptors:
            descriptors_l.append(d)
            report_progress(self._log.debug, rs, dbg_report_interval)
        descriptors = descriptors_l

    # Fail with a clear message instead of an IndexError on the
    # ``descriptors[0]`` access below.
    if len(descriptors) == 0:
        raise ValueError("No descriptors were provided to fit the model "
                         "to.")
    if len(descriptors[0].vector()) < self.bit_length:
        raise ValueError("Input descriptors have fewer features than "
                         "requested bit encoding. Hash codes will be "
                         "smaller than requested due to PCA decomposition "
                         "result being bound by number of features.")

    self._log.info("Creating matrix of descriptors for fitting")
    x = elements_to_matrix(descriptors,
                           report_interval=dbg_report_interval,
                           use_multiprocessing=use_multiprocessing)
    self._log.debug("descriptor matrix shape: %s", x.shape)

    self._log.debug("Info normalizing descriptors by factor: %s",
                    self.normalize)
    x = self._norm_vector(x)

    self._log.info("Centering data")
    self.mean_vec = numpy.mean(x, axis=0)
    x -= self.mean_vec

    self._log.info("Computing PCA transformation")
    self._log.debug("-- computing covariance")
    # ``cov`` wants each row to be a feature and each column an observation
    # of those features. Thus, each column should be a descriptor vector,
    # thus we need the transpose here.
    c = numpy.cov(x.transpose())

    # Direct translation from UNC matlab code
    # - eigen vectors are the columns of ``pc``
    # NOTE: an SVD of ``c`` (``numpy.linalg.svd``) was the alternative kept
    #   in a previously unreachable branch here; it was never enabled.
    self._log.debug('-- computing linalg.eig')
    eig_vals, pc = numpy.linalg.eig(c)
    self._log.debug('-- ordering eigen vectors by descending eigen '
                    'value')
    l_pc_ordered = sorted(zip(eig_vals, pc.transpose()),
                          key=lambda p: p[0], reverse=True)

    self._log.debug("-- top vector extraction")
    # Only keep the top ``bit_length`` vectors after ordering by descending
    # value magnitude.
    # - Transposing vectors back to column-vectors.
    pc_top = numpy.array(
        [p[1] for p in l_pc_ordered[:self.bit_length]]
    ).transpose()
    self._log.debug("-- project centered data by PC matrix")
    v = numpy.dot(x, pc_top)

    self._log.info("Performing ITQ to find optimal rotation")
    c, self.rotation = self._find_itq_rotation(v, self.itq_iterations)
    # De-adjust rotation with PC vector
    self.rotation = numpy.dot(pc_top, self.rotation)

    self.save_model()

    return c
def _build_multiple_trees(self, chunk_size=CHUNK_SIZE):
    """
    Build an MRPT structure

    Projects every descriptor in ``self._descriptor_set`` onto
    ``self._num_trees`` random bases of ``self._depth`` columns each, then
    builds one tree per basis with ``self._build_single_tree``. The result
    is stored in ``self._trees`` as a list of dictionaries with the keys
    ``'random_basis'``, ``'splits'`` and ``'leaves'``.

    When ``self._rand_seed`` is not None, numpy's global random state is
    seeded with it before the bases are drawn.

    :param chunk_size: Maximum number of descriptors converted to a matrix
        and projected at a time.
    :type chunk_size: int

    """
    sample = next(self._descriptor_set.iterdescriptors())
    sample_v = sample.vector()
    n = self.count()
    d = sample_v.size
    # Only used for the debug report below. ``float`` keeps this a true
    # division even where ``/`` on two ints would floor (Python 2).
    leaf_size = float(n) / (1 << self._depth)

    self._log.debug(
        "Building %d trees (T) of depth %d (l) from %g descriptors (N) "
        "of length %g",
        self._num_trees, self._depth, n, d)
    self._log.debug(
        "Leaf size (L = N/2^l) ~ %g/2^%d = %g",
        n, self._depth, leaf_size)
    self._log.debug(
        "UUIDs stored (T*N) = %g * %g = %g",
        self._num_trees, n, self._num_trees*n)
    self._log.debug(
        "Examined UUIDs (T*L) ~ %g * %g = %g",
        self._num_trees, leaf_size, self._num_trees*leaf_size)
    self._log.debug(
        "Examined/DB size (T*L/N = T/2^l) ~ %g/%g = %.3f",
        self._num_trees*leaf_size, n, self._num_trees*leaf_size/n)

    if (1 << self._depth) > n:
        # ``Logger.warn`` is a deprecated alias of ``Logger.warning``.
        self._log.warning(
            "There are insufficient elements (%d < 2^%d) to populate "
            "all the leaves of the tree. Consider lowering the depth "
            "parameter.", n, self._depth)

    self._log.debug("Projecting onto random bases")
    # Build all the random bases and the projections at the same time
    # (_num_trees * _depth shouldn't really be that high -- if it is,
    # you're a monster)
    if self._rand_seed is not None:
        np.random.seed(self._rand_seed)
    random_bases = np.random.randn(self._num_trees, d, self._depth)
    projs = np.empty((n, self._num_trees, self._depth), dtype=np.float64)
    # Load the data in chunks (because n * d IS high)
    pts_array = np.empty((chunk_size, d), sample_v.dtype)
    # Enumerate the descriptors and div the index by the chunk size
    # (causes each loop to only deal with at most chunk_size descriptors at
    # a time).
    for k, g in groupby(enumerate(self._descriptor_set.iterdescriptors()),
                        lambda pair: pair[0] // chunk_size):
        # Items are still paired so extract the descriptors
        chunk = [desc for (_, desc) in g]
        # Take care of dangling end piece
        k_beg = k * chunk_size
        k_end = min((k+1) * chunk_size, n)
        k_len = k_end - k_beg
        # Run the descriptors through elements_to_matrix, filling the
        # reusable ``pts_array`` buffer.
        elements_to_matrix(
            chunk, mat=pts_array, report_interval=1.0,
            use_multiprocessing=self._use_multiprocessing)
        # Insert into projection matrix; only the first ``k_len`` rows of
        # the buffer belong to this chunk.
        projs[k_beg:k_end] = pts_array[:k_len].dot(random_bases)
    del pts_array

    self._log.debug("Constructing trees")
    desc_ids = list(self._descriptor_set.keys())
    # Start with no trees
    self._trees = []
    for t in range(self._num_trees):
        # Array of splits is a packed tree
        splits = np.empty(((1 << self._depth) - 1,), np.float64)

        self._log.debug("Constructing tree #%d", t+1)

        # Build the tree & store it
        leaves = self._build_single_tree(projs[:, t], splits)
        # Translate the indices in each leaf into descriptor keys.
        leaves = [[desc_ids[idx] for idx in leaf]
                  for leaf in leaves]
        self._trees.append({
            'random_basis': (random_bases[t]),
            'splits': splits,
            'leaves': leaves
        })
def build_index(self, descriptors):
    """
    Build the index over the descriptor data elements.

    The first part of this method is equivalent to the compressITQ function
    from UNC-CH's implementation.

    ``self._mean_vector`` and ``self._r`` are set by this method and, when
    the corresponding cache file paths are set, saved with ``numpy.save``.
    The code index is cleared and then populated with one
    ``(small code, descriptor)`` pair per input descriptor.

    When ``self._rand_seed`` is not None, numpy's global random state is
    seeded with it first.

    :raises RuntimeError: A current data model is loaded, or the current
        CodeIndex is not empty.
    :raises ValueError: No data available in the given iterable.

    :param descriptors: Iterable of descriptor elements to build index
        over.
    :type descriptors:
        collections.Iterable[smqtk.representation.DescriptorElement]

    """
    # Halt if we are going to overwrite a loaded mean/rotation cache.
    if not (self._mean_vector is None and self._r is None):
        raise RuntimeError("Current ITQ model is not empty (cached mean / "
                           "rotation). For the sake of protecting data, we "
                           "are not proceeding.")
    # Halt if the code index currently isn't empty
    if self.count():
        raise RuntimeError("Current CodeIndex instance is not empty. For "
                           "the sake of protecting data, we are not "
                           "proceeding.")

    self._log.debug("Using %d length bit-vectors", self._bit_len)

    # TODO: Sub-sample down descriptors to use for PCA + ITQ
    #       - Harry was also working on an iterative training approach so
    #           that we only have to have a limited number of vectors in
    #           memory at a time.
    # Compare against None so that a seed of 0 is honored as well.
    if self._rand_seed is not None:
        numpy.random.seed(self._rand_seed)

    with SimpleTimer("Creating descriptor cache", self._log.info):
        #: :type: list[smqtk.representation.DescriptorElement]
        descr_cache = list(descriptors)
    if not descr_cache:
        raise ValueError("No descriptors given!")

    with SimpleTimer("Creating matrix of descriptors for training",
                     self._log.info):
        # Get non-memory vectors on separate processes and aggregate into
        # matrix.
        self._log.debug("Input elements: %d", len(descr_cache))
        self._log.debug("Input elem size: %s",
                        descr_cache[0].vector().size)
        # Progress is only reported when debug logging is enabled.
        dbg_report_interval = None
        if self.logger().getEffectiveLevel() <= logging.DEBUG:
            dbg_report_interval = 1.0  # seconds
        x = elements_to_matrix(descr_cache,
                               report_interval=dbg_report_interval)
        self._log.debug("descriptor matrix shape: %s", x.shape)

    with SimpleTimer("Centering data", self._log.info):
        # center the data, VERY IMPORTANT for ITQ to work
        self._mean_vector = numpy.mean(x, axis=0)
        x -= self._mean_vector
    if self._mean_vec_cache_filepath:
        with SimpleTimer("Saving mean vector", self._log.info):
            file_utils.safe_create_dir(
                osp.dirname(self._mean_vec_cache_filepath))
            numpy.save(self._mean_vec_cache_filepath, self._mean_vector)

    # PCA
    with SimpleTimer("Computing PCA transformation", self._log.info):
        # numpy and matlab observation format is flipped, thus added
        # transpose
        self._log.debug("-- computing covariance")
        c = numpy.cov(x.transpose())

        # Direct translation
        # - eigen vectors are the columns of ``pc``
        # NOTE: an SVD-based variant (``numpy.linalg.svd``) existed here
        #   only as commented-out code.
        self._log.debug('-- computing linalg.eig')
        eig_vals, pc = numpy.linalg.eig(c)
        # ordered by greatest eigenvalue magnitude, keeping top ``bit_len``
        self._log.debug('-- computing top pairs')
        top_pairs = sorted(zip(eig_vals, pc.transpose()),
                           key=lambda p: p[0],
                           reverse=True
                           )[:self._bit_len]

        # Eigen-vectors of top ``bit_len`` magnitude eigenvalues
        self._log.debug("-- top vector extraction")
        pc_top = numpy.array([p[1] for p in top_pairs]).transpose()
        self._log.debug("-- transform centered data by PC matrix")
        xx = numpy.dot(x, pc_top)

    # ITQ to find optimal rotation.
    #   `c` is the output codes for matrix `x`
    #   `r` is the rotation found by ITQ
    with SimpleTimer("Performing ITQ to find optimal rotation",
                     self._log.info):
        c, self._r = self._find_itq_rotation(xx, self._itq_iter_num)
        # De-adjust rotation with PC vector
        self._r = numpy.dot(pc_top, self._r)
    if self._rotation_cache_filepath:
        with SimpleTimer("Saving rotation matrix", self._log.info):
            file_utils.safe_create_dir(
                osp.dirname(self._rotation_cache_filepath))
            numpy.save(self._rotation_cache_filepath, self._r)

    # Populating small-code index
    #   - Converting bit-vectors proved faster than creating new codes over
    #       again (~0.01s vs ~0.04s for 80 vectors).
    with SimpleTimer("Clearing code index", self._log.info):
        self._code_index.clear()
    with SimpleTimer("Converting bit-vectors into small codes, inserting "
                     "into code index", self._log.info):
        # Row ``i`` of ``c`` is paired with ``descr_cache[i]``.
        self._code_index.add_many_descriptors(
            (bit_utils.bit_vector_to_int(c[i]), descr_cache[i])
            for i in xrange(c.shape[0])
        )
def _build_index(self, descriptors):
    """
    Internal method to be implemented by sub-classes to build the index
    with the given descriptor data elements.

    Subsequent calls to this method should rebuild the current index. This
    method shall not add to the existing index nor raise an exception to as
    to protect the current index.

    Implementation Notes:
        - We keep a cache file serialization around for our index in case
          sub-processing occurs so as to be able to recover from the
          underlying C data not being there. This could cause issues if a
          main or child process rebuild's the index, as we clear the old
          cache away.
        - The descriptor cache, the FLANN index and the build state are
          each written to their element only when that element is set and
          reports itself writable.

    :param descriptors: Iterable of descriptor elements to build index
        over.
    :type descriptors:
        collections.Iterable[smqtk.representation.DescriptorElement]

    """
    with self._model_lock:
        # Not caring about restoring the index because we're just making a
        # new one.
        self._log.info("Building new FLANN index")

        self._log.debug("Caching descriptor elements")
        self._descr_cache = list(descriptors)
        # Cache descriptors if we have an element
        if self._descr_cache_elem and self._descr_cache_elem.writable():
            self._log.debug("Caching descriptors: %s",
                            self._descr_cache_elem)
            self._descr_cache_elem.set_bytes(
                cPickle.dumps(self._descr_cache, -1)
            )

        params = {
            "target_precision": self._build_target_precision,
            "sample_fraction": self._build_sample_frac,
            # Let FLANN be chatty only when we are debugging ourselves.
            "log_level": ("info"
                          if self._log.getEffectiveLevel() <= logging.DEBUG
                          else "warning")
        }
        if self._build_autotune:
            params['algorithm'] = "autotuned"
        if self._rand_seed is not None:
            params['random_seed'] = self._rand_seed
        pyflann.set_distance_type(self._distance_method)

        self._log.debug("Accumulating descriptor vectors into matrix for "
                        "FLANN")
        pts_array = elements_to_matrix(self._descr_cache,
                                       report_interval=1.0)
        self._log.debug('Building FLANN index')
        self._flann = pyflann.FLANN()
        self._flann_build_params = self._flann.build_index(pts_array,
                                                           **params)
        del pts_array

        if self._index_elem and self._index_elem.writable():
            self._log.debug("Caching index: %s", self._index_elem)
            # FLANN wants to write to a file, so make a temp file, then
            # read it in, putting bytes into element.
            fd, fp = tempfile.mkstemp()
            # Only the path is needed. Closing the OS-level descriptor
            # right away guarantees it cannot leak if ``save_index``
            # raises.
            os.close(fd)
            try:
                self._flann.save_index(fp)
                with open(fp, 'rb') as f:
                    self._index_elem.set_bytes(f.read())
            finally:
                os.remove(fp)
        if self._index_param_elem and self._index_param_elem.writable():
            self._log.debug("Caching index params: %s",
                            self._index_param_elem)
            state = {
                'b_autotune': self._build_autotune,
                'b_target_precision': self._build_target_precision,
                'b_sample_frac': self._build_sample_frac,
                'distance_method': self._distance_method,
                'flann_build_params': self._flann_build_params,
            }
            self._index_param_elem.set_bytes(cPickle.dumps(state, -1))

        # Record which process built the in-memory index.
        self._pid = multiprocessing.current_process().pid
def train(self, positive_classes, negatives):
    """
    Train the supervised SVM classifier model.

    The class label ``negative`` is reserved for the negative class.

    If a model is already loaded, we will raise an exception in order to
    prevent accidental overwrite.

    Positive classes receive integer SVM labels starting at 1 in sorted
    label order; the negative examples receive the SVM label -1. The
    label map and the trained model are written to
    ``self.svm_label_map_fp`` / ``self.svm_model_fp`` when those paths are
    set.

    NOTE:
        This abstract method provides generalized error checking and
        should be called via ``super`` in implementing methods.

    :param positive_classes: Dictionary mapping positive class labels to
        iterables of DescriptorElement training examples.
    :type positive_classes: dict[collections.Hashable,
        collections.Iterable[smqtk.representation.DescriptorElement]]

    :param negatives: Iterable of negative DescriptorElement examples.
    :type negatives:
        collections.Iterable[smqtk.representation.DescriptorElement]

    :raises ValueError: The ``negative`` label was found in the
        ``positive_classes`` dictionary. This is reserved for the negative
        example class.
    :raises ValueError: There were no positive or negative examples.
    :raises RuntimeError: A model already exists in this instance.
        Following through with training would overwrite this model.
        Throwing an exception for information protection.

    """
    super(LibSvmClassifier, self).train(positive_classes, negatives)

    # First integer label handed to a positive class.
    # - label 0 is avoided because we think libSVM wants positive labels
    CLASS_LABEL_OFFSET = 1

    # Debug logging turns on matrix-building progress reports and drops
    # libSVM's quiet flag.
    debugging = self._log.getEffectiveLevel() <= logging.DEBUG
    etm_ri = 1.0 if debugging else None
    param_debug = {} if debugging else {'-q': ''}

    # Form libSVM problem input values
    self._log.debug("Formatting problem input")
    train_labels = []
    train_vectors = []
    positive_sizes = []
    self.svm_label_map = {}
    # Sorting the semantic labels keeps the integer assignment
    # deterministic.
    ordered_labels = sorted(positive_classes)
    for svm_label, sem_label in enumerate(ordered_labels,
                                          CLASS_LABEL_OFFSET):
        # Map integer SVM label to semantic label
        self.svm_label_map[svm_label] = sem_label

        self._log.debug('-- class %d (%s)', svm_label, sem_label)
        # A sequence is required, so expand other iterables into a tuple.
        examples = positive_classes[sem_label]
        if not isinstance(examples, collections.Sequence):
            examples = tuple(examples)

        positive_sizes.append(float(len(examples)))
        mat = self._norm_vector(
            elements_to_matrix(examples, report_interval=etm_ri)
        )
        train_labels.extend([svm_label] * mat.shape[0])
        train_vectors.extend(mat.tolist())
        del examples, mat

    self._log.debug('-- negatives (-1)')
    # Map integer SVM label to semantic label
    self.svm_label_map[-1] = self.NEGATIVE_LABEL
    # A sequence is required here as well.
    if not isinstance(negatives, collections.Sequence):
        negatives = tuple(negatives)
    negatives_size = float(len(negatives))
    mat = self._norm_vector(
        elements_to_matrix(negatives, report_interval=etm_ri)
    )
    train_labels.extend([-1] * mat.shape[0])
    train_vectors.extend(mat.tolist())
    del negatives, mat

    self._log.debug("Training elements: %d labels, %d vectors",
                    len(train_labels), len(train_vectors))

    self._log.debug("Forming train params")
    #: :type: dict
    params = deepcopy(self.train_params)
    params.update(param_debug)
    # Positive class weights are only computed for the C-SVC type, which
    # is selected by '-s' being absent or equal to 0.
    if int(params.get('-s', 0)) == 0:
        for svm_label, size in enumerate(positive_sizes,
                                         CLASS_LABEL_OFFSET):
            weight = max(1.0, negatives_size / float(size))
            params['-w%d' % svm_label] = weight

    self._log.debug("Making parameters obj")
    svm_params = svmutil.svm_parameter(self._gen_param_string(params))
    self._log.debug("Creating SVM problem")
    svm_problem = svm.svm_problem(train_labels, train_vectors)
    self._log.debug("Training SVM model")
    self.svm_model = svmutil.svm_train(svm_problem, svm_params)
    self._log.debug("Training SVM model -- Done")

    label_map_path = self.svm_label_map_fp
    if label_map_path:
        self._log.debug("saving file -- labels -- %s", label_map_path)
        with open(label_map_path, 'wb') as f:
            cPickle.dump(self.svm_label_map, f)
    model_path = self.svm_model_fp
    if model_path:
        self._log.debug("saving file -- model -- %s", model_path)
        svmutil.svm_save_model(model_path, self.svm_model)
def train(self, class_examples=None, **kwds):
    """
    Train the supervised classifier model.

    If a model is already loaded, we will raise an exception in order to
    prevent accidental overwrite.

    If the same label is provided to both ``class_examples`` and ``kwds``,
    the examples given to the reference in ``kwds`` will prevail.

    Classes receive integer SVM labels starting at 1 in sorted label
    order. The label map and the trained model are written to
    ``self.svm_label_map_fp`` / ``self.svm_model_fp`` when those paths are
    set.

    :param class_examples: Dictionary mapping class labels to iterables of
        DescriptorElement training examples.
    :type class_examples: dict[collections.Hashable,
        collections.Iterable[smqtk.representation.DescriptorElement]]

    :param kwds: Keyword assignment of labels to iterables of
        DescriptorElement training examples.
    :type kwds: dict[str,
        collections.Iterable[smqtk.representation.DescriptorElement]]

    :raises ValueError: There were no class examples provided.
    :raises ValueError: Less than 2 classes were given.
    :raises RuntimeError: A model already exists in this instance.
        Following through with training would overwrite this model.
        Throwing an exception for information protection.

    """
    # The parent implementation returns the class examples to train on.
    class_examples = \
        super(LibSvmClassifier, self).train(class_examples, **kwds)

    # Offset from 0 for positive class labels to use
    # - not using label of 0 because we think libSVM wants positive labels
    CLASS_LABEL_OFFSET = 1

    # Stuff for debug reporting: libSVM is run quiet ('-q') and matrix
    # construction is not progress-reported unless debug logging is on.
    etm_ri = None
    param_debug = {'-q': ''}
    if self._log.getEffectiveLevel() <= logging.DEBUG:
        etm_ri = 1.0
        param_debug = {}

    # Form libSVM problem input values
    self._log.debug("Formatting problem input")
    train_labels = []
    train_vectors = []
    train_group_sizes = []  # number of examples per class
    self.svm_label_map = {}
    # Making SVM label assignment deterministic to alphabetic order
    for i, l in enumerate(sorted(class_examples), CLASS_LABEL_OFFSET):
        # Map integer SVM label to semantic label
        self.svm_label_map[i] = l

        self._log.debug('-- class %d (%s)', i, l)
        # requires a sequence, so making the iterable ``g`` a tuple
        g = class_examples[l]
        if not isinstance(g, collections.Sequence):
            self._log.debug(' (expanding iterable into sequence)')
            g = tuple(g)

        train_group_sizes.append(float(len(g)))
        x = elements_to_matrix(g, report_interval=etm_ri)
        x = self._norm_vector(x)
        train_labels.extend([i] * x.shape[0])
        train_vectors.extend(x.tolist())
        del g, x

    # Internal invariant: labels and vectors are built in lock-step above.
    assert len(train_labels) == len(train_vectors), \
        "Count mismatch between parallel labels and descriptor vectors " \
        "being sent to libSVM (%d != %d)" \
        % (len(train_labels), len(train_vectors))

    self._log.debug("Forming train params")
    #: :type: dict
    params = deepcopy(self.train_params)
    params.update(param_debug)
    # Calculating class weights for C-SVC SVM
    if '-s' not in params or int(params['-s']) == 0:
        total_examples = sum(train_group_sizes)
        for i, n in enumerate(train_group_sizes, CLASS_LABEL_OFFSET):
            # weight is the ratio of between number of other-class examples
            # to the number of examples in this class, floored at 1.0.
            other_class_examples = total_examples - n
            w = max(1.0, other_class_examples / float(n))
            params['-w' + str(i)] = w
            self._log.debug("-- class '%s' weight: %s",
                            self.svm_label_map[i], w)

    self._log.debug("Making parameters obj")
    svm_params = svmutil.svm_parameter(self._gen_param_string(params))
    self._log.debug("Creating SVM problem")
    svm_problem = svm.svm_problem(train_labels, train_vectors)
    del train_vectors
    self._log.debug("Training SVM model")
    self.svm_model = svmutil.svm_train(svm_problem, svm_params)
    self._log.debug("Training SVM model -- Done")

    if self.svm_label_map_fp:
        self._log.debug("saving file -- labels -- %s",
                        self.svm_label_map_fp)
        with open(self.svm_label_map_fp, 'wb') as f:
            cPickle.dump(self.svm_label_map, f, -1)
    if self.svm_model_fp:
        self._log.debug("saving file -- model -- %s", self.svm_model_fp)
        svmutil.svm_save_model(self.svm_model_fp, self.svm_model)
def _train(self, class_examples, **extra_params):
    """
    Internal method that trains the classifier implementation.

    This method is called after checking that there is not already a model
    trained, thus it can be assumed that no model currently exists.

    The class labels will have already been checked before entering this
    method, so it can be assumed that the ``class_examples`` will contain
    at least two classes.

    On completion ``self.svm_label_map`` (integer SVM label -> semantic
    label) and ``self.svm_model`` are set. If ``self.svm_label_map_elem``
    and/or ``self.svm_model_elem`` are set and writable, the label map and
    the trained model are also serialized into those elements.

    :param class_examples: Dictionary mapping class labels to iterables of
        DescriptorElement training examples.
    :type class_examples: dict[collections.Hashable,
             collections.Iterable[smqtk.representation.DescriptorElement]]

    :param extra_params: Dictionary with extra parameters for training.
        This is not used by this implementation.
    :type extra_params: None | dict[basestring, object]

    """
    # Offset from 0 for positive class labels to use
    # - not using label of 0 because we think libSVM wants positive labels
    CLASS_LABEL_OFFSET = 1

    # Stuff for debug reporting: unless debug logging is active, pass the
    # '-q' flag to libSVM and disable matrix-construction progress reports.
    etm_ri = None
    param_debug = {'-q': ''}
    if self._log.getEffectiveLevel() <= logging.DEBUG:
        etm_ri = 1.0
        param_debug = {}

    # Form libSVM problem input values
    self._log.debug("Formatting problem input")
    train_labels = []
    train_vectors = []
    train_group_sizes = []  # number of examples per class
    self.svm_label_map = {}
    # Making SVM label assignment deterministic to alphabetic order
    for i, label in enumerate(sorted(class_examples), CLASS_LABEL_OFFSET):
        # Map integer SVM label to semantic label
        self.svm_label_map[i] = label

        self._log.debug('-- class %d (%s)', i, label)
        # requires a sequence, so making the iterable ``g`` a tuple
        # NOTE(review): the ``collections.Sequence`` alias was removed in
        # Python 3.10; switch to ``collections.abc.Sequence`` if this file
        # is meant to run there.
        g = class_examples[label]
        if not isinstance(g, collections.Sequence):
            self._log.debug('   (expanding iterable into sequence)')
            g = tuple(g)

        train_group_sizes.append(float(len(g)))
        x = elements_to_matrix(g, report_interval=etm_ri)
        x = self._norm_vector(x)
        train_labels.extend([i] * x.shape[0])
        train_vectors.extend(x.tolist())
        # Release the per-class intermediates before the next class.
        del g, x

    assert len(train_labels) == len(train_vectors), \
        "Count mismatch between parallel labels and descriptor vectors " \
        "being sent to libSVM (%d != %d)" \
        % (len(train_labels), len(train_vectors))

    self._log.debug("Forming train params")
    #: :type: dict
    params = deepcopy(self.train_params)
    params.update(param_debug)
    # Calculating class weights if set to C-SVC type SVM
    if '-s' not in params or int(params['-s']) == 0:
        # (john.moeller): The weighting should probably be the geometric
        # mean of the number of examples over the classes divided by the
        # number of examples for the current class.
        gmean = scipy.stats.gmean(train_group_sizes)

        for i, n in enumerate(train_group_sizes, CLASS_LABEL_OFFSET):
            w = gmean / n
            params['-w' + str(i)] = w
            self._log.debug("-- class '%s' weight: %s",
                            self.svm_label_map[i], w)

    self._log.debug("Making parameters obj")
    svm_params = svmutil.svm_parameter(self._gen_param_string(params))
    self._log.debug("Creating SVM problem")
    svm_problem = svm.svm_problem(train_labels, train_vectors)
    # The problem object has been built; drop our copy of the vectors.
    del train_vectors
    self._log.debug("Training SVM model")
    self.svm_model = svmutil.svm_train(svm_problem, svm_params)
    self._log.debug("Training SVM model -- Done")

    if self.svm_label_map_elem and self.svm_label_map_elem.writable():
        self._log.debug("saving labels to element (%s)",
                        self.svm_label_map_elem)
        self.svm_label_map_elem.set_bytes(
            cPickle.dumps(self.svm_label_map, -1))
    if self.svm_model_elem and self.svm_model_elem.writable():
        self._log.debug("saving model to element (%s)",
                        self.svm_model_elem)
        # LibSvm I/O only works with filepaths, thus the need for an
        # intermediate temporary file.
        fd, fp = tempfile.mkstemp()
        try:
            # Wrap the descriptor in a file object *before* saving so that
            # it is closed on every exit path, including a failure inside
            # ``svm_save_model``. Reading starts at offset 0, so the bytes
            # libSVM writes through the path are what we read back.
            # fdopen() is required because in Python 2 open() does not
            # accept a file descriptor.
            with os.fdopen(fd, 'rb') as f:
                svmutil.svm_save_model(fp, self.svm_model)
                self.svm_model_elem.set_bytes(f.read())
        finally:
            os.remove(fp)
def _build_multiple_trees(self, chunk_size=CHUNK_SIZE):
    """
    Build an MRPT structure

    Projects every descriptor in ``self._descriptor_set`` onto
    ``self._num_trees`` sets of ``self._depth`` random bases, builds one
    tree per set via ``self._build_single_tree`` and stores the result in
    ``self._trees`` as a list of dictionaries with the keys
    ``'random_basis'``, ``'splits'`` and ``'leaves'``.

    If ``self._rand_seed`` is not None, the global numpy random state is
    re-seeded with it before the bases are drawn.

    :param chunk_size: Maximum number of descriptors whose vectors are
        loaded into memory at one time while computing projections.
    :type chunk_size: int

    """
    sample = next(self._descriptor_set.iterdescriptors())
    sample_v = sample.vector()
    n = self.count()
    d = sample_v.size
    leaf_size = n / (1 << self._depth)

    self._log.debug(
        "Building %d trees (T) of depth %d (l) from %g descriptors (N) "
        "of length %g",
        self._num_trees, self._depth, n, d)
    self._log.debug("Leaf size             (L = N/2^l)  ~ %g/2^%d = %g",
                    n, self._depth, leaf_size)
    self._log.debug("UUIDs stored                (T*N)  = %g * %g = %g",
                    self._num_trees, n, self._num_trees * n)
    self._log.debug("Examined UUIDs              (T*L)  ~ %g * %g = %g",
                    self._num_trees, leaf_size,
                    self._num_trees * leaf_size)
    self._log.debug("Examined/DB size  (T*L/N = T/2^l)  ~ %g/%g = %.3f",
                    self._num_trees * leaf_size, n,
                    self._num_trees * leaf_size / n)

    if (1 << self._depth) > n:
        # ``Logger.warn`` is a deprecated alias; use ``warning``.
        self._log.warning(
            "There are insufficient elements (%d < 2^%d) to populate "
            "all the leaves of the tree. Consider lowering the depth "
            "parameter.", n, self._depth)

    self._log.debug("Projecting onto random bases")
    # Build all the random bases and the projections at the same time
    # (_num_trees * _depth shouldn't really be that high -- if it is,
    # you're a monster)
    if self._rand_seed is not None:
        np.random.seed(self._rand_seed)
    random_bases = np.random.randn(self._num_trees, d, self._depth)
    projs = np.empty((n, self._num_trees, self._depth), dtype=np.float64)
    # Load the data in chunks (because n * d IS high)
    pts_array = np.empty((chunk_size, d), sample_v.dtype)
    # Enumerate the descriptors and div the index by the chunk size
    # (causes each loop to only deal with at most chunk_size descriptors at
    # a time).
    for k, g in groupby(enumerate(self._descriptor_set.iterdescriptors()),
                        lambda pair: pair[0] // chunk_size):
        # Items are still paired so extract the descriptors
        chunk = [desc for (_, desc) in g]
        # Take care of dangling end piece
        k_beg = k * chunk_size
        k_end = min((k + 1) * chunk_size, n)
        k_len = k_end - k_beg
        # Run the descriptors through elements_to_matrix
        elements_to_matrix(chunk, mat=pts_array, report_interval=1.0,
                           use_multiprocessing=self._use_multiprocessing)
        # Insert into projection matrix; only the first ``k_len`` rows of
        # the reused buffer belong to this chunk.
        projs[k_beg:k_end] = pts_array[:k_len].dot(random_bases)
    del pts_array

    self._log.debug("Constructing trees")
    # NOTE(review): row ``idx`` of ``projs`` is mapped to ``desc_ids[idx]``
    # below, which assumes ``keys()`` and ``iterdescriptors()`` iterate in
    # the same order -- confirm against the descriptor set implementation.
    desc_ids = list(self._descriptor_set.keys())
    # Start with no trees
    self._trees = []
    for t in range(self._num_trees):
        # Array of splits is a packed tree
        splits = np.empty(((1 << self._depth) - 1,), np.float64)

        self._log.debug("Constructing tree #%d", t + 1)

        # Build the tree & store it
        leaves = self._build_single_tree(projs[:, t], splits)
        # Translate row indices in each leaf into descriptor keys.
        leaves = [[desc_ids[idx] for idx in leaf] for leaf in leaves]
        self._trees.append({
            'random_basis': (random_bases[t]),
            'splits': splits,
            'leaves': leaves
        })
def _nn(self, d, n=1):
    """
    Internal method to be implemented by sub-classes to return the nearest
    `N` neighbors to the given descriptor element.

    When this internal method is called, we have already checked that there
    is a vector in ``d`` and our index is not empty.

    If no candidate descriptors are found for the hashes near the query
    (or the top-``n`` slice is empty), a pair of empty tuples is returned
    so that the result can always be unpacked into two values.

    :param d: Descriptor element to compute the neighbors of.
    :type d: smqtk.representation.DescriptorElement

    :param n: Number of nearest neighbors to find.
    :type n: int

    :return: Tuple of nearest N DescriptorElement instances, and a tuple of
        the distance values to those neighbors.
    :rtype: (tuple[smqtk.representation.DescriptorElement], tuple[float])

    """
    self._log.debug("generating hash for descriptor")
    d_v = d.vector()
    d_h = self.lsh_functor.get_hash(d_v)

    with self._model_lock:
        self._log.debug("getting near hashes")
        hi = self.hash_index
        if hi is None:
            # Make on-the-fly linear index
            hi = LinearHashIndex()
            # not calling ``build_index`` because we already have the int
            # hashes.
            hi.index = set(self.hash2uuids_kvstore.keys())
        near_hashes, _ = hi.nn(d_h, n)

        self._log.debug("getting UUIDs of descriptors for nearby hashes")
        neighbor_uuids = []
        for h_int in map(bit_vector_to_int_large, near_hashes):
            # If descriptor hash not in our map, we effectively skip it.
            # Get set of descriptor UUIDs for a hash code.
            #: :type: set[collections.Hashable]
            near_uuids = self.hash2uuids_kvstore.get(h_int, set())
            # Accumulate matching descriptor UUIDs to a list.
            neighbor_uuids.extend(near_uuids)
        self._log.debug("-- matched %d UUIDs", len(neighbor_uuids))

        self._log.debug("getting descriptors for neighbor_uuids")
        neighbors = \
            list(self.descriptor_index.get_many_descriptors(neighbor_uuids))

    # Done with model parts at this point, so releasing lock.

    if not neighbors:
        # ``zip(*[])`` yields nothing, so the empty result has to be built
        # explicitly to keep the documented two-element return shape.
        return (), ()

    self._log.debug("ordering descriptors via distance method '%s'",
                    self.distance_method)
    self._log.debug('-- getting element vectors')
    neighbor_vectors = elements_to_matrix(neighbors, report_interval=1.0)
    self._log.debug('-- calculating distances')
    distances = [self._distance_function(d_v, d2_v)
                 for d2_v in neighbor_vectors]
    self._log.debug('-- ordering')
    ordered = sorted(zip(neighbors, distances), key=lambda p: p[1])
    self._log.debug('-- slicing top n=%d', n)
    top = ordered[:n]
    if not top:
        # e.g. n == 0: same empty-unpacking concern as above.
        return (), ()
    # Split the (descriptor, distance) pairs into two parallel tuples.
    top_neighbors, top_distances = zip(*top)
    return top_neighbors, top_distances