def _svm_model_hik_helper(i, j, i_feat, j_feat):
    """
    Asynchronous HIK compute helper for a single ``(i, j)`` pair.

    Emits a debug message naming the pair, then returns the result of
    ``histogram_intersection_distance`` applied to the two given features.

    :param i: Index of the first feature (only used in the log message).
    :param j: Index of the second feature (only used in the log message).
    :param i_feat: Feature associated with index ``i``.
    :param j_feat: Feature associated with index ``j``.

    :return: Whatever ``histogram_intersection_distance(i_feat, j_feat)``
        returns.

    """
    logging.getLogger("_svm_model_hik_helper").debug(
        "Computing HIK for [%d, %d]", i, j
    )
    # noinspection PyUnresolvedReferences
    return histogram_intersection_distance(i_feat, j_feat)
def test_hi_input_format(self):
    # The general-form distance function must accept any mix of vectors
    # and matrices, following its documented rules.
    hi_dist = df.histogram_intersection_distance
    vec_vs_mat = [0., 1., 0.5]

    # vector vs. vector -> scalar
    ntools.assert_equal(hi_dist(self.v4, self.v3), 0.5)

    # vector vs. matrix, in either argument order -> one value per row
    np.testing.assert_array_equal(hi_dist(self.v2, self.m1), vec_vs_mat)
    np.testing.assert_array_equal(hi_dist(self.m1, self.v2), vec_vs_mat)

    # matrix vs. itself -> zero distance for every row
    np.testing.assert_array_equal(hi_dist(self.m1, self.m1), [0, 0, 0])

    # this pair of matrices is expected to be rejected
    ntools.assert_raises(ValueError, hi_dist, self.m1, self.m2)
def rank(self, pos, neg):
    """
    Rank the currently indexed elements given ``pos`` positive and ``neg``
    negative exemplar descriptor elements.

    An SVM model is trained on the exemplar vectors, and Platt scaling over
    the HI distances between the model's support vectors and the indexed
    descriptor matrix yields one probability per indexed element.

    :param pos: Iterable of positive exemplar DescriptorElement instances.
        This implementation requires at least one.
    :type pos: collections.Iterable[smqtk.representation.DescriptorElement]

    :param neg: Iterable of negative exemplar DescriptorElement instances.
        May be ``None`` or empty, in which case, for each positive
        example, the ``self._autoneg_select_ratio`` most distant indexed
        descriptors (by HI distance) are used as negatives.
    :type neg: collections.Iterable[smqtk.representation.DescriptorElement]

    :raises ValueError: No positive examples were given, or no negative
        examples were given or could be auto-selected.
    :raises RuntimeError: The trained SVM model reports ``l == 0``.

    :return: Map of indexed descriptor elements to a rank value between
        [0, 1] (inclusive) range, where a 1.0 means most relevant and 0.0
        meaning least relevant.
    :rtype: dict[smqtk.representation.DescriptorElement, float]

    """
    # Notes:
    #   - Pos and neg exemplars may be in our index.

    #
    # SVM model training
    #
    # Materialize both exemplar collections up front. ``pos`` is iterated
    # repeatedly, and ``neg`` must be tested for emptiness: a one-shot
    # iterator is always truthy, and ``None`` cannot be iterated at all.
    #: :type: set[smqtk.representation.DescriptorElement]
    pos = set(pos) if pos is not None else set()
    #: :type: list[smqtk.representation.DescriptorElement]
    neg = list(neg) if neg is not None else []

    # Creating training matrix and labels
    train_labels = []
    train_vectors = []
    for d in pos:
        train_labels.append(+1)
        train_vectors.append(d.vector().tolist())
    num_pos = len(train_vectors)
    self._log.debug("Positives given: %d", num_pos)

    # When no negative examples are given, naively pick the most distant
    # examples in our dataset, using HI metric, for each positive example
    neg_autoselect = set()
    if not neg:
        self._log.info(
            "Auto-selecting negative examples. (%d per positive)",
            self._autoneg_select_ratio)
        num_select = max(int(self._autoneg_select_ratio), 0)
        # ``train_vectors`` only composed of positive examples at this point
        for p in pos:
            # where d is the distance vector to descriptor elements in cache
            d = histogram_intersection_distance(p.vector(),
                                                self._descr_matrix)
            # Indices of the ``num_select`` largest distances. Sorting by
            # index (instead of keying a dict on the distance value) keeps
            # tied distances distinct and bounds the pick count exactly.
            for i in numpy.argsort(d)[::-1][:num_select]:
                neg_autoselect.add(self._descr_cache[int(i)])
        # Remove any positive examples from auto-selected results
        neg_autoselect.difference_update(pos)
        self._log.debug("Auto-selected negative descriptors [%d]: %s",
                        len(neg_autoselect), neg_autoselect)

    for d in neg:
        train_labels.append(-1)
        train_vectors.append(d.vector().tolist())
    for d in neg_autoselect:
        train_labels.append(-1)
        train_vectors.append(d.vector().tolist())
    num_neg = len(train_vectors) - num_pos

    if not num_pos:
        raise ValueError("No positive examples provided.")
    elif not num_neg:
        raise ValueError("No negative examples provided.")

    # Training SVM model
    self._log.debug("online model training")
    svm_problem = svm.svm_problem(train_labels, train_vectors)
    svm_model = svmutil.svm_train(
        svm_problem, self._gen_svm_parameter_string(num_pos, num_neg))
    if svm_model.l == 0:
        raise RuntimeError("SVM Model learning failed")

    #
    # Platt Scaling for probability rankings
    #
    self._log.debug("making test distance matrix")
    # Number of support vectors
    # Q: is this always the same as ``svm_model.l``?
    num_SVs = sum(svm_model.nSV[:svm_model.nr_class])
    # Support vector dimensionality
    dim_SVs = len(train_vectors[0])
    # initialize matrix they're going into
    svm_SVs = numpy.ndarray((num_SVs, dim_SVs), dtype=float)
    for i, nlist in enumerate(svm_model.SV[:svm_SVs.shape[0]]):
        svm_SVs[i, :] = [n.value for n in nlist[:dim_SVs]]
    # compute matrix of distances from support vectors to index elements
    # TODO: Optimize this step by caching SV distance vectors
    #       - It is known that SVs are vectors from the training data, so
    #         if the same descriptors are given to this function
    #         repeatedly (which is the case for IQR), this can be faster
    #         because we're only computing at most a few more distance
    #         vectors against our indexed descriptor matrix, and the rest
    #         have already been computed before.
    #       - At worst, we're effectively doing this call because each SV
    #         needs to have its distance vector computed.
    svm_test_k = compute_distance_matrix(svm_SVs, self._descr_matrix,
                                         histogram_intersection_distance,
                                         row_wise=True)

    self._log.debug("Platt scalling")
    # the actual platt scaling stuff
    weights = numpy.array(svm_model.get_sv_coef()).flatten()
    margins = numpy.dot(weights, svm_test_k)
    rho = svm_model.rho[0]
    probA = svm_model.probA[0]
    probB = svm_model.probB[0]
    #: :type: numpy.core.multiarray.ndarray
    probs = 1.0 / (1.0 + numpy.exp((margins - rho) * probA + probB))

    # Detect whether we need to flip probabilities
    # - Probability of input positive examples should have a high
    #   probability score among the generated probabilities of our index.
    # - If the positive example probabilities show to be in the lower 50%,
    #   flip the generated probabilities, since its experimentally known
    #   that the SVM will change which index it uses to represent a
    #   particular class label occasionally, which influences the Platt
    #   scaling apparently.
    pos_vectors = numpy.array(train_vectors[:num_pos])
    pos_test_k = compute_distance_matrix(svm_SVs, pos_vectors,
                                         histogram_intersection_distance,
                                         row_wise=True)
    pos_margins = numpy.dot(weights, pos_test_k)
    #: :type: numpy.core.multiarray.ndarray
    pos_probs = 1.0 / (1.0 + numpy.exp((pos_margins - rho) * probA + probB))
    # Check if average positive probability is less than the average index
    # probability. If so, the platt scaling probably needs to be flipped.
    if (pos_probs.sum() / pos_probs.size) < (probs.sum() / probs.size):
        self._log.debug("inverting probabilities")
        probs = 1. - probs

    # Pair each indexed descriptor with its probability
    rank_pool = dict(zip(self._descr_cache, probs))
    return rank_pool
def generate_model(self, descriptor_map, parallel=None, **kwargs):
    """
    Generate this indexers data-model using the given features,
    saving it to files in the configured data directory.

    The model consists of the UID list, the feature matrix (one row per
    UID, in sorted UID order) and the symmetric pair-wise HI distance
    matrix between all rows.

    :raises RuntimeError: Precaution error when there is an existing data
        model for this indexer. Manually delete or move the existing
        model before computing another one.

        Specific implementations may error on other things. See the
        specific implementations for more details.

    :raises ValueError: The given feature map had no content.

    :param descriptor_map: Mapping of integer IDs to feature data. All
        feature data must be of the same size!
    :type descriptor_map: dict of (int, numpy.core.multiarray.ndarray)

    :param parallel: Optionally specification of how many processors to
        use when pooling sub-tasks. If None, we attempt to use all
        available cores. NOTE: not referenced by this implementation's
        body; the kernel is computed serially.
    :type parallel: int

    """
    if self.has_model():
        raise RuntimeError("WARNING: This implementation already has a "
                           "model generated. These can take a long time to "
                           "generate, thus we require external manual "
                           "removal of modal files before we will generate "
                           "a new model.")

    # Fail with the documented exception instead of an IndexError below.
    if not descriptor_map:
        raise ValueError("The given feature map had no content.")

    num_features = len(descriptor_map)
    ordered_uids = sorted(descriptor_map.keys())

    sample_feature = descriptor_map[ordered_uids[0]]
    feature_len = len(sample_feature)
    feature_dtype = sample_feature.dtype
    # Distances are fractional: an integer feature dtype would truncate
    # them, so fall back to float unless features are already floating.
    if numpy.issubdtype(feature_dtype, numpy.floating):
        distance_dtype = feature_dtype
    else:
        distance_dtype = float

    # Pre-allocating arrays
    #: :type: list[collections.Hashable]
    self._uid_array = []
    self._feature_mat = numpy.zeros(
        (num_features, feature_len), dtype=feature_dtype
    )
    self._distance_mat = numpy.zeros(
        (num_features, num_features), dtype=distance_dtype
    )

    with SimpleTimer("Populating feature matrix", self.log.info):
        # Walk the sorted UIDs so row order is deterministic rather than
        # dependent on the mapping's iteration order.
        for i, uid in enumerate(ordered_uids):
            self._uid_array.append(uid)
            self._feature_mat[i] = descriptor_map[uid]

    with SimpleTimer("Computing HI matrix kernel", self.log.info):
        # Using [process] Pool here with large sets eats far too much RAM.
        # Using a ThreadPool here is actually much slower. Not sure why?
        # Only the upper triangle is computed; each value is mirrored.
        for i in range(num_features):
            for j in range(i, num_features):
                self._distance_mat[i, j] = self._distance_mat[j, i] = \
                    histogram_intersection_distance(self._feature_mat[i],
                                                    self._feature_mat[j])

    with SimpleTimer("Saving data files", self.log.info):
        safe_create_dir(self.data_dir)
        with open(self.uid_list_filepath, 'wb') as ofile:
            cPickle.dump(self._uid_array, ofile)
        numpy.save(self.feature_mat_filepath, self._feature_mat)
        numpy.save(self.distance_mat_filepath, self._distance_mat)
def rank(self, pos, neg):
    """
    Rank the currently indexed elements given ``pos`` positive and ``neg``
    negative exemplar descriptor elements.

    An SVM model is trained on the exemplar vectors, and Platt scaling over
    the HI distances between the model's support vectors and the indexed
    descriptor matrix yields one probability per indexed element.

    :param pos: Iterable of positive exemplar DescriptorElement instances.
        This implementation requires at least one.
    :type pos: collections.Iterable[smqtk.representation.DescriptorElement]

    :param neg: Iterable of negative exemplar DescriptorElement instances.
        May be ``None`` or empty, in which case, for each positive
        example, the ``self._autoneg_select_ratio`` most distant indexed
        descriptors (by HI distance) are used as negatives.
    :type neg: collections.Iterable[smqtk.representation.DescriptorElement]

    :raises ValueError: No positive examples were given, or no negative
        examples were given or could be auto-selected.
    :raises RuntimeError: The trained SVM model reports ``l == 0``.

    :return: Map of indexed descriptor elements to a rank value between
        [0, 1] (inclusive) range, where a 1.0 means most relevant and 0.0
        meaning least relevant.
    :rtype: dict[smqtk.representation.DescriptorElement, float]

    """
    # Notes:
    #   - Pos and neg exemplars may be in our index.

    #
    # SVM model training
    #
    # Materialize both exemplar collections up front. ``pos`` is iterated
    # repeatedly, and ``neg`` must be tested for emptiness: a one-shot
    # iterator is always truthy, and ``None`` cannot be iterated at all.
    #: :type: set[smqtk.representation.DescriptorElement]
    pos = set(pos) if pos is not None else set()
    #: :type: list[smqtk.representation.DescriptorElement]
    neg = list(neg) if neg is not None else []

    # Creating training matrix and labels
    train_labels = []
    train_vectors = []
    for d in pos:
        train_labels.append(+1)
        train_vectors.append(d.vector().tolist())
    num_pos = len(train_vectors)

    # When no negative examples are given, naively pick the most distant
    # examples in our dataset, using HI metric, for each positive example
    neg_autoselect = set()
    if not neg:
        self._log.info("Auto-selecting negative examples.")
        num_select = max(int(self._autoneg_select_ratio), 0)
        # ``train_vectors`` only composed of positive examples at this point
        for p in pos:
            # where d is the distance vector to descriptor elements in cache
            d = histogram_intersection_distance(p.vector(),
                                                self._descr_matrix)
            # Indices of the ``num_select`` largest distances. Sorting by
            # index (instead of keying a dict on the distance value) keeps
            # tied distances distinct and bounds the pick count exactly.
            for i in numpy.argsort(d)[::-1][:num_select]:
                neg_autoselect.add(self._descr_cache[int(i)])
        # Remove any positive examples from auto-selected results
        neg_autoselect.difference_update(pos)
        self._log.debug("Auto-selected negative descriptors: %s",
                        neg_autoselect)

    for d in neg:
        train_labels.append(-1)
        train_vectors.append(d.vector().tolist())
    for d in neg_autoselect:
        train_labels.append(-1)
        train_vectors.append(d.vector().tolist())
    num_neg = len(train_vectors) - num_pos

    if not num_pos:
        raise ValueError("No positive examples provided.")
    elif not num_neg:
        raise ValueError("No negative examples provided.")

    # Training SVM model
    svm_problem = svm.svm_problem(train_labels, train_vectors)
    svm_model = svmutil.svm_train(
        svm_problem, self._gen_svm_parameter_string(num_pos, num_neg))
    if svm_model.l == 0:
        raise RuntimeError("SVM Model learning failed")

    #
    # Platt Scaling for probability rankings
    #
    # Number of support vectors
    # Q: is this always the same as ``svm_model.l``?
    num_SVs = sum(svm_model.nSV[:svm_model.nr_class])
    # Support vector dimensionality
    dim_SVs = len(train_vectors[0])
    # initialize matrix they're going into
    svm_SVs = numpy.ndarray((num_SVs, dim_SVs), dtype=float)
    for i, nlist in enumerate(svm_model.SV[:svm_SVs.shape[0]]):
        svm_SVs[i, :] = [n.value for n in nlist[:dim_SVs]]
    # compute matrix of distances from support vectors to index elements
    # TODO: Optimize this step by caching SV distance vectors
    #       - It is known that SVs are vectors from the training data, so
    #         if the same descriptors are given to this function
    #         repeatedly (which is the case for IQR), this can be faster
    #         because we're only computing at most a few more distance
    #         vectors against our indexed descriptor matrix, and the rest
    #         have already been computed before.
    #       - At worst, we're effectively doing this call because each SV
    #         needs to have its distance vector computed.
    svm_test_k = compute_distance_matrix(svm_SVs, self._descr_matrix,
                                         histogram_intersection_distance,
                                         row_wise=True)

    # the actual platt scaling stuff
    weights = numpy.array(svm_model.get_sv_coef()).flatten()
    margins = numpy.dot(weights, svm_test_k)
    rho = svm_model.rho[0]
    probA = svm_model.probA[0]
    probB = svm_model.probB[0]
    #: :type: numpy.core.multiarray.ndarray
    probs = 1.0 / (1.0 + numpy.exp((margins - rho) * probA + probB))

    # Detect whether we need to flip probabilities
    # - Probability of input positive examples should have a high
    #   probability score among the generated probabilities of our index.
    # - If the positive example probabilities show to be in the lower 50%,
    #   flip the generated probabilities, since its experimentally known
    #   that the SVM will change which index it uses to represent a
    #   particular class label occasionally, which influences the Platt
    #   scaling apparently.
    pos_vectors = numpy.array(train_vectors[:num_pos])
    pos_test_k = compute_distance_matrix(svm_SVs, pos_vectors,
                                         histogram_intersection_distance,
                                         row_wise=True)
    pos_margins = numpy.dot(weights, pos_test_k)
    #: :type: numpy.core.multiarray.ndarray
    pos_probs = 1.0 / (1.0 + numpy.exp((pos_margins - rho) * probA + probB))
    # Check if average positive probability is less than the average index
    # probability. If so, the platt scaling probably needs to be flipped.
    if (pos_probs.sum() / pos_probs.size) < (probs.sum() / probs.size):
        self._log.debug("inverting probabilities")
        probs = 1. - probs

    # Pair each indexed descriptor with its probability
    rank_pool = dict(zip(self._descr_cache, probs))
    return rank_pool