Example 1
import logging

# ``histogram_intersection_distance`` is assumed importable from the
# package's distance-function utilities; the exact module path may vary.
from smqtk.utils.distance_functions import histogram_intersection_distance


def _svm_model_hik_helper(i, j, i_feat, j_feat):
    """
    Helper for asynchronously computing the histogram intersection (HIK)
    distance between the feature vectors of elements ``i`` and ``j``.
    """
    log = logging.getLogger("_svm_model_hik_helper")
    log.debug("Computing HIK for [%d, %d]", i, j)
    ij_hik = histogram_intersection_distance(i_feat, j_feat)
    return ij_hik
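For context, a minimal sketch of the distance being computed, assuming
L1-normalized histograms; the real ``histogram_intersection_distance`` lives
in the package's distance utilities and also handles the vector/matrix input
combinations exercised in Example 2:

import numpy as np

def histogram_intersection_distance(a, b):
    # The histogram intersection similarity is sum(min(a_i, b_i)); for
    # L1-normalized histograms it lies in [0, 1], so its complement acts
    # as a distance. If ``b`` is a matrix with one histogram per row,
    # broadcasting yields one distance per row.
    return 1.0 - np.minimum(a, b).sum(axis=-1)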
Example 2
    def test_hi_input_format(self):
        # the general form method should be able to take any combination of
        # vectors and matrices, following documented rules.

        ntools.assert_equal(
            df.histogram_intersection_distance(self.v4, self.v3), 0.5)

        np.testing.assert_array_equal(
            df.histogram_intersection_distance(self.v2, self.m1),
            [0., 1., 0.5])
        np.testing.assert_array_equal(
            df.histogram_intersection_distance(self.m1, self.v2),
            [0., 1., 0.5])

        np.testing.assert_array_equal(
            df.histogram_intersection_distance(self.m1, self.m1), [0, 0, 0])

        ntools.assert_raises(ValueError, df.histogram_intersection_distance,
                             self.m1, self.m2)
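The fixtures are defined elsewhere in the test class; a hypothetical
reconstruction that satisfies the assertions above, assuming the
``1 - sum(min(...))`` distance from Example 1 and row-by-row pairing when
both inputs are matrices:

import numpy as np

v2 = np.array([1., 0.])        # vs. the rows of m1 -> [0., 1., 0.5]
v3 = np.array([0.5, 0.5])
v4 = np.array([1., 0.])        # d(v4, v3) = 1 - (0.5 + 0.0) = 0.5
m1 = np.array([[1., 0.],
               [0., 1.],
               [0.5, 0.5]])    # paired with itself row-wise -> [0, 0, 0]
m2 = np.array([[1., 0.],
               [0., 1.]])      # 2 rows vs. m1's 3 rows -> ValueError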
Example 3
    def rank(self, pos, neg):
        """
        Rank the currently indexed elements given ``pos`` positive and ``neg``
        negative exemplar descriptor elements.

        :param pos: Iterable of positive exemplar DescriptorElement instances.
            This may be optional for some implementations.
        :type pos: collections.Iterable[smqtk.representation.DescriptorElement]

        :param neg: Iterable of negative exemplar DescriptorElement instances.
            This may be optional for some implementations.
        :type neg: collections.Iterable[smqtk.representation.DescriptorElement]

        :return: Map of indexed descriptor elements to a rank value in the
            [0, 1] range (inclusive), where 1.0 means most relevant and 0.0
            means least relevant.
        :rtype: dict[smqtk.representation.DescriptorElement, float]

        """
        # Notes:
        # - Pos and neg exemplars may be in our index.

        #
        # SVM model training
        #
        # Copy pos descriptors into a set for repeated iteration
        #: :type: set[smqtk.representation.DescriptorElement]
        pos = set(pos)
        # Creating training matrix and labels
        train_labels = []
        train_vectors = []
        num_pos = 0
        for d in pos:
            train_labels.append(+1)
            train_vectors.append(d.vector().tolist())
            num_pos += 1
        self._log.debug("Positives given: %d", num_pos)

        # When no negative examples are given, naively pick the most distant
        # examples in our dataset, using the HI metric, for each positive
        # example.
        neg_autoselect = set()
        if not neg:
            self._log.info(
                "Auto-selecting negative examples. (%d per positive)",
                self._autoneg_select_ratio)
            # ``train_vectors`` contains only positive examples at this point
            for p in pos:
                # where d is the distance vector to descriptor elements in cache
                d = histogram_intersection_distance(p.vector(),
                                                    self._descr_matrix)
                # Scan vector for maximum-distance indices
                # - Allow a variable number of maximally distant descriptors
                #   to be picked per positive.
                # Track the most distant neighbors, keyed by distance value.
                m_set = {}
                # Track the smallest distance among the tracked neighbors.
                m_val = -float('inf')
                for i in xrange(d.size):
                    if d[i] > m_val:
                        m_set[d[i]] = i
                        if len(m_set) > self._autoneg_select_ratio:
                            if m_val in m_set:
                                del m_set[m_val]
                        m_val = min(m_set)
                for i in m_set.itervalues():
                    neg_autoselect.add(self._descr_cache[i])
            # Remove any positive examples from auto-selected results
            neg_autoselect.difference_update(pos)
            self._log.debug("Auto-selected negative descriptors [%d]: %s",
                            len(neg_autoselect), neg_autoselect)

        num_neg = 0
        for d in neg:
            train_labels.append(-1)
            train_vectors.append(d.vector().tolist())
            num_neg += 1
        for d in neg_autoselect:
            train_labels.append(-1)
            train_vectors.append(d.vector().tolist())
            num_neg += 1

        if not num_pos:
            raise ValueError("No positive examples provided.")
        elif not num_neg:
            raise ValueError("No negative examples provided.")

        # Training SVM model
        self._log.debug("online model training")
        svm_problem = svm.svm_problem(train_labels, train_vectors)
        svm_model = svmutil.svm_train(
            svm_problem, self._gen_svm_parameter_string(num_pos, num_neg))
        if svm_model.l == 0:
            raise RuntimeError("SVM Model learning failed")

        #
        # Platt Scaling for probability rankings
        #

        self._log.debug("making test distance matrix")
        # Number of support vectors
        # Q: is this always the same as ``svm_model.l``?
        num_SVs = sum(svm_model.nSV[:svm_model.nr_class])
        # Support vector dimensionality
        dim_SVs = len(train_vectors[0])
        # Initialize the matrix the support vectors are copied into
        svm_SVs = numpy.ndarray((num_SVs, dim_SVs), dtype=float)
        for i, nlist in enumerate(svm_model.SV[:svm_SVs.shape[0]]):
            svm_SVs[i, :] = [n.value for n in nlist[:len(train_vectors[0])]]
        # compute matrix of distances from support vectors to index elements
        # TODO: Optimize this step by caching SV distance vectors
        #       - It is known that SVs are vectors from the training data, so
        #           if the same descriptors are given to this function
        #           repeatedly (which is the case for IQR), this can be faster
        #           because we're only computing at most a few more distance
        #           vectors against our indexed descriptor matrix, and the rest
        #           have already been computed before.
        #       - At worst, we're effectively doing this call because each SV
        #           needs to have its distance vector computed.
        svm_test_k = compute_distance_matrix(svm_SVs,
                                             self._descr_matrix,
                                             histogram_intersection_distance,
                                             row_wise=True)

        self._log.debug("Platt scalling")
        # the actual platt scaling stuff
        weights = numpy.array(svm_model.get_sv_coef()).flatten()
        margins = numpy.dot(weights, svm_test_k)
        rho = svm_model.rho[0]
        probA = svm_model.probA[0]
        probB = svm_model.probB[0]
        #: :type: numpy.core.multiarray.ndarray
        probs = 1.0 / (1.0 + numpy.exp((margins - rho) * probA + probB))

        # Detect whether we need to flip probabilities
        # - The input positive examples should score highly among the
        #   generated probabilities of our index.
        # - If the positive example probabilities fall in the lower 50%,
        #   flip the generated probabilities: it is experimentally known
        #   that the SVM occasionally swaps which index it uses to represent
        #   a particular class label, which correspondingly inverts the
        #   Platt scaling output.
        pos_vectors = numpy.array(train_vectors[:num_pos])
        pos_test_k = compute_distance_matrix(svm_SVs,
                                             pos_vectors,
                                             histogram_intersection_distance,
                                             row_wise=True)
        pos_margins = numpy.dot(weights, pos_test_k)
        #: :type: numpy.core.multiarray.ndarray
        pos_probs = 1.0 / (1.0 +
                           numpy.exp((pos_margins - rho) * probA + probB))
        # Check if the average positive probability is less than the average
        # index probability. If so, the Platt scaling probably needs to be
        # flipped.
        if (pos_probs.sum() / pos_probs.size) < (probs.sum() / probs.size):
            self._log.debug("inverting probabilities")
            probs = 1. - probs

        rank_pool = dict(zip(self._descr_cache, probs))
        return rank_pool
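To make the scaling step concrete, a toy walk-through with hypothetical
numbers; ``weights``, ``rho``, ``probA``, and ``probB`` stand in for values
libsvm fits during training:

import numpy as np

# One coefficient per support vector, plus the Platt sigmoid parameters.
weights = np.array([0.8, -0.5])
rho, probA, probB = 0.05, -2.0, 0.1

# HI distances from each of the two support vectors to three indexed
# descriptors (one row per support vector, as with ``svm_test_k`` above).
svm_test_k = np.array([[0.1, 0.9, 0.4],
                       [0.7, 0.2, 0.6]])

# Decision value per indexed descriptor; the Platt sigmoid then squashes
# each into a [0, 1] relevance probability, exactly as in ``rank`` above.
margins = weights.dot(svm_test_k)
probs = 1.0 / (1.0 + np.exp((margins - rho) * probA + probB))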
Example 4
    def generate_model(self, descriptor_map, parallel=None, **kwargs):
        """
        Generate this indexer's data model using the given features, saving
        it to files in the configured data directory.

        :raises RuntimeError: Precaution error when there is an existing data
            model for this indexer. Manually delete or move the existing
            model before computing another one.

            Specific implementations may raise on other conditions; see the
            specific implementation for more details.

        :raises ValueError: The given descriptor map had no content.

        :param descriptor_map: Mapping of integer IDs to feature data. All
            feature data must be of the same size!
        :type descriptor_map: dict of (int, numpy.core.multiarray.ndarray)

        :param parallel: Optional specification of how many processors to use
            when pooling sub-tasks. If None, we attempt to use all available
            cores.
        :type parallel: int

        """
        if self.has_model():
            raise RuntimeError("WARNING: This implementation already has a "
                               "model generated. These can take a long time to "
                               "generate, thus we require external manual "
                               "removal of modal files before we will generate "
                               "a new model.")

        num_features = len(descriptor_map)
        ordered_uids = sorted(descriptor_map.keys())

        sample_feature = descriptor_map[ordered_uids[0]]
        feature_len = len(sample_feature)

        # Pre-allocating arrays
        #: :type: list[collections.Hashable]
        self._uid_array = []
        self._feature_mat = numpy.zeros(
            (num_features, feature_len), dtype=sample_feature.dtype
        )
        self._distance_mat = numpy.zeros(
            (num_features, num_features), dtype=sample_feature.dtype
        )

        with SimpleTimer("Populating feature matrix", self.log.info):
            for i, (uid, feat) in enumerate(descriptor_map.iteritems()):
                self._uid_array.append(uid)
                self._feature_mat[i] = feat

        with SimpleTimer("Computing HI matrix kernel", self.log.info):
            # Using [process] Pool here with large sets eats far too much RAM.
            # Using a ThreadPool here is actually much slower; not sure why.
            for i in range(num_features):
                for j in range(i, num_features):
                    self._distance_mat[i, j] = self._distance_mat[j, i] = \
                        histogram_intersection_distance(self._feature_mat[i],
                                                        self._feature_mat[j])

        with SimpleTimer("Saving data files", self.log.info):
            safe_create_dir(self.data_dir)
            with open(self.uid_list_filepath, 'wb') as ofile:
                cPickle.dump(self._uid_array, ofile)
            numpy.save(self.feature_mat_filepath, self._feature_mat)
            numpy.save(self.distance_mat_filepath, self._distance_mat)
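The nested loop above makes O(n^2) Python-level calls; a vectorized sketch
of the same kernel, assuming the ``1 - sum(min(...))`` distance definition,
trades them for a single broadcast at an O(n^2 * d) intermediate-memory
cost, so it only suits modest index sizes:

import numpy as np

def hi_distance_matrix(feats):
    # feats: (n, d) matrix, one histogram per row. Broadcasting builds an
    # (n, n, d) tensor of element-wise minima; summing over the last axis
    # gives pairwise intersection similarities, whose complement is the
    # symmetric distance matrix the loop above fills in.
    return 1.0 - np.minimum(feats[:, None, :], feats[None, :, :]).sum(axis=2)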
Example 5
    def rank(self, pos, neg):
        """
        Rank the currently indexed elements given ``pos`` positive and ``neg``
        negative exemplar descriptor elements.

        :param pos: Iterable of positive exemplar DescriptorElement instances.
            This may be optional for some implementations.
        :type pos: collections.Iterable[smqtk.representation.DescriptorElement]

        :param neg: Iterable of negative exemplar DescriptorElement instances.
            This may be optional for some implementations.
        :type neg: collections.Iterable[smqtk.representation.DescriptorElement]

        :return: Map of indexed descriptor elements to a rank value in the
            [0, 1] range (inclusive), where 1.0 means most relevant and 0.0
            means least relevant.
        :rtype: dict[smqtk.representation.DescriptorElement, float]

        """
        # Notes:
        # - Pos and neg exemplars may be in our index.

        #
        # SVM model training
        #
        # Copy pos descriptors into a set for repeated iteration
        #: :type: set[smqtk.representation.DescriptorElement]
        pos = set(pos)
        # Creating training matrix and labels
        train_labels = []
        train_vectors = []
        num_pos = 0
        for d in pos:
            train_labels.append(+1)
            train_vectors.append(d.vector().tolist())
            num_pos += 1

        # When no negative examples are given, naively pick the most distant
        # examples in our dataset, using the HI metric, for each positive
        # example.
        neg_autoselect = set()
        if not neg:
            self._log.info("Auto-selecting negative examples.")
            # ``train_vectors`` contains only positive examples at this point
            for p in pos:
                # where d is the distance vector to descriptor elements in cache
                d = histogram_intersection_distance(p.vector(),
                                                    self._descr_matrix)
                # Scan vector for maximum-distance indices
                # - Allow a variable number of maximally distant descriptors
                #   to be picked per positive.
                # Track the most distant neighbors, keyed by distance value.
                m_set = {}
                # Track the smallest distance among the tracked neighbors.
                m_val = -1
                for i in xrange(d.size):
                    if d[i] > m_val:
                        m_set[d[i]] = i
                        if len(m_set) > self._autoneg_select_ratio:
                            if m_val in m_set:
                                del m_set[m_val]
                        m_val = min(m_set)
                for i in m_set.itervalues():
                    neg_autoselect.add(self._descr_cache[i])
            # Remove any positive examples from auto-selected results
            neg_autoselect.difference_update(pos)
            self._log.debug("Auto-selected negative descriptors: %s",
                            neg_autoselect)

        num_neg = 0
        for d in neg:
            train_labels.append(-1)
            train_vectors.append(d.vector().tolist())
            num_neg += 1
        for d in neg_autoselect:
            train_labels.append(-1)
            train_vectors.append(d.vector().tolist())
            num_neg += 1

        if not num_pos:
            raise ValueError("No positive examples provided.")
        elif not num_neg:
            raise ValueError("No negative examples provided.")

        # Training SVM model
        svm_problem = svm.svm_problem(train_labels, train_vectors)
        svm_model = svmutil.svm_train(svm_problem,
                                      self._gen_svm_parameter_string(num_pos,
                                                                     num_neg))
        if svm_model.l == 0:
            raise RuntimeError("SVM Model learning failed")

        #
        # Platt Scaling for probability rankings
        #

        # Number of support vectors
        # Q: is this always the same as ``svm_model.l``?
        num_SVs = sum(svm_model.nSV[:svm_model.nr_class])
        # Support vector dimensionality
        dim_SVs = len(train_vectors[0])
        # Initialize the matrix the support vectors are copied into
        svm_SVs = numpy.ndarray((num_SVs, dim_SVs), dtype=float)
        for i, nlist in enumerate(svm_model.SV[:svm_SVs.shape[0]]):
            svm_SVs[i, :] = [n.value for n in nlist[:len(train_vectors[0])]]
        # compute matrix of distances from support vectors to index elements
        # TODO: Optimize this step by caching SV distance vectors
        #       - It is known that SVs are vectors from the training data, so
        #           if the same descriptors are given to this function
        #           repeatedly (which is the case for IQR), this can be faster
        #           because we're only computing at most a few more distance
        #           vectors against our indexed descriptor matrix, and the rest
        #           have already been computed before.
        #       - At worst, we're effectively doing this call because each SV
        #           needs to have its distance vector computed.
        svm_test_k = compute_distance_matrix(svm_SVs, self._descr_matrix,
                                             histogram_intersection_distance,
                                             row_wise=True)

        # The actual Platt scaling computation.
        weights = numpy.array(svm_model.get_sv_coef()).flatten()
        margins = numpy.dot(weights, svm_test_k)
        rho = svm_model.rho[0]
        probA = svm_model.probA[0]
        probB = svm_model.probB[0]
        #: :type: numpy.core.multiarray.ndarray
        probs = 1.0 / (1.0 + numpy.exp((margins - rho) * probA + probB))

        # Detect whether we need to flip probabilities
        # - The input positive examples should score highly among the
        #   generated probabilities of our index.
        # - If the positive example probabilities fall in the lower 50%,
        #   flip the generated probabilities: it is experimentally known
        #   that the SVM occasionally swaps which index it uses to represent
        #   a particular class label, which correspondingly inverts the
        #   Platt scaling output.
        pos_vectors = numpy.array(train_vectors[:num_pos])
        pos_test_k = compute_distance_matrix(svm_SVs, pos_vectors,
                                             histogram_intersection_distance,
                                             row_wise=True)
        pos_margins = numpy.dot(weights, pos_test_k)
        #: :type: numpy.core.multiarray.ndarray
        pos_probs = 1.0 / (1.0 + numpy.exp((pos_margins - rho) * probA + probB))
        # Check if the average positive probability is less than the average
        # index probability. If so, the Platt scaling probably needs to be
        # flipped.
        if (pos_probs.sum() / pos_probs.size) < (probs.sum() / probs.size):
            self._log.debug("inverting probabilities")
            probs = 1. - probs

        rank_pool = dict(zip(self._descr_cache, probs))
        return rank_pool
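A usage sketch of ``rank`` in an IQR-style feedback loop; ``index``,
``pos_elems``, and ``neg_elems`` are hypothetical stand-ins for an instance
of this class and its positive/negative DescriptorElement exemplars:

# Rank all indexed descriptors against user feedback, then keep the ten
# most relevant for the next refinement round.
ranks = index.rank(pos=pos_elems, neg=neg_elems)
top_ten = sorted(ranks, key=ranks.get, reverse=True)[:10]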