Example no. 1
        def iter_tocompute_arrays():
            """ Yield descriptor vectors for classification elements that need
            computing yet.

            :rtype: typing.Generator[numpy.ndarray, None, None]
            """
            # Force into an iterator.
            descr_iterator = iter(descr_iter)
            # Running var for the index of final data element in input
            # iterator. This will be -1 or the value of the final index in the
            # parallel lists.
            last_i = -1
            # Take successive islices from the iterator of descriptor
            # elements to produce batches. We end when the iterator yields
            # nothing more.
            de_batch_list = \
                list(itertools.islice(descr_iterator, d_elem_batch))
            # Fully qualified path to this classifier implementation type. This
            # should be unique among concrete classifier implementations.
            self_name = f"{self.__module__}.{self.__class__.__name__}"
            while de_batch_list:
                # Get vectors from batch using implementation-level batch
                # aggregation methods where applicable.
                de_batch_vecs = \
                    DescriptorElement.get_many_vectors(de_batch_list)

                for d_elem, d_vec in zip(de_batch_list, de_batch_vecs):
                    d_uid = d_elem.uuid()
                    if d_vec is None:
                        raise ValueError(
                            "Encountered DescriptorElement with "
                            "no vector stored! (UID=`{}`)".format(d_uid))
                    c_elem_ = factory.new_classification(self_name, d_uid)
                    already_computed = \
                        not overwrite and c_elem_.has_classifications()
                    elem_and_status_q.append((c_elem_, already_computed))
                    if not already_computed:
                        # Classifications should be computed for this
                        # descriptor
                        log_debug(
                            "Yielding descriptor array with UID `{}` "
                            "for classification generation.".format(d_uid))
                        yield d_vec
                    else:
                        log_debug("Classification already generated for UID "
                                  "`{}`.".format(d_uid))

                last_i += len(de_batch_vecs)

                # Slice out the next batch of descriptor elements. This will be
                # empty if the iterator has been exhausted.
                de_batch_list = list(
                    itertools.islice(descr_iterator, d_elem_batch))

            end_of_iter[0] = last_i
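
The batching idiom above is independent of SMQTK; here is a minimal standalone sketch of consuming an iterator in fixed-size slices with ``itertools.islice`` (function name and batch size are hypothetical):

import itertools

def iter_batches(iterable, batch_size):
    # Force into an iterator so successive islice calls advance through it.
    it = iter(iterable)
    batch = list(itertools.islice(it, batch_size))
    while batch:
        yield batch
        batch = list(itertools.islice(it, batch_size))

# Batches of 3 over ten integers: [0,1,2], [3,4,5], [6,7,8], [9]
print(list(iter_batches(range(10), 3)))
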
Example no. 2
    def get_many_vectors(self, uuids: Iterable[Hashable]) -> List[Optional[np.ndarray]]:
        """
        Get underlying vectors of descriptors associated with given uuids.

        :param uuids: Iterable of descriptor UUIDs to query for.

        :raises KeyError: When there is no descriptor in this set for one
            or more of the input UUIDs.

        :return: List of vectors for descriptors associated with the given
            UUID values.

        """
        return DescriptorElement.get_many_vectors(
            self.get_many_descriptors(uuids)
        )
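
A minimal sketch of the delegation pattern above, using stub classes in place of the SMQTK types (all names here are hypothetical stand-ins, not the real API):

from typing import Dict, Hashable, Iterable, List, Optional

class StubElement:
    """Stand-in for DescriptorElement holding an optional vector."""
    def __init__(self, vec: Optional[list]) -> None:
        self._vec = vec

    @staticmethod
    def get_many_vectors(elems: Iterable["StubElement"]) -> List[Optional[list]]:
        return [e._vec for e in elems]

class StubSet:
    """Stand-in for a descriptor set; raises KeyError on unknown UUIDs."""
    def __init__(self, elems: Dict[Hashable, StubElement]) -> None:
        self._elems = elems

    def get_many_descriptors(self, uuids: Iterable[Hashable]):
        return [self._elems[u] for u in uuids]  # KeyError on a miss

    def get_many_vectors(self, uuids: Iterable[Hashable]):
        # The set-level call delegates to the element-level batch method.
        return StubElement.get_many_vectors(self.get_many_descriptors(uuids))

s = StubSet({"a": StubElement([1, 2]), "b": StubElement(None)})
print(s.get_many_vectors(["a", "b"]))  # [[1, 2], None]
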
Example no. 3
    def _train(self, class_examples, **extra_params):
        # Convert descriptor elements into a combined ndarray with an
        # associated label vector.
        vec_list = []
        label_list = []
        for label, examples in class_examples.items():
            label_vectors = \
                DescriptorElement.get_many_vectors(examples)
            # ``list.count`` uses ``==``, which misbehaves when elements are
            # numpy arrays, so count ``None`` entries manually.
            none_count = len([e for e in label_vectors if e is None])
            assert none_count == 0, \
                "Some descriptor elements for label {} did not contain " \
                "vectors! (n={})".format(label, none_count)
            vec_list.extend(label_vectors)
            label_list.extend([label] * len(label_vectors))
        vec_list = np.vstack(vec_list)
        self.fit(vec_list, label_list)
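
A minimal sketch of the stack-and-label step in isolation, using plain numpy vectors in place of fetched descriptor vectors (the class names and data are hypothetical):

import numpy as np

class_examples = {
    "cat": [np.random.rand(4) for _ in range(3)],
    "dog": [np.random.rand(4) for _ in range(2)],
}
vec_list, label_list = [], []
for label, vectors in class_examples.items():
    vec_list.extend(vectors)
    label_list.extend([label] * len(vectors))
X = np.vstack(vec_list)
print(X.shape, label_list)  # (5, 4) ['cat', 'cat', 'cat', 'dog', 'dog']
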
Example no. 4
    def build_index(self, descriptors: Iterable[DescriptorElement]) -> None:
        # Cache given descriptor element vectors into a matrix for use during
        # ``rank``.
        descr_elem_list = list(descriptors)
        if len(descr_elem_list) == 0:
            raise ValueError("No descriptor elements passed.")
        # NOTE: This fails if multiple descriptor elements with the same UID
        #       are included. There will be ``None`` values present.
        descr_matrix = np.asarray(
            DescriptorElement.get_many_vectors(descr_elem_list))
        # If the result matrix is of dtype(object), then either some elements
        # did not have vectors or some vectors were not of congruent
        # dimensionality.
        if descr_matrix.dtype == np.dtype(object):
            raise ValueError("One or more descriptor elements did not have a "
                             "vector set or were not of congruent "
                             "dimensionality.")
        self._descr_elem_list = descr_elem_list
        self._descr_matrix = descr_matrix
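
A minimal sketch of the failure mode the dtype(object) check guards against. Note this is version-dependent: with older NumPy, ragged input silently produced an object array, while NumPy >= 1.24 raises a ValueError instead.

import numpy as np

good = np.asarray([np.ones(3), np.zeros(3)])
assert good.dtype == np.float64 and good.shape == (2, 3)
try:
    bad = np.asarray([np.ones(3), np.ones(2)])   # ragged lengths
    assert bad.dtype == np.dtype(object)         # older NumPy behavior
except ValueError:
    pass                                         # NumPy >= 1.24 behavior
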
Example no. 5
    def _descriptors_to_matrix(
        self, descriptors: List[DescriptorElement]
    ) -> Tuple[np.ndarray, Sequence[Hashable]]:
        """
        Extract an (n,d) array with the descriptor vectors in each row,
        and a corresponding list of uuids from the list of descriptors.

        :param descriptors: List of descriptor elements to add to this
            index.

        :return: An (n,d) array of descriptors (d-dim descriptors in n
            rows), and the corresponding list of descriptor uuids.
        """
        new_uuids = [desc.uuid() for desc in descriptors]
        data = np.vstack(
            DescriptorElement.get_many_vectors(descriptors)).astype(np.float32)
        LOG.info(f"data shape, type: {data.shape}, {data.dtype}")
        LOG.info(f"# uuids: {len(new_uuids)}")
        return data, new_uuids
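
The conversion itself is plain numpy; a small sketch with hypothetical vectors (float32 is chosen because downstream FAISS indices expect it):

import numpy as np

vectors = [np.random.rand(8) for _ in range(5)]  # five 8-dim descriptors
data = np.vstack(vectors).astype(np.float32)
assert data.shape == (5, 8) and data.dtype == np.float32
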
Example no. 6
    def build_index(self, descriptors: Iterable[DescriptorElement]) -> None:
        """
        Build the index based on the given iterable of descriptor elements.

        Subsequent calls to this method should rebuild the index, not add to
        it.

        :raises ValueError: No data available in the given iterable.

        :param descriptors:
            Iterable of descriptor elements to build index over.
        :type descriptors:
            collections.abc.Iterable[smqtk.representation.DescriptorElement]

        """
        # ordered cache of descriptors in our index.
        self._descr_cache = []
        # Reverse mapping of a descriptor's vector to its index in the cache
        # and subsequently in the distance kernel.
        self._descr2index = {}

        descriptors = list(descriptors)

        # matrix for creating distance kernel
        self._descr_matrix = numpy.array(
            DescriptorElement.get_many_vectors(descriptors))
        vector_iter = zip(descriptors, self._descr_matrix)

        for i, (d, v) in enumerate(vector_iter):
            self._descr_cache.append(d)
            self._descr2index[tuple(v)] = i

        # TODO: (?) For when we optimize SVM SV kernel computation
        # self._dist_kernel = \
        #    compute_distance_kernel(self._descr_matrix,
        #                            histogram_intersection_distance2,
        #                            row_wise=True)

        if self.descr_cache_fp:
            with open(self.descr_cache_fp, 'wb') as f:
                pickle.dump(self._descr_cache, f, -1)
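
Why the reverse mapping is keyed on ``tuple(v)``: numpy arrays are unhashable, so each row is converted to a tuple before being used as a dict key. A minimal sketch with hypothetical data:

import numpy as np

matrix = np.array([[0.1, 0.2], [0.3, 0.4]])
descr2index = {tuple(v): i for i, v in enumerate(matrix)}
assert descr2index[tuple(np.array([0.3, 0.4]))] == 1
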
Example no. 7
    def _nn(self,
            d: DescriptorElement,
            n: int = 1
            ) -> Tuple[Tuple[DescriptorElement, ...], Tuple[float, ...]]:
        """
        Internal method to be implemented by sub-classes to return the nearest
        `N` neighbors to the given descriptor element.

        When this internal method is called, we have already checked that there
        is a vector in ``d`` and our index is not empty.

        :param d: Descriptor element to compute the neighbors of.
        :param n: Number of nearest neighbors to find.

        :return: Tuple of nearest N DescriptorElement instances, and a tuple of
            the distance values to those neighbors.

        """
        # Parent template method already assures there is a vector stored in
        # the input.
        d_vector = d.vector()
        assert d_vector is not None
        # Reshape into a 1xD vector with float32 type, which is required for
        # use with FAISS search.
        q = d_vector[np.newaxis, :].astype(np.float32)
        LOG.debug("Received query for %d nearest neighbors", n)

        with self._model_lock:
            if self._faiss_index is None:
                raise RuntimeError("No index currently available to remove "
                                   "from.")

            # Attempt to set n-probe of an IVF index
            self._set_index_nprobe()

            s_dists: np.ndarray
            s_ids: np.ndarray
            s_dists, s_ids = self._faiss_index.search(
                q, k=min(n, self._faiss_index.ntotal))
            s_dists, s_ids = np.sqrt(s_dists[0, :]), s_ids[0, :]
            # Convert numpy.int64 type values into python integer values.
            # This is for compatibility when comparing values in some KVS
            # impls (postgres...).
            s_ids = s_ids.astype(object)
            # Values in s_ids (the FAISS index indices) can be -1 when fewer
            # than the requested number of nearest neighbors are returned.
            # In that case, eliminate the -1 entries.
            LOG.debug("Getting descriptor UIDs from idx2uid mapping.")
            uuids = list(
                self._idx2uid_kvs.get_many(
                    cast(Iterator[Hashable],
                         filter(lambda s_id_: s_id_ >= 0, s_ids))))
            if len(uuids) < n:
                warnings.warn(
                    f"Less than n={n} neighbors were retrieved from "
                    "the FAISS index instance. Maybe increase "
                    "nprobe if this is an IVF index?", RuntimeWarning)

            descriptors = tuple(
                self._descriptor_set.get_many_descriptors(uuids))

        LOG.debug("Min and max FAISS distances: %g, %g", min(s_dists),
                  max(s_dists))

        d_vectors = np.vstack(DescriptorElement.get_many_vectors(descriptors))
        d_dists = metrics.euclidean_distance(d_vectors, q)

        LOG.debug("Min and max descriptor distances: %g, %g", min(d_dists),
                  max(d_dists))

        # Order results by ascending distance, keeping the descriptors and
        # distances in parallel. (The original returned the unsorted
        # descriptor tuple alongside sorted distances, mismatching the two.)
        order = d_dists.argsort()
        descriptors, d_dists = zip(
            *((descriptors[oidx], d_dists[oidx]) for oidx in order))

        LOG.debug("Returning query result of size %d", len(descriptors))

        return descriptors, tuple(d_dists)
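
The -1 sentinel handling can be shown in isolation; a sketch with a hypothetical search result (FAISS pads the id array with -1 when fewer than k neighbors exist):

import numpy as np

s_ids = np.array([4, 1, -1, -1])  # hypothetical FAISS result for k=4
valid_ids = [int(i) for i in s_ids if i >= 0]
assert valid_ids == [4, 1]  # only real hits are kept for UID lookup
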
Example no. 8
    def _train(self, class_examples, **extra_params):
        # Offset from 0 for positive class labels to use
        # - not using label of 0 because we think libSVM wants positive labels
        CLASS_LABEL_OFFSET = 1

        # Stuff for debug reporting
        param_debug = {'-q': ''}
        if LOG.getEffectiveLevel() <= logging.DEBUG:
            param_debug = {}

        # Form libSVM problem input values
        LOG.debug("Formatting problem input")
        train_labels = []
        train_vectors = []
        train_group_sizes = []  # number of examples per class
        self.svm_label_map = {}
        # Making SVM label assignment deterministic to alphabetic order
        for i, l in enumerate(sorted(class_examples), CLASS_LABEL_OFFSET):
            # Map integer SVM label to semantic label
            self.svm_label_map[i] = l

            LOG.debug('-- class %d (%s)', i, l)
            # requires a sequence, so making the iterable ``g`` a tuple
            g = class_examples[l]
            if not isinstance(g, collections.abc.Sequence):
                LOG.debug('   (expanding iterable into sequence)')
                g = tuple(g)

            train_group_sizes.append(float(len(g)))
            x = numpy.array(DescriptorElement.get_many_vectors(g))
            x = self._norm_vector(x)
            train_labels.extend([i] * x.shape[0])
            train_vectors.extend(x.tolist())
            del g, x

        assert len(train_labels) == len(train_vectors), \
            "Count mismatch between parallel labels and descriptor vectors " \
            "being sent to libSVM (%d != %d)" \
            % (len(train_labels), len(train_vectors))

        LOG.debug("Forming train params")
        #: :type: dict
        params = deepcopy(self.train_params)
        params.update(param_debug)
        # Calculating class weights if set to C-SVC type SVM
        if '-s' not in params or int(params['-s']) == 0:
            # (john.moeller): The weighting should probably be the geometric
            # mean of the number of examples over the classes divided by the
            # number of examples for the current class.
            gmean = scipy.stats.gmean(train_group_sizes)
            for i, n in enumerate(train_group_sizes, CLASS_LABEL_OFFSET):
                w = gmean / n
                params['-w' + str(i)] = w
                LOG.debug("-- class '%s' weight: %s", self.svm_label_map[i], w)

        LOG.debug("Making parameters obj")
        svm_params = svmutil.svm_parameter(self._gen_param_string(params))
        LOG.debug("Creating SVM problem")
        svm_problem = svm.svm_problem(train_labels, train_vectors)
        del train_vectors
        LOG.debug("Training SVM model")
        self.svm_model = svmutil.svm_train(svm_problem, svm_params)
        LOG.debug("Training SVM model -- Done")

        if self.svm_label_map_elem and self.svm_label_map_elem.writable():
            LOG.debug("saving labels to element (%s)", self.svm_label_map_elem)
            self.svm_label_map_elem.set_bytes(
                pickle.dumps(self.svm_label_map, -1))
        if self.svm_model_elem and self.svm_model_elem.writable():
            LOG.debug("saving model to element (%s)", self.svm_model_elem)
            # LibSvm I/O only works with filepaths, thus the need for an
            # intermediate temporary file.
            fd, fp = tempfile.mkstemp()
            try:
                svmutil.svm_save_model(fp, self.svm_model)
                # Use the file descriptor to create the file object.
                # This avoids reopening the file and will automatically
                # close the file descriptor on exiting the with block.
                # fdopen() is required because in Python 2 open() does
                # not accept a file descriptor.
                with os.fdopen(fd, 'rb') as f:
                    self.svm_model_elem.set_bytes(f.read())
            finally:
                os.remove(fp)
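
A minimal sketch of the class-weight computation above, with hypothetical class sizes (each weight is the geometric mean of all class sizes divided by that class's size, so minority classes get proportionally larger weights):

import scipy.stats

train_group_sizes = [100.0, 25.0, 4.0]        # hypothetical class sizes
gmean = scipy.stats.gmean(train_group_sizes)  # (100*25*4) ** (1/3) ~= 21.54
weights = {f"-w{i}": gmean / n
           for i, n in enumerate(train_group_sizes, 1)}
print(weights)
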