Example #1
    def _descriptors_to_matrix(self, descriptors):
        """
        Extract an (n,d) array with the descriptor vectors in each row,
        and a corresponding list of uuids from the list of descriptors.

        :param descriptors: List of descriptor elements to add to this
            index.
        :type descriptors: list[smqtk.representation.DescriptorElement]

        :return: An (n,d) array of descriptors (d-dim descriptors in n
            rows), and the corresponding list of descriptor uuids.
        :rtype: (np.ndarray, list[collections.Hashable])
        """
        new_uuids = [desc.uuid() for desc in descriptors]
        sample_v = descriptors[0].vector()
        n, d = len(new_uuids), sample_v.size
        data = np.empty((n, d), dtype=np.float32)
        elements_to_matrix(
            descriptors, mat=data,
            use_multiprocessing=self.use_multiprocessing,
            report_interval=1.0,
        )
        self._log.info("data shape, type: %s, %s",
                       data.shape, data.dtype)
        self._log.info("# uuids: %d", n)
        return data, new_uuids
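
For reference, a minimal single-process stand-in for what `elements_to_matrix` is doing here — copy each element's vector into one row of a preallocated float32 matrix. The real SMQTK helper also handles multiprocessing and progress reporting, which this sketch omits; `descriptors_to_matrix_sketch` is a hypothetical name.

    import numpy as np

    def descriptors_to_matrix_sketch(descriptors):
        # Stack each element's vector into one row of a float32 matrix.
        vectors = [d.vector() for d in descriptors]
        mat = np.empty((len(vectors), vectors[0].size), dtype=np.float32)
        for i, v in enumerate(vectors):
            mat[i, :] = v
        return mat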
Example #2
    def _build_faiss_model(self):
        sample = next(self._descriptor_set.iterdescriptors())
        sample_v = sample.vector()
        n, d = self.count(), sample_v.size

        data = np.empty((n, d), dtype=np.float32)
        elements_to_matrix(
            self._descriptor_set,
            mat=data,
            use_multiprocessing=self.use_multiprocessing,
            report_interval=1.0,
        )
        self._uuids = np.array(list(self._descriptor_set.keys()))
        self.faiss_flat = faiss.IndexFlatL2(d)

        if self.exhaustive:
            self._faiss_index = faiss.IndexIDMap(self.faiss_flat)
        else:
            nlist = 10000
            self._faiss_index = faiss.IndexIVFFlat(self.faiss_flat, d, nlist,
                                                   faiss.METRIC_L2)
            self._faiss_index.train(data)
            self._faiss_index.nprobe = 5000

        self._log.info("data shape, type: %s, %s", data.shape, data.dtype)
        self._log.info("uuid shape, type: %s, %s", self._uuids.shape,
                       self._uuids.dtype)
        self._faiss_index.add_with_ids(data, self._uuids)

        self._log.info("FAISS index has been constructed with %d vectors",
                       self._faiss_index.ntotal)
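
The FAISS calls used above can be tried standalone on random data; the sizes and the nlist/nprobe values below are illustrative only and much smaller than the ones hard-coded in the method.

    import numpy as np
    import faiss

    d = 128
    data = np.random.random((1000, d)).astype(np.float32)
    ids = np.arange(1000, dtype=np.int64)

    quantizer = faiss.IndexFlatL2(d)                  # coarse quantizer
    index = faiss.IndexIVFFlat(quantizer, d, 16, faiss.METRIC_L2)
    index.train(data)                                 # IVF indices must be trained before adding
    index.nprobe = 4                                  # inverted lists probed per query
    index.add_with_ids(data, ids)

    dists, found_ids = index.search(data[:1], 5)      # 5 nearest neighbors of the first vector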
Example #3
    def nn(self, d, n=1):
        super(FaissNearestNeighborsIndex, self).nn(d, n)

        q = d.vector().reshape(1, -1).astype(np.float32)

        self._log.debug("Received query for %d nearest neighbors", n)

        dists, ids = self._faiss_index.search(q, n)
        # FAISS reports squared L2 distances; take the square root to get
        # true L2 distances.
        dists, ids = np.sqrt(dists).squeeze(), ids.squeeze()
        # Vectors were added to the index with their UUIDs as ids, so the
        # returned ids are the descriptor UUIDs.
        uuids = ids
        self._log.debug("Min and max FAISS distances: %g, %g", min(dists),
                        max(dists))

        descriptors = tuple(self._descriptor_set.get_many_descriptors(uuids))
        d_vectors = elements_to_matrix(descriptors)
        # Re-compute exact L2 distances from the stored descriptor vectors.
        d_dists = np.sqrt(((d_vectors - q) ** 2).sum(axis=1))
        self._log.debug("Min and max descriptor distances: %g, %g",
                        min(d_dists), max(d_dists))

        # Order results by FAISS rank, reporting the exact re-computed
        # distances.
        order = dists.argsort()
        uuids, dists = list(
            zip(*((uuids[oidx], d_dists[oidx]) for oidx in order)))

        self._log.debug("Returning query result of size %d", len(uuids))

        return descriptors, tuple(dists)
Example #4
    def nn(self, d, n=1):
        """
        Return the nearest `N` neighbors to the given descriptor element.

        :param d: Descriptor element to compute the neighbors of.
        :type d: smqtk.representation.DescriptorElement

        :param n: Number of nearest neighbors to find.
        :type n: int

        :return: Tuple of nearest N DescriptorElement instances, and a tuple of
            the distance values to those neighbors.
        :rtype: (tuple[smqtk.representation.DescriptorElement], tuple[float])

        """
        d_vec, _, d_sc = self.get_small_code(d)

        # Extract the `n` nearest codes to the code of the query descriptor.
        # - A code may be associated with multiple descriptors, but it's a safe
        #   assumption that the top `n` codes (each of which exists because at
        #   least one element is associated with it) will yield at least `n`
        #   candidate descriptors.
        self._log.debug("fetching nearest %d codes", n)
        code_set = self._code_index.codes()
        # TODO: Optimize this step
        #: :type: list[int]
        near_codes = \
            heapq.nsmallest(n, code_set,
                            lambda e:
                                distance_functions.hamming_distance(d_sc, e)
                            )

        # Collect descriptors from successively farther-away bins until we have
        # >= `n` descriptors, which we will more finely sort afterwards.
        #: :type: list[smqtk.representation.DescriptorElement]
        self._log.debug("Collecting descriptors from near codes")
        neighbors = []
        termination_count = min(n, self.count())
        for nc in near_codes:
            neighbors.extend(self._code_index.get_descriptors(nc))
            # Break out if we've collected >= `n` descriptors, as descriptors
            # from more distant codes are unlikely to be any closer.
            if len(neighbors) >= termination_count:
                break

        # Compute fine-grain distance measurements for collected elements + sort
        self._log.debug("elements to numpy")
        neighbor_vectors = elements_to_matrix(neighbors,
                                              use_multiprocessing=False,
                                              report_interval=1)
        self._log.debug("Sorting descriptors: %d", len(neighbors))
        def comp_neighbor_dist(neighbor_vec):
            return self._dist_func(d_vec, neighbor_vec)
        distances = map(comp_neighbor_dist, neighbor_vectors)

        # Sort by distance, return top n
        self._log.debug("Forming output")
        ordered = sorted(zip(neighbors, distances), key=lambda p: p[1])
        neighbors, distances = zip(*(ordered[:n]))
        return neighbors, distances
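
`distance_functions.hamming_distance` above is the SMQTK helper; for integer small-codes it is equivalent to the following minimal sketch, which also shows the `heapq.nsmallest` ranking pattern in isolation.

    import heapq

    def hamming_distance(a, b):
        # XOR leaves a 1 bit wherever the two codes differ; count those bits.
        return bin(a ^ b).count("1")

    query_code = 0b1010
    codes = [0b1010, 0b1000, 0b0111, 0b0101]
    nearest = heapq.nsmallest(2, codes, key=lambda c: hamming_distance(query_code, c))
    # nearest == [0b1010, 0b1000]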
Example #5
    def nn(self, d, n=1):
        """
        Return the nearest `N` neighbors to the given descriptor element.

        :param d: Descriptor element to compute the neighbors of.
        :type d: smqtk.representation.DescriptorElement

        :param n: Number of nearest neighbors to find.
        :type n: int

        :return: Tuple of nearest N DescriptorElement instances, and a tuple
            of the distance values to those neighbors.
        :rtype: (tuple[smqtk.representation.DescriptorElement], tuple[float])

        """
        super(LSHNearestNeighborIndex, self).nn(d, n)

        self._log.debug("generating hash for descriptor")
        d_v = d.vector()
        d_h = self.lsh_functor.get_hash(d_v)

        def comp_descr_dist(d2_v):
            return self._distance_function(d_v, d2_v)

        self._log.debug("getting near hashes")
        hi = self.hash_index
        # Make on-the-fly linear index if we weren't originally set with one
        if hi is None:
            hi = LinearHashIndex()
            # not calling ``build_index`` because we already have the int
            # hashes.
            with self._hash2uuid_lock:
                hi.index = numpy.array(list(self._hash2uuid.keys()))
        hashes, hash_dists = hi.nn(d_h, n)

        self._log.debug("getting UUIDs of descriptors for nearby hashes")
        neighbor_uuids = []
        with self._hash2uuid_lock:
            for h_int in map(bit_vector_to_int_large, hashes):
                # If descriptor hash not in our map, we effectively skip it
                neighbor_uuids.extend(self._hash2uuid.get(h_int, ()))
        self._log.debug("-- matched %d UUIDs", len(neighbor_uuids))

        self._log.debug("getting descriptors for neighbor_uuids")
        neighbors = \
            list(self.descriptor_index.get_many_descriptors(neighbor_uuids))

        self._log.debug("ordering descriptors via distance method '%s'",
                        self.distance_method)
        self._log.debug('-- getting element vectors')
        neighbor_vectors = elements_to_matrix(neighbors,
                                              report_interval=1.0)
        self._log.debug('-- calculating distances')
        distances = map(comp_descr_dist, neighbor_vectors)
        self._log.debug('-- ordering')
        ordered = sorted(zip(neighbors, distances),
                         key=lambda p: p[1])
        self._log.debug('-- slicing top n=%d', n)
        return zip(*(ordered[:n]))
Example #6
    def nn(self, d, n=1):
        """
        Return the nearest `N` neighbors to the given descriptor element.

        :param d: Descriptor element to compute the neighbors of.
        :type d: smqtk.representation.DescriptorElement

        :param n: Number of nearest neighbors to find.
        :type n: int

        :return: Tuple of nearest N DescriptorElement instances, and a tuple
            of the distance values to those neighbors.
        :rtype: (tuple[smqtk.representation.DescriptorElement], tuple[float])

        """
        super(LSHNearestNeighborIndex, self).nn(d, n)

        self._log.debug("generating hash for descriptor")
        d_v = d.vector()
        d_h = self.lsh_functor.get_hash(d_v)

        def comp_descr_dist(d2_v):
            return self._distance_function(d_v, d2_v)

        self._log.debug("getting near hashes")
        hi = self.hash_index
        if hi is None:
            # Make on-the-fly linear index
            hi = LinearHashIndex()
            # not calling ``build_index`` because we already have the int
            # hashes.
            hi.index = numpy.array(list(self.hash2uuids_kvstore.keys()))
        near_hashes, _ = hi.nn(d_h, n)

        self._log.debug("getting UUIDs of descriptors for nearby hashes")
        neighbor_uuids = []
        for h_int in map(bit_vector_to_int_large, near_hashes):
            # If descriptor hash not in our map, we effectively skip it
            #: :type: collections.Iterable
            near_uuids = self.hash2uuids_kvstore.get(h_int, ())
            neighbor_uuids.extend(near_uuids)
        self._log.debug("-- matched %d UUIDs", len(neighbor_uuids))

        self._log.debug("getting descriptors for neighbor_uuids")
        neighbors = \
            list(self.descriptor_index.get_many_descriptors(neighbor_uuids))

        self._log.debug("ordering descriptors via distance method '%s'",
                        self.distance_method)
        self._log.debug('-- getting element vectors')
        neighbor_vectors = elements_to_matrix(neighbors, report_interval=1.0)
        self._log.debug('-- calculating distances')
        distances = map(comp_descr_dist, neighbor_vectors)
        self._log.debug('-- ordering')
        ordered = sorted(zip(neighbors, distances), key=lambda p: p[1])
        self._log.debug('-- slicing top n=%d', n)
        return zip(*(ordered[:n]))
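
`bit_vector_to_int_large` above converts a boolean hash vector into the integer key used by the key-value store. A simplified stand-in, assuming big-endian bit packing (the SMQTK helper additionally supports arbitrarily long vectors):

    import numpy as np

    def bit_vector_to_int(v):
        # Interpret a boolean / 0-1 vector as a big-endian binary number.
        value = 0
        for bit in v:
            value = (value << 1) | int(bit)
        return value

    bit_vector_to_int(np.array([True, False, True, True]))  # -> 11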
Example #7
    def fit(self, descriptors, use_multiprocessing=True):
        """
        Fit the ITQ model given the input set of descriptors

        :param descriptors: Iterable of ``DescriptorElement`` vectors to fit
            the model to.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        :param use_multiprocessing: If multiprocessing should be used, as
            opposed to threading, for collecting descriptor vectors from the
            provided iterable.
        :type use_multiprocessing: bool

        :raises RuntimeError: There is already a model loaded

        :return: Matrix hash codes for provided descriptors in order.
        :rtype: numpy.ndarray[bool]

        """
        if self.has_model():
            raise RuntimeError("Model components have already been loaded.")

        dbg_report_interval = None
        if self.get_logger().getEffectiveLevel() <= logging.DEBUG:
            dbg_report_interval = 1.0  # seconds
        if not hasattr(descriptors, "__len__"):
            self._log.info("Creating sequence from iterable")
            descriptors_l = []
            pr = ProgressReporter(self._log.debug, dbg_report_interval).start()
            for d in descriptors:
                descriptors_l.append(d)
                dbg_report_interval and pr.increment_report()
            dbg_report_interval and pr.report()
            descriptors = descriptors_l
        self._log.info("Creating matrix of descriptors for fitting")
        x = elements_to_matrix(descriptors,
                               report_interval=dbg_report_interval,
                               use_multiprocessing=use_multiprocessing)
        self._log.debug("descriptor matrix shape: %s", x.shape)
        n, dim = x.shape

        self._log.debug("Generating random projections")
        np.random.seed(self.random_seed)
        self.rps = np.random.randn(dim, self.bit_length)

        self._log.debug("Info normalizing descriptors with norm type: %s",
                        self.normalize)
        return self.get_hash(x)
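
The random projections generated above are presumably consumed by `get_hash` along these lines — project the descriptor onto `rps` and threshold at zero — though that is an assumption about the functor, not the verbatim SMQTK implementation:

    import numpy as np

    rng = np.random.RandomState(0)
    rps = rng.randn(4096, 256)                   # (dim, bit_length) projection matrix
    descriptor = rng.rand(4096).astype(np.float32)
    hash_code = np.dot(descriptor, rps) >= 0.0   # boolean hash code of length 256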
Example #8
    def fit(self, descriptors, use_multiprocessing=True):
        """
        Fit the ITQ model given the input set of descriptors

        :param descriptors: Iterable of ``DescriptorElement`` vectors to fit
            the model to.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        :param use_multiprocessing: If multiprocessing should be used, as
            opposed to threading, for collecting descriptor vectors from the
            provided iterable.
        :type use_multiprocessing: bool

        :raises RuntimeError: There is already a model loaded

        :return: Matrix hash codes for provided descriptors in order.
        :rtype: numpy.ndarray[bool]

        """
        if self.has_model():
            raise RuntimeError("Model components have already been loaded.")

        dbg_report_interval = None
        if self.get_logger().getEffectiveLevel() <= logging.DEBUG:
            dbg_report_interval = 1.0  # seconds
        if not hasattr(descriptors, "__len__"):
            self._log.info("Creating sequence from iterable")
            descriptors_l = []
            rs = [0]*7
            for d in descriptors:
                descriptors_l.append(d)
                report_progress(self._log.debug, rs, dbg_report_interval)
            descriptors = descriptors_l
        self._log.info("Creating matrix of descriptors for fitting")
        x = elements_to_matrix(
            descriptors, report_interval=dbg_report_interval,
            use_multiprocessing=use_multiprocessing)
        self._log.debug("descriptor matrix shape: %s", x.shape)
        n, dim = x.shape

        self._log.debug("Generating random projections")
        np.random.seed(self.random_seed)
        self.rps = np.random.randn(dim, self.bit_length)

        self._log.debug("Info normalizing descriptors with norm type: %s",
                        self.normalize)
        return self.get_hash(x)
Example #9
    def _nn(self, d, n=1):
        """
        Internal method to be implemented by sub-classes to return the nearest
        `N` neighbors to the given descriptor element.

        When this internal method is called, we have already checked that there
        is a vector in ``d`` and our index is not empty.

        :param d: Descriptor element to compute the neighbors of.
        :type d: smqtk.representation.DescriptorElement

        :param n: Number of nearest neighbors to find.
        :type n: int

        :return: Tuple of nearest N DescriptorElement instances, and a tuple of
            the distance values to those neighbors.
        :rtype: (tuple[smqtk.representation.DescriptorElement], tuple[float])

        """
        q = d.vector()[np.newaxis, :].astype(np.float32)

        self._log.debug("Received query for %d nearest neighbors", n)

        with self._model_lock:
            s_dists, s_ids = self._faiss_index.search(q, n)
            s_dists, s_ids = np.sqrt(s_dists[0, :]), s_ids[0, :]
            uuids = [self._idx2uid_kvs[s_id] for s_id in s_ids]

            descriptors = self._descriptor_set.get_many_descriptors(uuids)

        self._log.debug("Min and max FAISS distances: %g, %g",
                        min(s_dists), max(s_dists))

        descriptors = tuple(descriptors)
        d_vectors = elements_to_matrix(descriptors)
        d_dists = metrics.euclidean_distance(d_vectors, q)

        self._log.debug("Min and max descriptor distances: %g, %g",
                        min(d_dists), max(d_dists))

        order = d_dists.argsort()
        uuids, d_dists = zip(*((uuids[oidx], d_dists[oidx]) for oidx in order))

        self._log.debug("Returning query result of size %g", len(uuids))

        return descriptors, tuple(d_dists)
Example #10
    def _nn(self, d, n=1):
        """
        Internal method to be implemented by sub-classes to return the nearest
        `N` neighbors to the given descriptor element.

        When this internal method is called, we have already checked that there
        is a vector in ``d`` and our index is not empty.

        :param d: Descriptor element to compute the neighbors of.
        :type d: smqtk.representation.DescriptorElement

        :param n: Number of nearest neighbors to find.
        :type n: int

        :return: Tuple of nearest N DescriptorElement instances, and a tuple of
            the distance values to those neighbors.
        :rtype: (tuple[smqtk.representation.DescriptorElement], tuple[float])

        """
        self._log.debug("generating hash for descriptor")
        d_v = d.vector()
        d_h = self.lsh_functor.get_hash(d_v)

        def comp_descr_dist(d2_v):
            return self._distance_function(d_v, d2_v)

        with self._model_lock:
            self._log.debug("getting near hashes")
            hi = self.hash_index
            if hi is None:
                # Make on-the-fly linear index
                hi = LinearHashIndex()
                # not calling ``build_index`` because we already have the int
                # hashes.
                hi.index = numpy.array(list(self.hash2uuids_kvstore.keys()))
            near_hashes, _ = hi.nn(d_h, n)

            self._log.debug("getting UUIDs of descriptors for nearby hashes")
            neighbor_uuids = []
            for h_int in map(bit_vector_to_int_large, near_hashes):
                # If descriptor hash not in our map, we effectively skip it.
                # Get set of descriptor UUIDs for a hash code.
                #: :type: set[collections.Hashable]
                near_uuids = self.hash2uuids_kvstore.get(h_int, set())
                # Accumulate matching descriptor UUIDs to a list.
                neighbor_uuids.extend(near_uuids)
            self._log.debug("-- matched %d UUIDs", len(neighbor_uuids))

            self._log.debug("getting descriptors for neighbor_uuids")
            neighbors = \
                list(self.descriptor_index.get_many_descriptors(neighbor_uuids))

        # Done with model parts at this point, so releasing lock.

        self._log.debug("ordering descriptors via distance method '%s'",
                        self.distance_method)
        self._log.debug('-- getting element vectors')
        neighbor_vectors = elements_to_matrix(neighbors,
                                              report_interval=1.0)
        self._log.debug('-- calculating distances')
        distances = list(map(comp_descr_dist, neighbor_vectors))
        self._log.debug('-- ordering')
        ordered = sorted(zip(neighbors, distances),
                         key=lambda p: p[1])
        self._log.debug('-- slicing top n=%d', n)
        return list(zip(*(ordered[:n])))
Example #11
File: itq.py Project: dhandeo/SMQTK
    def fit(self, descriptors):
        """
        Fit the ITQ model given the input set of descriptors

        :param descriptors: Iterable of ``DescriptorElement`` vectors to fit
            the model to.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        :raises RuntimeError: There is already a model loaded

        :return: Matrix hash codes for provided descriptors in order.
        :rtype: numpy.ndarray[bool]

        """
        if self.has_model():
            raise RuntimeError("Model components have already been loaded.")

        dbg_report_interval = None
        if self.logger().getEffectiveLevel() <= logging.DEBUG:
            dbg_report_interval = 1.0  # seconds
        if not hasattr(descriptors, "__len__"):
            self._log.info("Creating sequence from iterable")
            descriptors_l = []
            rs = [0]*7
            for d in descriptors:
                descriptors_l.append(d)
                report_progress(self._log.debug, rs, dbg_report_interval)
            descriptors = descriptors_l
        self._log.info("Creating matrix of descriptors for fitting")
        x = elements_to_matrix(descriptors, report_interval=dbg_report_interval)
        self._log.debug("descriptor matrix shape: %s", x.shape)

        self._log.debug("Info normalizing descriptors by factor: %s",
                        self.normalize)
        x = self._norm_vector(x)

        self._log.info("Centering data")
        self.mean_vec = numpy.mean(x, axis=0)
        x -= self.mean_vec

        self._log.info("Computing PCA transformation")
        # numpy and matlab observation format is flipped, thus the added
        # transpose.
        self._log.debug("-- computing covariance")
        c = numpy.cov(x.transpose())

        # Direct translation from UNC matlab code
        # - eigen vectors are the columns of ``pc``
        self._log.debug('-- computing linalg.eig')
        l, pc = numpy.linalg.eig(c)
        # ordered by greatest eigenvalue magnitude, keeping top ``bit_len``
        self._log.debug('-- computing top pairs')
        top_pairs = sorted(zip(l, pc.transpose()),
                           key=lambda p: p[0],
                           reverse=1
                           )[:self.bit_length]

        # # Harry translation -- Uses singular values / vectors, not eigen
        # # - singular vectors are the rows of pc
        # pc, l, _ = numpy.linalg.svd(c)
        # top_pairs = sorted(zip(l, pc),
        #                    key=lambda p: p[0],
        #                    reverse=1
        #                    )[:self.bit_length]

        # Eigen-vectors of top ``bit_len`` magnitude eigenvalues
        self._log.debug("-- top vector extraction")
        pc_top = numpy.array([p[1] for p in top_pairs]).transpose()
        self._log.debug("-- transform centered data by PC matrix")
        xx = numpy.dot(x, pc_top)

        self._log.info("Performing ITQ to find optimal rotation")
        c, self.rotation = self._find_itq_rotation(xx, self.itq_iterations)
        # De-adjust rotation with PC vector
        self.rotation = numpy.dot(pc_top, self.rotation)

        self.save_model()

        return c
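
Once `fit` has produced `mean_vec` and `rotation`, new descriptors can presumably be hashed by centering, rotating, and thresholding at zero. A sketch under that assumption (`itq_hash` is a hypothetical helper name):

    import numpy as np

    def itq_hash(vectors, mean_vec, rotation):
        # Center the (already normalized) descriptors, rotate into the learned
        # basis, and threshold at zero to obtain boolean codes.
        return np.dot(vectors - mean_vec, rotation) >= 0.0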
Example #12
    def build_index(self, descriptors):
        """
        Build the index over the descriptors data elements.

        Subsequent calls to this method should rebuild the index, not add to it.

        Implementation Notes:
            - We keep a cache file serialization around for our index in case
                sub-processing occurs so as to be able to recover from the
                underlying C data not being there. This could cause issues if
                a main or child process rebuilds the index, as we clear the old
                cache away.

        :raises ValueError: No data available in the given iterable.

        :param descriptors: Iterable of descriptors elements to build index
            over.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        """
        # Not caring about restoring the index because we're just making a new
        # one
        self._log.info("Building new FLANN index")

        self._log.debug("Storing descriptors")
        self._descr_cache = list(descriptors)
        if not self._descr_cache:
            raise ValueError("No data provided in given iterable.")
        # Cache descriptors if we have a path
        if self._descr_cache_filepath:
            self._log.debug("Caching descriptors: %s",
                            self._descr_cache_filepath)
            safe_create_dir(osp.dirname(self._descr_cache_filepath))
            with open(self._descr_cache_filepath, 'wb') as f:
                cPickle.dump(self._descr_cache, f, -1)

        params = {
            "target_precision": self._build_target_precision,
            "sample_fraction": self._build_sample_frac,
            "log_level": ("info"
                          if self._log.getEffectiveLevel() <= logging.DEBUG
                          else "warning")
        }
        if self._build_autotune:
            params['algorithm'] = "autotuned"
        if self._rand_seed is not None:
            params['random_seed'] = self._rand_seed
        pyflann.set_distance_type(self._distance_method)

        self._log.debug("Accumulating descriptor vectors into matrix for FLANN")
        pts_array = elements_to_matrix(self._descr_cache, report_interval=1.0)

        self._log.debug('Building FLANN index')
        self._flann = pyflann.FLANN()
        self._flann_build_params = self._flann.build_index(pts_array, **params)
        del pts_array

        self._log.debug("Caching index and state: %s, %s",
                        self._index_filepath, self._index_param_filepath)
        if self._index_filepath:
            self._log.debug("Caching index: %s", self._index_filepath)
            safe_create_dir(osp.dirname(self._index_filepath))
            self._flann.save_index(self._index_filepath)
        if self._index_param_filepath:
            self._log.debug("Caching index params: %s",
                            self._index_param_filepath)
            state = {
                'b_autotune': self._build_autotune,
                'b_target_precision': self._build_target_precision,
                'b_sample_frac': self._build_sample_frac,
                'distance_method': self._distance_method,
                'flann_build_params': self._flann_build_params,
            }
            safe_create_dir(osp.dirname(self._index_param_filepath))
            with open(self._index_param_filepath, 'wb') as f:
                cPickle.dump(state, f, -1)

        self._pid = multiprocessing.current_process().pid
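
The pyflann calls above can be exercised standalone on random data; the parameter values below are illustrative only.

    import numpy as np
    import pyflann

    pyflann.set_distance_type("euclidean")
    data = np.random.random((1000, 128)).astype(np.float32)

    flann = pyflann.FLANN()
    build_params = flann.build_index(data, algorithm="autotuned",
                                     target_precision=0.9, sample_fraction=0.1)
    idxs, dists = flann.nn_index(data[:1], 5)    # indices and distances of the 5 nearest rows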
Example #13
    def train(self, positive_classes, negatives):
        """
        Train the supervised SVM classifier model.

        The class label ``negative`` is reserved for the negative class.

        If a model is already loaded, we will raise an exception in order to
        prevent accidental overwrite.

        NOTE:
            This abstract method provides generalized error checking and
            should be called via ``super`` in implementing methods.

        :param positive_classes: Dictionary mapping positive class labels to
            iterables of DescriptorElement training examples.
        :type positive_classes:
            dict[collections.Hashable,
                 collections.Iterable[smqtk.representation.DescriptorElement]]

        :param negatives: Iterable of negative DescriptorElement examples.
        :type negatives: collections.Iterable[smqtk.representation.DescriptorElement]

        :raises ValueError: The ``negative`` label was found in the
            ``positive_classes`` dictionary. This is reserved for the negative
            example class.
        :raises ValueError: There were no positive or negative examples.
        :raises RuntimeError: A model already exists in this instance.
            Following through with training would overwrite this model, so an
            exception is thrown to guard against this.


        """
        super(LibSvmClassifier, self).train(positive_classes, negatives)

        # Offset from 0 for positive class labels to use
        # - not using label of 0 because we think libSVM wants positive labels
        CLASS_LABEL_OFFSET = 1

        # Stuff for debug reporting
        etm_ri = None
        param_debug = {"-q": ""}
        if self._log.getEffectiveLevel() <= logging.DEBUG:
            etm_ri = 1.0
            param_debug = {}

        # Form libSVM problem input values
        self._log.debug("Formatting problem input")
        train_labels = []
        train_vectors = []
        train_group_sizes = []
        self.svm_label_map = {}
        # Making SVM label assignment deterministic to alphabetic order
        for i, l in enumerate(sorted(positive_classes), CLASS_LABEL_OFFSET):
            # Map integer SVM label to semantic label
            self.svm_label_map[i] = l

            self._log.debug("-- class %d (%s)", i, l)
            # requires a sequence, so making the iterable ``g`` a tuple
            g = positive_classes[l]
            if not isinstance(g, collections.Sequence):
                g = tuple(g)

            train_group_sizes.append(float(len(g)))
            x = elements_to_matrix(g, report_interval=etm_ri)
            x = self._norm_vector(x)
            train_labels.extend([i] * x.shape[0])
            train_vectors.extend(x.tolist())
            del g, x

        self._log.debug("-- negatives (-1)")
        # Map integer SVM label to semantic label
        self.svm_label_map[-1] = self.NEGATIVE_LABEL
        # requires a sequence, so making the iterable ``negatives`` a tuple
        if not isinstance(negatives, collections.Sequence):
            negatives = tuple(negatives)
        negatives_size = float(len(negatives))
        x = elements_to_matrix(negatives, report_interval=etm_ri)
        x = self._norm_vector(x)
        train_labels.extend([-1] * x.shape[0])
        train_vectors.extend(x.tolist())
        del negatives, x

        self._log.debug(
            "Training elements: %d labels, %d vectors (should be the same)",
            len(train_labels), len(train_vectors)
        )

        self._log.debug("Forming train params")
        #: :type: dict
        params = deepcopy(self.train_params)
        params.update(param_debug)
        # Only need to calculate positive class weights when C-SVC type
        if "-s" not in params or int(params["-s"]) == 0:
            for i, n in enumerate(train_group_sizes, CLASS_LABEL_OFFSET):
                params["-w" + str(i)] = max(1.0, negatives_size / float(n))

        self._log.debug("Making parameters obj")
        svm_params = svmutil.svm_parameter(self._gen_param_string(params))
        self._log.debug("Creating SVM problem")
        svm_problem = svm.svm_problem(train_labels, train_vectors)
        self._log.debug("Training SVM model")
        self.svm_model = svmutil.svm_train(svm_problem, svm_params)
        self._log.debug("Training SVM model -- Done")

        if self.svm_label_map_fp:
            self._log.debug("saving file -- labels -- %s", self.svm_label_map_fp)
            with open(self.svm_label_map_fp, "wb") as f:
                cPickle.dump(self.svm_label_map, f)
        if self.svm_model_fp:
            self._log.debug("saving file -- model -- %s", self.svm_model_fp)
            svmutil.svm_save_model(self.svm_model_fp, self.svm_model)
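
A toy version of the libSVM training and prediction flow used above, assuming the classic libsvm Python bindings (`svmutil`) are installed; the lists below stand in for the descriptor matrices.

    from svmutil import svm_problem, svm_parameter, svm_train, svm_predict

    labels = [1, 1, -1, -1]
    vectors = [[0.0, 1.0], [0.1, 0.9], [1.0, 0.0], [0.9, 0.1]]

    prob = svm_problem(labels, vectors)
    param = svm_parameter("-s 0 -t 0 -c 1 -q")   # C-SVC, linear kernel, quiet
    model = svm_train(prob, param)
    p_labels, p_acc, p_vals = svm_predict(labels, vectors, model)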
Example #14
    def train(self, class_examples=None, **kwds):
        """
        Train the supervised classifier model.

        If a model is already loaded, we will raise an exception in order to
        prevent accidental overwrite.

        If the same label is provided to both ``class_examples`` and ``kwds``,
        the examples given to the reference in ``kwds`` will prevail.

        :param class_examples: Dictionary mapping class labels to iterables of
            DescriptorElement training examples.
        :type class_examples: dict[collections.Hashable,
                 collections.Iterable[smqtk.representation.DescriptorElement]]

        :param kwds: Keyword assignment of labels to iterables of
            DescriptorElement training examples.
        :type kwds: dict[str,
                 collections.Iterable[smqtk.representation.DescriptorElement]]

        :raises ValueError: There were no class examples provided.
        :raises ValueError: Less than 2 classes were given.
        :raises RuntimeError: A model already exists in this instance.
            Following through with training would overwrite this model, so an
            exception is thrown to guard against this.

        """
        class_examples = \
            super(LibSvmClassifier, self).train(class_examples, **kwds)

        # Offset from 0 for positive class labels to use
        # - not using label of 0 because we think libSVM wants positive labels
        CLASS_LABEL_OFFSET = 1

        # Stuff for debug reporting
        etm_ri = None
        param_debug = {'-q': ''}
        if self._log.getEffectiveLevel() <= logging.DEBUG:
            etm_ri = 1.0
            param_debug = {}

        # Form libSVM problem input values
        self._log.debug("Formatting problem input")
        train_labels = []
        train_vectors = []
        train_group_sizes = []  # number of examples per class
        self.svm_label_map = {}
        # Making SVM label assignment deterministic to alphabetic order
        for i, l in enumerate(sorted(class_examples), CLASS_LABEL_OFFSET):
            # Map integer SVM label to semantic label
            self.svm_label_map[i] = l

            self._log.debug('-- class %d (%s)', i, l)
            # requires a sequence, so making the iterable ``g`` a tuple
            g = class_examples[l]
            if not isinstance(g, collections.Sequence):
                g = tuple(g)

            train_group_sizes.append(float(len(g)))
            x = elements_to_matrix(g, report_interval=etm_ri)
            x = self._norm_vector(x)
            train_labels.extend([i] * x.shape[0])
            train_vectors.extend(x.tolist())
            del g, x

        assert len(train_labels) == len(train_vectors), \
            "Count miss-match between parallel labels and descriptor vectors" \
            "being sent to libSVM (%d != %d)" \
            % (len(train_labels), len(train_vectors))

        self._log.debug("Forming train params")
        #: :type: dict
        params = deepcopy(self.train_params)
        params.update(param_debug)
        # Calculating class weights for C-SVC SVM
        if '-s' not in params or int(params['-s']) == 0:
            total_examples = sum(train_group_sizes)
            for i, n in enumerate(train_group_sizes, CLASS_LABEL_OFFSET):
                # weight is the ratio of between number of other-class examples
                # to the number of examples in this class.
                other_class_examples = total_examples - n
                w = max(1.0, other_class_examples / float(n))
                params['-w' + str(i)] = w
                self._log.debug("-- class '%s' weight: %s",
                                self.svm_label_map[i], w)

        self._log.debug("Making parameters obj")
        svm_params = svmutil.svm_parameter(self._gen_param_string(params))
        self._log.debug("Creating SVM problem")
        svm_problem = svm.svm_problem(train_labels, train_vectors)
        self._log.debug("Training SVM model")
        self.svm_model = svmutil.svm_train(svm_problem, svm_params)
        self._log.debug("Training SVM model -- Done")

        if self.svm_label_map_fp:
            self._log.debug("saving file -- labels -- %s",
                            self.svm_label_map_fp)
            with open(self.svm_label_map_fp, 'wb') as f:
                cPickle.dump(self.svm_label_map, f, -1)
        if self.svm_model_fp:
            self._log.debug("saving file -- model -- %s", self.svm_model_fp)
            svmutil.svm_save_model(self.svm_model_fp, self.svm_model)
Example #15
    def _train(self, class_examples, **extra_params):
        """
        Internal method that trains the classifier implementation.

        This method is called after checking that there is not already a model
        trained, thus it can be assumed that no model currently exists.

        The class labels will have already been checked before entering this
        method, so it can be assumed that ``class_examples`` will contain
        at least two classes.

        :param class_examples: Dictionary mapping class labels to iterables of
            DescriptorElement training examples.
        :type class_examples: dict[collections.Hashable,
                 collections.Iterable[smqtk.representation.DescriptorElement]]

        :param extra_params: Dictionary with extra parameters for training.
            This is not used by this implementation.
        :type extra_params: None | dict[basestring, object]

        """

        # Offset from 0 for positive class labels to use
        # - not using label of 0 because we think libSVM wants positive labels
        CLASS_LABEL_OFFSET = 1

        # Stuff for debug reporting
        etm_ri = None
        param_debug = {'-q': ''}
        if self._log.getEffectiveLevel() <= logging.DEBUG:
            etm_ri = 1.0
            param_debug = {}

        # Form libSVM problem input values
        self._log.debug("Formatting problem input")
        train_labels = []
        train_vectors = []
        train_group_sizes = []  # number of examples per class
        self.svm_label_map = {}
        # Making SVM label assignment deterministic to alphabetic order
        for i, l in enumerate(sorted(class_examples), CLASS_LABEL_OFFSET):
            # Map integer SVM label to semantic label
            self.svm_label_map[i] = l

            self._log.debug('-- class %d (%s)', i, l)
            # requires a sequence, so making the iterable ``g`` a tuple
            g = class_examples[l]
            if not isinstance(g, collections.Sequence):
                self._log.debug('   (expanding iterable into sequence)')
                g = tuple(g)

            train_group_sizes.append(float(len(g)))
            x = elements_to_matrix(g, report_interval=etm_ri)
            x = self._norm_vector(x)
            train_labels.extend([i] * x.shape[0])
            train_vectors.extend(x.tolist())
            del g, x

        assert len(train_labels) == len(train_vectors), \
            "Count mismatch between parallel labels and descriptor vectors" \
            "being sent to libSVM (%d != %d)" \
            % (len(train_labels), len(train_vectors))

        self._log.debug("Forming train params")
        #: :type: dict
        params = deepcopy(self.train_params)
        params.update(param_debug)
        # Calculating class weights if set to C-SVC type SVM
        if '-s' not in params or int(params['-s']) == 0:
            # (john.moeller): The weighting should probably be the geometric
            # mean of the number of examples over the classes divided by the
            # number of examples for the current class.
            gmean = scipy.stats.gmean(train_group_sizes)
            for i, n in enumerate(train_group_sizes, CLASS_LABEL_OFFSET):
                w = gmean / n
                params['-w' + str(i)] = w
                self._log.debug("-- class '%s' weight: %s",
                                self.svm_label_map[i], w)

        self._log.debug("Making parameters obj")
        svm_params = svmutil.svm_parameter(self._gen_param_string(params))
        self._log.debug("Creating SVM problem")
        svm_problem = svm.svm_problem(train_labels, train_vectors)
        del train_vectors
        self._log.debug("Training SVM model")
        self.svm_model = svmutil.svm_train(svm_problem, svm_params)
        self._log.debug("Training SVM model -- Done")

        if self.svm_label_map_elem and self.svm_label_map_elem.writable():
            self._log.debug("saving labels to element (%s)",
                            self.svm_label_map_elem)
            self.svm_label_map_elem.set_bytes(
                cPickle.dumps(self.svm_label_map, -1)
            )
        if self.svm_model_elem and self.svm_model_elem.writable():
            self._log.debug("saving model to element (%s)",
                            self.svm_model_elem)
            # LibSvm I/O only works with filepaths, thus the need for an
            # intermediate temporary file.
            fd, fp = tempfile.mkstemp()
            try:
                svmutil.svm_save_model(fp, self.svm_model)
                # Use the file descriptor to create the file object.
                # This avoids reopening the file and will automatically
                # close the file descriptor on exiting the with block.
                # fdopen() is required because in Python 2 open() does
                # not accept a file descriptor.
                with os.fdopen(fd, 'rb') as f:
                    self.svm_model_elem.set_bytes(f.read())
            finally:
                os.remove(fp)
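
A short illustration of the geometric-mean class weighting computed above, with made-up class sizes:

    import scipy.stats

    train_group_sizes = [50.0, 200.0, 10.0]      # hypothetical per-class example counts
    gmean = scipy.stats.gmean(train_group_sizes)
    weights = {i: gmean / n
               for i, n in enumerate(train_group_sizes, 1)}  # labels offset from 1
    # The smallest class receives the largest weight.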
Example #16
File: itq.py Project: Kitware/SMQTK
    def fit(self, descriptors, use_multiprocessing=True):
        """
        Fit the ITQ model given the input set of descriptors.

        :param descriptors: Iterable of ``DescriptorElement`` vectors to fit
            the model to.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        :param use_multiprocessing: If multiprocessing should be used, as
            opposed to threading, when collecting descriptor elements from the
            given iterable.
        :type use_multiprocessing: bool

        :raises RuntimeError: There is already a model loaded

        :return: Matrix hash codes for provided descriptors in order.
        :rtype: numpy.ndarray[bool]

        """
        if self.has_model():
            raise RuntimeError("Model components have already been loaded.")

        dbg_report_interval = 1.0
        dbg_report = self.get_logger().getEffectiveLevel() <= logging.DEBUG
        if not isinstance(descriptors, Sequence):
            self._log.info("Creating sequence from iterable")
            descriptors_l = []
            pr = ProgressReporter(self._log.debug, dbg_report_interval).start()
            for d in descriptors:
                descriptors_l.append(d)
                dbg_report and pr.increment_report()
            dbg_report and pr.report()
            descriptors = descriptors_l
        if len(descriptors[0].vector()) < self.bit_length:
            raise ValueError("Input descriptors have fewer features than "
                             "requested bit encoding. Hash codes will be "
                             "smaller than requested due to PCA decomposition "
                             "result being bound by number of features.")

        self._log.info("Creating matrix of descriptors for fitting")
        x = elements_to_matrix(descriptors, report_interval=dbg_report_interval,
                               use_multiprocessing=use_multiprocessing)
        self._log.debug("descriptor matrix shape: %s", x.shape)

        self._log.debug("Info normalizing descriptors by factor: %s",
                        self.normalize)
        x = self._norm_vector(x)

        self._log.info("Centering data")
        self.mean_vec = numpy.mean(x, axis=0)
        x -= self.mean_vec

        self._log.info("Computing PCA transformation")
        self._log.debug("-- computing covariance")
        # ``cov`` wants each row to be a feature and each column an observation
        # of those features. Thus, each column should be a descriptor vector,
        # thus we need the transpose here.
        c = numpy.cov(x.transpose())

        if True:
            # Direct translation from UNC matlab code
            # - eigen vectors are the columns of ``pc``
            self._log.debug('-- computing linalg.eig')
            l, pc = numpy.linalg.eig(c)
            self._log.debug('-- ordering eigen vectors by descending eigen '
                            'value')
        else:
            # Harry translation -- Uses singular values / vectors, not eigen
            # - singular vectors are the columns of pc
            self._log.debug('-- computing linalg.svd')
            pc, l, _ = numpy.linalg.svd(c)
            self._log.debug('-- ordering singular vectors by descending '
                            'singular value')

        # Same ordering method for both eig/svd sources.
        l_pc_ordered = sorted(zip(l, pc.transpose()), key=lambda _p: _p[0],
                              reverse=True)

        self._log.debug("-- top vector extraction")
        # Only keep the top ``bit_length`` vectors after ordering by descending
        # value magnitude.
        # - Transposing vectors back to column-vectors.
        pc_top = numpy.array([p[1] for p in l_pc_ordered[:self.bit_length]])\
            .transpose()
        self._log.debug("-- project centered data by PC matrix")
        v = numpy.dot(x, pc_top)

        self._log.info("Performing ITQ to find optimal rotation")
        c, self.rotation = self._find_itq_rotation(v, self.itq_iterations)
        # De-adjust rotation with PC vector
        self.rotation = numpy.dot(pc_top, self.rotation)

        self.save_model()

        return c
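
`_find_itq_rotation` itself is not shown in these examples; per the original ITQ formulation it alternates between fixing the rotation to take sign codes and solving an orthogonal Procrustes problem to update the rotation. A sketch under that assumption, not the verbatim SMQTK implementation:

    import numpy as np

    def find_itq_rotation_sketch(v, n_iter):
        # v: (n, bit_length) matrix of PCA-projected, zero-centered descriptors.
        bit = v.shape[1]
        rng = np.random.RandomState(0)
        r = np.linalg.qr(rng.randn(bit, bit))[0]        # random orthogonal init
        for _ in range(n_iter):
            b = np.where(np.dot(v, r) >= 0, 1.0, -1.0)  # fix R, update sign codes
            u, _, vt = np.linalg.svd(np.dot(v.T, b))    # fix B, orthogonal Procrustes
            r = np.dot(u, vt)
        codes = np.dot(v, r) >= 0                       # final boolean codes
        return codes, r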
Example #17
    def _build_index(self, descriptors):
        """
        Internal method to be implemented by sub-classes to build the index
        with the given descriptor data elements.

        Subsequent calls to this method should rebuild the current index.  This
        method shall not add to the existing index nor raise an exception, so
        as to protect the current index.

        Implementation Notes:
            - We keep a cache file serialization around for our index in case
                sub-processing occurs so as to be able to recover from the
                underlying C data not being there. This could cause issues if
                a main or child process rebuilds the index, as we clear the
                old cache away.

        :param descriptors: Iterable of descriptor elements to build index
            over.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        """
        with self._model_lock:
            # Not caring about restoring the index because we're just making a
            # new one.
            self._log.info("Building new FLANN index")

            self._log.debug("Caching descriptor elements")
            self._descr_cache = list(descriptors)
            # Cache descriptors if we have an element
            if self._descr_cache_elem and self._descr_cache_elem.writable():
                self._log.debug("Caching descriptors: %s",
                                self._descr_cache_elem)
                self._descr_cache_elem.set_bytes(
                    cPickle.dumps(self._descr_cache, -1))

            params = {
                "target_precision": self._build_target_precision,
                "sample_fraction": self._build_sample_frac,
                "log_level": ("info"
                              if self._log.getEffectiveLevel() <= logging.DEBUG
                              else "warning")
            }
            if self._build_autotune:
                params['algorithm'] = "autotuned"
            if self._rand_seed is not None:
                params['random_seed'] = self._rand_seed
            pyflann.set_distance_type(self._distance_method)

            self._log.debug("Accumulating descriptor vectors into matrix for "
                            "FLANN")
            pts_array = elements_to_matrix(self._descr_cache,
                                           report_interval=1.0)

            self._log.debug('Building FLANN index')
            self._flann = pyflann.FLANN()
            self._flann_build_params = self._flann.build_index(
                pts_array, **params)
            del pts_array

            if self._index_elem and self._index_elem.writable():
                self._log.debug("Caching index: %s", self._index_elem)
                # FLANN wants to write to a file, so make a temp file, then
                # read it in, putting bytes into element.
                fd, fp = tempfile.mkstemp()
                try:
                    self._flann.save_index(fp)
                    # Use the file descriptor to create the file object.
                    # This avoids reopening the file and will automatically
                    # close the file descriptor on exiting the with block.
                    # fdopen() is required because in Python 2 open() does
                    # not accept a file descriptor.
                    with os.fdopen(fd, 'rb') as f:
                        self._index_elem.set_bytes(f.read())
                finally:
                    os.remove(fp)
            if self._index_param_elem and self._index_param_elem.writable():
                self._log.debug("Caching index params: %s",
                                self._index_param_elem)
                state = {
                    'b_autotune': self._build_autotune,
                    'b_target_precision': self._build_target_precision,
                    'b_sample_frac': self._build_sample_frac,
                    'distance_method': self._distance_method,
                    'flann_build_params': self._flann_build_params,
                }
                self._index_param_elem.set_bytes(cPickle.dumps(state, -1))

            self._pid = multiprocessing.current_process().pid
Example #18
    def fit(self, descriptors):
        """
        Fit the ITQ model given the input set of descriptors

        :param descriptors: Iterable of ``DescriptorElement`` vectors to fit
            the model to.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        :raises RuntimeError: There is already a model loaded

        :return: Matrix hash codes for provided descriptors in order.
        :rtype: numpy.ndarray[bool]

        """
        if self.has_model():
            raise RuntimeError("Model components have already been loaded.")

        dbg_report_interval = None
        if self.logger().getEffectiveLevel() <= logging.DEBUG:
            dbg_report_interval = 1.0  # seconds
        if not hasattr(descriptors, "__len__"):
            self._log.info("Creating sequence from iterable")
            descriptors_l = []
            rs = [0] * 7
            for d in descriptors:
                descriptors_l.append(d)
                report_progress(self._log.debug, rs, dbg_report_interval)
            descriptors = descriptors_l
        self._log.info("Creating matrix of descriptors for fitting")
        x = elements_to_matrix(descriptors,
                               report_interval=dbg_report_interval)
        self._log.debug("descriptor matrix shape: %s", x.shape)

        self._log.debug("Info normalizing descriptors by factor: %s",
                        self.normalize)
        x = self._norm_vector(x)

        self._log.info("Centering data")
        self.mean_vec = numpy.mean(x, axis=0)
        x -= self.mean_vec

        self._log.info("Computing PCA transformation")
        # numpy and matlab observation format is flipped, thus the added
        # transpose.
        self._log.debug("-- computing covariance")
        c = numpy.cov(x.transpose())

        # Direct translation from UNC matlab code
        # - eigen vectors are the columns of ``pc``
        self._log.debug('-- computing linalg.eig')
        l, pc = numpy.linalg.eig(c)
        # ordered by greatest eigenvalue magnitude, keeping top ``bit_len``
        self._log.debug('-- computing top pairs')
        top_pairs = sorted(zip(l, pc.transpose()),
                           key=lambda p: p[0],
                           reverse=1)[:self.bit_length]

        # # Harry translation -- Uses singular values / vectors, not eigen
        # # - singular vectors are the rows of pc
        # pc, l, _ = numpy.linalg.svd(c)
        # top_pairs = sorted(zip(l, pc),
        #                    key=lambda p: p[0],
        #                    reverse=1
        #                    )[:self.bit_length]

        # Eigen-vectors of top ``bit_len`` magnitude eigenvalues
        self._log.debug("-- top vector extraction")
        pc_top = numpy.array([p[1] for p in top_pairs]).transpose()
        self._log.debug("-- transform centered data by PC matrix")
        xx = numpy.dot(x, pc_top)

        self._log.info("Performing ITQ to find optimal rotation")
        c, self.rotation = self._find_itq_rotation(xx, self.itq_iterations)
        # De-adjust rotation with PC vector
        self.rotation = numpy.dot(pc_top, self.rotation)

        self.save_model()

        return c
Example #19
    def build_index(self, descriptors):
        """
        Build the index over the descriptors data elements.

        Subsequent calls to this method should rebuild the index, not add to it.

        Implementation Notes:
            - We keep a cache file serialization around for our index in case
                sub-processing occurs so as to be able to recover from the
                underlying C data not being there. This could cause issues if
                a main or child process rebuilds the index, as we clear the old
                cache away.

        :raises ValueError: No data available in the given iterable.

        :param descriptors: Iterable of descriptors elements to build index
            over.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        """
        # Not caring about restoring the index because we're just making a new
        # one
        self._log.info("Building new FLANN index")

        self._log.debug("Storing descriptors")
        self._descr_cache = list(descriptors)
        if not self._descr_cache:
            raise ValueError("No data provided in given iterable.")
        # Cache descriptors if we have a path
        if self._descr_cache_filepath:
            self._log.debug("Caching descriptors: %s",
                            self._descr_cache_filepath)
            safe_create_dir(osp.dirname(self._descr_cache_filepath))
            with open(self._descr_cache_filepath, 'wb') as f:
                cPickle.dump(self._descr_cache, f, -1)

        params = {
            "target_precision": self._build_target_precision,
            "sample_fraction": self._build_sample_frac,
            "log_level": ("info"
                          if self._log.getEffectiveLevel() <= logging.DEBUG
                          else "warning")
        }
        if self._build_autotune:
            params['algorithm'] = "autotuned"
        if self._rand_seed is not None:
            params['random_seed'] = self._rand_seed
        pyflann.set_distance_type(self._distance_method)

        self._log.debug(
            "Accumulating descriptor vectors into matrix for FLANN")
        pts_array = elements_to_matrix(self._descr_cache, report_interval=1.0)

        self._log.debug('Building FLANN index')
        self._flann = pyflann.FLANN()
        self._flann_build_params = self._flann.build_index(pts_array, **params)
        del pts_array

        self._log.debug("Caching index and state: %s, %s",
                        self._index_filepath, self._index_param_filepath)
        if self._index_filepath:
            self._log.debug("Caching index: %s", self._index_filepath)
            safe_create_dir(osp.dirname(self._index_filepath))
            self._flann.save_index(self._index_filepath)
        if self._index_param_filepath:
            self._log.debug("Caching index params: %s",
                            self._index_param_filepath)
            state = {
                'b_autotune': self._build_autotune,
                'b_target_precision': self._build_target_precision,
                'b_sample_frac': self._build_sample_frac,
                'distance_method': self._distance_method,
                'flann_build_params': self._flann_build_params,
            }
            safe_create_dir(osp.dirname(self._index_param_filepath))
            with open(self._index_param_filepath, 'wb') as f:
                cPickle.dump(state, f, -1)

        self._pid = multiprocessing.current_process().pid
Example #20
    def fit(self, descriptors, use_multiprocessing=True):
        """
        Fit the ITQ model given the input set of descriptors.

        :param descriptors: Iterable of ``DescriptorElement`` vectors to fit
            the model to.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        :param use_multiprocessing: If multiprocessing should be used, as
            opposed to threading, when collecting descriptor elements from the
            given iterable.
        :type use_multiprocessing: bool

        :raises RuntimeError: There is already a model loaded

        :return: Matrix of hash codes for the provided descriptors, in order.
        :rtype: numpy.ndarray[bool]

        """
        if self.has_model():
            raise RuntimeError("Model components have already been loaded.")

        dbg_report_interval = None
        if self.get_logger().getEffectiveLevel() <= logging.DEBUG:
            dbg_report_interval = 1.0  # seconds
        if not isinstance(descriptors, Sequence):
            self._log.info("Creating sequence from iterable")
            descriptors_l = []
            rs = [0] * 7
            for d in descriptors:
                descriptors_l.append(d)
                report_progress(self._log.debug, rs, dbg_report_interval)
            descriptors = descriptors_l
        if len(descriptors[0].vector()) < self.bit_length:
            raise ValueError("Input descriptors have fewer features than "
                             "requested bit encoding. Hash codes will be "
                             "smaller than requested due to PCA decomposition "
                             "result being bound by number of features.")

        self._log.info("Creating matrix of descriptors for fitting")
        x = elements_to_matrix(descriptors,
                               report_interval=dbg_report_interval,
                               use_multiprocessing=use_multiprocessing)
        self._log.debug("descriptor matrix shape: %s", x.shape)

        self._log.debug("Info normalizing descriptors by factor: %s",
                        self.normalize)
        x = self._norm_vector(x)

        self._log.info("Centering data")
        self.mean_vec = numpy.mean(x, axis=0)
        x -= self.mean_vec

        self._log.info("Computing PCA transformation")
        self._log.debug("-- computing covariance")
        # ``cov`` wants each row to be a feature and each column an observation
        # of those features. Thus, each column should be a descriptor vector,
        # thus we need the transpose here.
        c = numpy.cov(x.transpose())

        # NOTE: the ``else`` branch below is an alternative SVD-based variant
        # kept for reference; only this eigendecomposition branch currently
        # executes.
        if True:
            # Direct translation from UNC matlab code
            # - eigen vectors are the columns of ``pc``
            self._log.debug('-- computing linalg.eig')
            l, pc = numpy.linalg.eig(c)
            self._log.debug('-- ordering eigen vectors by descending eigen '
                            'value')
        else:
            # Harry translation -- Uses singular values / vectors, not eigen
            # - singular vectors are the columns of pc
            self._log.debug('-- computing linalg.svd')
            pc, l, _ = numpy.linalg.svd(c)
            self._log.debug('-- ordering singular vectors by descending '
                            'singular value')

        # Same ordering method for both eig/svd sources.
        l_pc_ordered = sorted(zip(l, pc.transpose()),
                              key=lambda p: p[0],
                              reverse=True)

        self._log.debug("-- top vector extraction")
        # Only keep the top ``bit_length`` vectors after ordering by descending
        # value magnitude.
        # - Transposing vectors back to column-vectors.
        pc_top = numpy.array([p[1] for p in l_pc_ordered[:self.bit_length]])\
            .transpose()
        self._log.debug("-- project centered data by PC matrix")
        v = numpy.dot(x, pc_top)

        self._log.info("Performing ITQ to find optimal rotation")
        c, self.rotation = self._find_itq_rotation(v, self.itq_iterations)
        # De-adjust rotation with PC vector
        self.rotation = numpy.dot(pc_top, self.rotation)

        self.save_model()

        return c
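
Once fit, the stored mean_vec and rotation are all that is needed to hash a new descriptor: center it, rotate it, and threshold at zero. A small sketch of that step with made-up dimensions; the function name and data here are illustrative, not part of the class:

import numpy as np

def itq_hash(vec, mean_vec, rotation):
    # Center the descriptor, apply the learned rotation (which already folds
    # in the PCA projection computed above), and take sign bits.
    z = np.dot(vec - mean_vec, rotation)
    return z >= 0.0   # boolean hash code of length bit_length

rng = np.random.RandomState(0)
mean_vec = rng.rand(128)
rotation = rng.rand(128, 32)            # d x bit_length
code = itq_hash(rng.rand(128), mean_vec, rotation)
print(code.shape, code.dtype)           # (32,) bool
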
Ejemplo n.º 21
0
    def _build_multiple_trees(self, chunk_size=CHUNK_SIZE):
        """
        Build an MRPT structure
        """
        sample = next(self._descriptor_set.iterdescriptors())
        sample_v = sample.vector()
        n = self.count()
        d = sample_v.size
        leaf_size = n / (1 << self._depth)

        self._log.debug(
            "Building %d trees (T) of depth %d (l) from %g descriptors (N) "
            "of length %g",
            self._num_trees, self._depth, n, d)
        self._log.debug(
            "Leaf size             (L = N/2^l)  ~ %g/2^%d = %g",
            n, self._depth, leaf_size)
        self._log.debug(
            "UUIDs stored                (T*N)  = %g * %g = %g",
            self._num_trees, n, self._num_trees*n)
        self._log.debug(
            "Examined UUIDs              (T*L)  ~ %g * %g = %g",
            self._num_trees, leaf_size, self._num_trees*leaf_size)
        self._log.debug(
            "Examined/DB size  (T*L/N = T/2^l)  ~ %g/%g = %.3f",
            self._num_trees*leaf_size, n, self._num_trees*leaf_size/n)

        if (1 << self._depth) > n:
            self._log.warn(
                "There are insufficient elements (%d < 2^%d) to populate "
                "all the leaves of the tree. Consider lowering the depth "
                "parameter.", n, self._depth)

        self._log.debug("Projecting onto random bases")
        # Build all the random bases and the projections at the same time
        # (_num_trees * _depth shouldn't really be that high -- if it is,
        # you're a monster)
        if self._rand_seed is not None:
            np.random.seed(self._rand_seed)
        random_bases = np.random.randn(self._num_trees, d, self._depth)
        projs = np.empty((n, self._num_trees, self._depth), dtype=np.float64)
        # Load the data in chunks (because n * d IS high)
        pts_array = np.empty((chunk_size, d), sample_v.dtype)
        # Enumerate the descriptors and div the index by the chunk size
        # (causes each loop to only deal with at most chunk_size descriptors at
        # a time).
        for k, g in groupby(enumerate(self._descriptor_set.iterdescriptors()),
                            lambda pair: pair[0] // chunk_size):
            # Items are still paired so extract the descriptors
            chunk = list(desc for (i, desc) in g)
            # Take care of dangling end piece
            k_beg = k * chunk_size
            k_end = min((k+1) * chunk_size, n)
            k_len = k_end - k_beg
            # Run the descriptors through elements_to_matrix
            elements_to_matrix(
                chunk, mat=pts_array, report_interval=1.0,
                use_multiprocessing=self._use_multiprocessing)
            # Insert into projection matrix
            projs[k_beg:k_end] = pts_array[:k_len].dot(random_bases)
        del pts_array

        self._log.debug("Constructing trees")
        desc_ids = list(self._descriptor_set.keys())
        # Start with no trees
        self._trees = []
        for t in range(self._num_trees):
            # Array of splits is a packed tree
            splits = np.empty(((1 << self._depth) - 1,), np.float64)

            self._log.debug("Constructing tree #%d", t+1)

            # Build the tree & store it
            leaves = self._build_single_tree(projs[:, t], splits)
            leaves = [[desc_ids[idx] for idx in leaf]
                      for leaf in leaves]
            self._trees.append({
                'random_basis': (random_bases[t]),
                'splits': splits,
                'leaves': leaves
            })
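
Querying one of these trees amounts to walking the breadth-first packed splits array with the query's projection values until a leaf is reached. A hypothetical sketch of that descent; the left/right comparison convention is an assumption here, since _build_single_tree is not shown:

import numpy as np

def descend_packed_tree(proj, splits, depth):
    # splits is a breadth-first packed array of (2^depth - 1) thresholds;
    # node i's children live at indices 2i+1 (left) and 2i+2 (right).
    node = 0
    for level in range(depth):
        if proj[level] <= splits[node]:
            node = 2 * node + 1
        else:
            node = 2 * node + 2
    return node - ((1 << depth) - 1)    # leaf index in [0, 2^depth)

# Toy depth-2 tree: root threshold 0.0, children thresholds -1.0 and 1.0.
splits = np.array([0.0, -1.0, 1.0])
print(descend_packed_tree([0.5, 0.5], splits, 2))   # -> leaf 2
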
Ejemplo n.º 22
0
    def build_index(self, descriptors):
        """
        Build the index over the descriptor data elements.

        The first part of this method is equivalent to the compressITQ function
        from UNC-CH's implementation.

        :raises RuntimeError: A current data model is loaded, or the current
            CodeIndex is not empty.
        :raises ValueError: No data available in the given iterable.

        :param descriptors: Iterable of descriptor elements to build index over.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        """
        # Halt if we are going to overwrite a loaded mean/rotation cache.
        if not (self._mean_vector is None and self._r is None):
            raise RuntimeError("Current ITQ model is not empty (cached mean / "
                               "rotation). For the sake of protecting data, we "
                               "are not proceeding.")
        # Halt if the code index currently isn't empty
        if self.count():
            raise RuntimeError("Current CodeIndex instance is not empty. For "
                               "the sake of protecting data, we are not "
                               "proceeding.")

        self._log.debug("Using %d length bit-vectors", self._bit_len)

        # TODO: Sub-sample down descriptors to use for PCA + ITQ
        #       - Harry was also working on an iterative training approach so
        #           that we only have to have a limited number of vectors in
        #           memory at a time.
        if self._rand_seed is not None:
            numpy.random.seed(self._rand_seed)

        with SimpleTimer("Creating descriptor cache", self._log.info):
            #: :type: list[smqtk.representation.DescriptorElement]
            descr_cache = []
            for d in descriptors:
                descr_cache.append(d)
            if not descr_cache:
                raise ValueError("No descriptors given!")
        with SimpleTimer("Creating matrix of descriptors for training",
                         self._log.info):
            # Get non-memory vectors on separate processes and aggregate into
            # matrix.
            self._log.debug("Input elements: %d", len(descr_cache))
            self._log.debug("Input elem size: %s", descr_cache[0].vector().size)
            dbg_report_interval = None
            if self.logger().getEffectiveLevel() <= logging.DEBUG:
                dbg_report_interval = 1.0  # seconds
            x = elements_to_matrix(descr_cache,
                                   report_interval=dbg_report_interval)
            self._log.debug("descriptor matrix shape: %s", x.shape)

        with SimpleTimer("Centering data", self._log.info):
            # center the data, VERY IMPORTANT for ITQ to work
            self._mean_vector = numpy.mean(x, axis=0)
            x -= self._mean_vector
        if self._mean_vec_cache_filepath:
            with SimpleTimer("Saving mean vector", self._log.info):
                file_utils.safe_create_dir(osp.dirname(self._mean_vec_cache_filepath))
                numpy.save(self._mean_vec_cache_filepath, self._mean_vector)

        # PCA
        with SimpleTimer("Computing PCA transformation", self._log.info):
            # numpy and matlab observation format is flipped, thus added
            # transpose
            self._log.debug("-- computing covariance")
            c = numpy.cov(x.transpose())

            # Direct translation
            # - eigen vectors are the columns of ``pc``
            self._log.debug('-- computing linalg.eig')
            l, pc = numpy.linalg.eig(c)
            # ordered by greatest eigenvalue magnitude, keeping top ``bit_len``
            self._log.debug('-- computing top pairs')
            top_pairs = sorted(zip(l, pc.transpose()),
                               key=lambda p: p[0],
                               reverse=True)[:self._bit_len]

            # # Harry translation -- Uses singular values / vectors, not eigen
            # # - singular vectors are the rows of pc
            # pc, l, _ = numpy.linalg.svd(c)
            # top_pairs = sorted(zip(l, pc),
            #                    key=lambda p: p[0],
            #                    reverse=1
            #                    )[:self._bit_len]

            # Eigen-vectors of top ``bit_len`` magnitude eigenvalues
            self._log.debug("-- top vector extraction")
            pc_top = numpy.array([p[1] for p in top_pairs]).transpose()
            self._log.debug("-- transform centered data by PC matrix")
            xx = numpy.dot(x, pc_top)

        # ITQ to find optimal rotation.
        #   `c` is the output codes for matrix `x`
        #   `r` is the rotation found by ITQ
        with SimpleTimer("Performing ITQ to find optimal rotation",
                         self._log.info):
            c, self._r = self._find_itq_rotation(xx, self._itq_iter_num)
            # De-adjust rotation with PC vector
            self._r = numpy.dot(pc_top, self._r)
        if self._rotation_cache_filepath:
            with SimpleTimer("Saving rotation matrix", self._log.info):
                file_utils.safe_create_dir(osp.dirname(self._rotation_cache_filepath))
                numpy.save(self._rotation_cache_filepath, self._r)

        # Populating small-code index
        #   - Converting bit-vectors proved faster than creating new codes over
        #       again (~0.01s vs ~0.04s for 80 vectors).
        with SimpleTimer("Clearing code index", self._log.info):
            self._code_index.clear()
        with SimpleTimer("Converting bit-vectors into small codes, inserting "
                         "into code index", self._log.info):
            self._code_index.add_many_descriptors(
                (bit_utils.bit_vector_to_int(c[i]), descr_cache[i])
                for i in xrange(c.shape[0])
            )
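
The final loop relies on bit_utils.bit_vector_to_int to pack each boolean code row into a single integer key for the code index. A stand-in showing one plausible packing, purely for illustration:

def bit_vector_to_int_sketch(v):
    # Pack a boolean/0-1 sequence into an integer, first element as the most
    # significant bit (illustrative only; the real helper is
    # bit_utils.bit_vector_to_int).
    code = 0
    for bit in v:
        code = (code << 1) | int(bit)
    return code

assert bit_vector_to_int_sketch([True, False, True, True]) == 0b1011
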
Ejemplo n.º 23
0
    def _build_index(self, descriptors):
        """
        Internal method to be implemented by sub-classes to build the index
        with the given descriptor data elements.

        Subsequent calls to this method should rebuild the current index.  This
        method shall not add to the existing index nor raise an exception so as
        to protect the current index.

        Implementation Notes:
            - We keep a cache file serialization around for our index in case
                sub-processing occurs so as to be able to recover from the
                underlying C data not being there. This could cause issues if
                a main or child process rebuild's the index, as we clear the
                old cache away.

        :param descriptors: Iterable of descriptor elements to build index
            over.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        """
        with self._model_lock:
            # Not caring about restoring the index because we're just making a
            # new one.
            self._log.info("Building new FLANN index")

            self._log.debug("Caching descriptor elements")
            self._descr_cache = list(descriptors)
            # Cache descriptors if we have an element
            if self._descr_cache_elem and self._descr_cache_elem.writable():
                self._log.debug("Caching descriptors: %s",
                                self._descr_cache_elem)
                self._descr_cache_elem.set_bytes(
                    cPickle.dumps(self._descr_cache, -1)
                )

            params = {
                "target_precision": self._build_target_precision,
                "sample_fraction": self._build_sample_frac,
                "log_level": ("info"
                              if self._log.getEffectiveLevel() <= logging.DEBUG
                              else "warning")
            }
            if self._build_autotune:
                params['algorithm'] = "autotuned"
            if self._rand_seed is not None:
                params['random_seed'] = self._rand_seed
            pyflann.set_distance_type(self._distance_method)

            self._log.debug("Accumulating descriptor vectors into matrix for "
                            "FLANN")
            pts_array = elements_to_matrix(self._descr_cache,
                                           report_interval=1.0)

            self._log.debug('Building FLANN index')
            self._flann = pyflann.FLANN()
            self._flann_build_params = self._flann.build_index(pts_array,
                                                               **params)
            del pts_array

            if self._index_elem and self._index_elem.writable():
                self._log.debug("Caching index: %s", self._index_elem)
                # FLANN wants to write to a file, so make a temp file, then
                # read it in, putting bytes into element.
                fd, fp = tempfile.mkstemp()
                try:
                    self._flann.save_index(fp)
                    # Use the file descriptor to create the file object.
                    # This avoids reopening the file and will automatically
                    # close the file descriptor on exiting the with block.
                    # fdopen() is required because in Python 2 open() does
                    # not accept a file descriptor.
                    with os.fdopen(fd, 'rb') as f:
                        self._index_elem.set_bytes(f.read())
                finally:
                    os.remove(fp)
            if self._index_param_elem and self._index_param_elem.writable():
                self._log.debug("Caching index params: %s",
                                self._index_param_elem)
                state = {
                    'b_autotune': self._build_autotune,
                    'b_target_precision': self._build_target_precision,
                    'b_sample_frac': self._build_sample_frac,
                    'distance_method': self._distance_method,
                    'flann_build_params': self._flann_build_params,
                }
                self._index_param_elem.set_bytes(cPickle.dumps(state, -1))

            self._pid = multiprocessing.current_process().pid
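
Restoring the cached index in another process would be the inverse of the temp-file dance above: dump the element's bytes to a temporary file so pyflann (which only deals in file paths) can reload them. A hypothetical sketch, not the class's actual restore code:

import os
import tempfile

def restore_flann_index_sketch(flann, index_bytes, pts_array):
    # Write the cached index bytes to a temp file, reload it into the given
    # pyflann.FLANN instance (which also needs the original point matrix),
    # then clean up the temp file.
    fd, fp = tempfile.mkstemp()
    try:
        with os.fdopen(fd, 'wb') as f:
            f.write(index_bytes)
        flann.load_index(fp, pts_array)
    finally:
        os.remove(fp)
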
Ejemplo n.º 24
0
    def train(self, positive_classes, negatives):
        """
        Train the supervised SVM classifier model.

        The class label ``negative`` is reserved for the negative class.

        If a model is already loaded, we will raise an exception in order to
        prevent accidental overwrite.

        NOTE:
            This abstract method provides generalized error checking and
            should be called via ``super`` in implementing methods.

        :param positive_classes: Dictionary mapping positive class labels to
            iterables of DescriptorElement training examples.
        :type positive_classes:
            dict[collections.Hashable,
                 collections.Iterable[smqtk.representation.DescriptorElement]]

        :param negatives: Iterable of negative DescriptorElement examples.
        :type negatives: collections.Iterable[smqtk.representation.DescriptorElement]

        :raises ValueError: The ``negative`` label was found in the
            ``positive_classes`` dictionary. This is reserved for the negative
            example class.
        :raises ValueError: There were no positive or negative examples.
        :raises RuntimeError: A model already exists in this instance.
            Following through with training would overwrite this model.
            Throwing an exception for information protection.


        """
        super(LibSvmClassifier, self).train(positive_classes, negatives)

        # Offset from 0 for positive class labels to use
        # - not using label of 0 because we think libSVM wants positive labels
        CLASS_LABEL_OFFSET = 1

        # Stuff for debug reporting
        etm_ri = None
        param_debug = {'-q': ''}
        if self._log.getEffectiveLevel() <= logging.DEBUG:
            etm_ri = 1.0
            param_debug = {}

        # Form libSVM problem input values
        self._log.debug("Formatting problem input")
        train_labels = []
        train_vectors = []
        train_group_sizes = []
        self.svm_label_map = {}
        # Making SVM label assignment deterministic to alphabetic order
        for i, l in enumerate(sorted(positive_classes), CLASS_LABEL_OFFSET):
            # Map integer SVM label to semantic label
            self.svm_label_map[i] = l

            self._log.debug('-- class %d (%s)', i, l)
            # requires a sequence, so making the iterable ``g`` a tuple
            g = positive_classes[l]
            if not isinstance(g, collections.Sequence):
                g = tuple(g)

            train_group_sizes.append(float(len(g)))
            x = elements_to_matrix(g, report_interval=etm_ri)
            x = self._norm_vector(x)
            train_labels.extend([i]*x.shape[0])
            train_vectors.extend(x.tolist())
            del g, x

        self._log.debug('-- negatives (-1)')
        # Map integer SVM label to semantic label
        self.svm_label_map[-1] = self.NEGATIVE_LABEL
        # requires a sequence, so making the iterable ``negatives`` a tuple
        if not isinstance(negatives, collections.Sequence):
            negatives = tuple(negatives)
        negatives_size = float(len(negatives))
        x = elements_to_matrix(negatives, report_interval=etm_ri)
        x = self._norm_vector(x)
        train_labels.extend([-1]*x.shape[0])
        train_vectors.extend(x.tolist())
        del negatives, x

        self._log.debug("Training elements: %d labels, %d vectors",
                        len(train_labels), len(train_vectors))

        self._log.debug("Forming train params")
        #: :type: dict
        params = deepcopy(self.train_params)
        params.update(param_debug)
        # Only need to calculate positive class weights when C-SVC type
        if '-s' not in params or int(params['-s']) == 0:
            for i, n in enumerate(train_group_sizes, CLASS_LABEL_OFFSET):
                params['-w'+str(i)] = \
                    max(1.0, negatives_size / float(n))

        self._log.debug("Making parameters obj")
        svm_params = svmutil.svm_parameter(self._gen_param_string(params))
        self._log.debug("Creating SVM problem")
        svm_problem = svm.svm_problem(train_labels, train_vectors)
        self._log.debug("Training SVM model")
        self.svm_model = svmutil.svm_train(svm_problem, svm_params)
        self._log.debug("Training SVM model -- Done")

        if self.svm_label_map_fp:
            self._log.debug("saving file -- labels -- %s", self.svm_label_map_fp)
            with open(self.svm_label_map_fp, 'wb') as f:
                cPickle.dump(self.svm_label_map, f)
        if self.svm_model_fp:
            self._log.debug("saving file -- model -- %s", self.svm_model_fp)
            svmutil.svm_save_model(self.svm_model_fp, self.svm_model)
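
A quick numeric illustration of the C-SVC weighting above, with made-up counts: 200 negatives against positive classes of 50 and 100 examples yield '-w1' and '-w2' values of 4.0 and 2.0 respectively.

negatives_size = 200.0
train_group_sizes = [50.0, 100.0]
weights = {1 + i: max(1.0, negatives_size / n)
           for i, n in enumerate(train_group_sizes)}
print(weights)   # {1: 4.0, 2: 2.0}
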
Ejemplo n.º 25
0
    def train(self, class_examples=None, **kwds):
        """
        Train the supervised classifier model.

        If a model is already loaded, we will raise an exception in order to
        prevent accidental overwrite.

        If the same label is provided to both ``class_examples`` and ``kwds``,
        the examples given to the reference in ``kwds`` will prevail.

        :param class_examples: Dictionary mapping class labels to iterables of
            DescriptorElement training examples.
        :type class_examples: dict[collections.Hashable,
                 collections.Iterable[smqtk.representation.DescriptorElement]]

        :param kwds: Keyword assignment of labels to iterables of
            DescriptorElement training examples.
        :type kwds: dict[str,
                 collections.Iterable[smqtk.representation.DescriptorElement]]

        :raises ValueError: There were no class examples provided.
        :raises ValueError: Less than 2 classes were given.
        :raises RuntimeError: A model already exists in this instance.
            Following through with training would overwrite this model.
            Throwing an exception for information protection.

        """
        class_examples = \
            super(LibSvmClassifier, self).train(class_examples, **kwds)

        # Offset from 0 for positive class labels to use
        # - not using label of 0 because we think libSVM wants positive labels
        CLASS_LABEL_OFFSET = 1

        # Stuff for debug reporting
        etm_ri = None
        param_debug = {'-q': ''}
        if self._log.getEffectiveLevel() <= logging.DEBUG:
            etm_ri = 1.0
            param_debug = {}

        # Form libSVM problem input values
        self._log.debug("Formatting problem input")
        train_labels = []
        train_vectors = []
        train_group_sizes = []  # number of examples per class
        self.svm_label_map = {}
        # Making SVM label assignment deterministic to alphabetic order
        for i, l in enumerate(sorted(class_examples), CLASS_LABEL_OFFSET):
            # Map integer SVM label to semantic label
            self.svm_label_map[i] = l

            self._log.debug('-- class %d (%s)', i, l)
            # requires a sequence, so making the iterable ``g`` a tuple
            g = class_examples[l]
            if not isinstance(g, collections.Sequence):
                self._log.debug('   (expanding iterable into sequence)')
                g = tuple(g)

            train_group_sizes.append(float(len(g)))
            x = elements_to_matrix(g, report_interval=etm_ri)
            x = self._norm_vector(x)
            train_labels.extend([i] * x.shape[0])
            train_vectors.extend(x.tolist())
            del g, x

        assert len(train_labels) == len(train_vectors), \
            "Count miss-match between parallel labels and descriptor vectors" \
            "being sent to libSVM (%d != %d)" \
            % (len(train_labels), len(train_vectors))

        self._log.debug("Forming train params")
        #: :type: dict
        params = deepcopy(self.train_params)
        params.update(param_debug)
        # Calculating class weights for C-SVC SVM
        if '-s' not in params or int(params['-s']) == 0:
            total_examples = sum(train_group_sizes)
            for i, n in enumerate(train_group_sizes, CLASS_LABEL_OFFSET):
                # weight is the ratio of between number of other-class examples
                # to the number of examples in this class.
                other_class_examples = total_examples - n
                w = max(1.0, other_class_examples / float(n))
                params['-w' + str(i)] = w
                self._log.debug("-- class '%s' weight: %s",
                                self.svm_label_map[i], w)

        self._log.debug("Making parameters obj")
        svm_params = svmutil.svm_parameter(self._gen_param_string(params))
        self._log.debug("Creating SVM problem")
        svm_problem = svm.svm_problem(train_labels, train_vectors)
        del train_vectors
        self._log.debug("Training SVM model")
        self.svm_model = svmutil.svm_train(svm_problem, svm_params)
        self._log.debug("Training SVM model -- Done")

        if self.svm_label_map_fp:
            self._log.debug("saving file -- labels -- %s",
                            self.svm_label_map_fp)
            with open(self.svm_label_map_fp, 'wb') as f:
                cPickle.dump(self.svm_label_map, f, -1)
        if self.svm_model_fp:
            self._log.debug("saving file -- model -- %s", self.svm_model_fp)
            svmutil.svm_save_model(self.svm_model_fp, self.svm_model)
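
Here the weight for each class is the ratio of other-class examples to this class's examples, floored at 1.0; a small worked example with made-up counts:

train_group_sizes = [50.0, 100.0, 250.0]
total = sum(train_group_sizes)
weights = {1 + i: max(1.0, (total - n) / n)
           for i, n in enumerate(train_group_sizes)}
print(weights)   # {1: 7.0, 2: 3.0, 3: 1.0}  (0.6 floored up to 1.0)
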
Ejemplo n.º 26
0
    def _train(self, class_examples, **extra_params):
        """
        Internal method that trains the classifier implementation.

        This method is called after checking that there is not already a model
        trained, thus it can be assumed that no model currently exists.

        The class labels will have already been checked before entering this
        method, so it can be assumed that the ``class_examples`` will contain
        at least two classes.

        :param class_examples: Dictionary mapping class labels to iterables of
            DescriptorElement training examples.
        :type class_examples: dict[collections.Hashable,
                 collections.Iterable[smqtk.representation.DescriptorElement]]

        :param extra_params: Dictionary with extra parameters for training.
            This is not used by this implementation.
        :type extra_params: None | dict[basestring, object]

        """

        # Offset from 0 for positive class labels to use
        # - not using label of 0 because we think libSVM wants positive labels
        CLASS_LABEL_OFFSET = 1

        # Stuff for debug reporting
        etm_ri = None
        param_debug = {'-q': ''}
        if self._log.getEffectiveLevel() <= logging.DEBUG:
            etm_ri = 1.0
            param_debug = {}

        # Form libSVM problem input values
        self._log.debug("Formatting problem input")
        train_labels = []
        train_vectors = []
        train_group_sizes = []  # number of examples per class
        self.svm_label_map = {}
        # Making SVM label assignment deterministic to alphabetic order
        for i, l in enumerate(sorted(class_examples), CLASS_LABEL_OFFSET):
            # Map integer SVM label to semantic label
            self.svm_label_map[i] = l

            self._log.debug('-- class %d (%s)', i, l)
            # requires a sequence, so making the iterable ``g`` a tuple
            g = class_examples[l]
            if not isinstance(g, collections.Sequence):
                self._log.debug('   (expanding iterable into sequence)')
                g = tuple(g)

            train_group_sizes.append(float(len(g)))
            x = elements_to_matrix(g, report_interval=etm_ri)
            x = self._norm_vector(x)
            train_labels.extend([i] * x.shape[0])
            train_vectors.extend(x.tolist())
            del g, x

        assert len(train_labels) == len(train_vectors), \
            "Count mismatch between parallel labels and descriptor vectors" \
            "being sent to libSVM (%d != %d)" \
            % (len(train_labels), len(train_vectors))

        self._log.debug("Forming train params")
        #: :type: dict
        params = deepcopy(self.train_params)
        params.update(param_debug)
        # Calculating class weights if set to C-SVC type SVM
        if '-s' not in params or int(params['-s']) == 0:
            # (john.moeller): The weighting should probably be the geometric
            # mean of the number of examples over the classes divided by the
            # number of examples for the current class.
            gmean = scipy.stats.gmean(train_group_sizes)
            for i, n in enumerate(train_group_sizes, CLASS_LABEL_OFFSET):
                w = gmean / n
                params['-w' + str(i)] = w
                self._log.debug("-- class '%s' weight: %s",
                                self.svm_label_map[i], w)

        self._log.debug("Making parameters obj")
        svm_params = svmutil.svm_parameter(self._gen_param_string(params))
        self._log.debug("Creating SVM problem")
        svm_problem = svm.svm_problem(train_labels, train_vectors)
        del train_vectors
        self._log.debug("Training SVM model")
        self.svm_model = svmutil.svm_train(svm_problem, svm_params)
        self._log.debug("Training SVM model -- Done")

        if self.svm_label_map_elem and self.svm_label_map_elem.writable():
            self._log.debug("saving labels to element (%s)",
                            self.svm_label_map_elem)
            self.svm_label_map_elem.set_bytes(
                cPickle.dumps(self.svm_label_map, -1))
        if self.svm_model_elem and self.svm_model_elem.writable():
            self._log.debug("saving model to element (%s)",
                            self.svm_model_elem)
            # LibSvm I/O only works with filepaths, thus the need for an
            # intermediate temporary file.
            fd, fp = tempfile.mkstemp()
            try:
                svmutil.svm_save_model(fp, self.svm_model)
                # Use the file descriptor to create the file object.
                # This avoids reopening the file and will automatically
                # close the file descriptor on exiting the with block.
                # fdopen() is required because in Python 2 open() does
                # not accept a file descriptor.
                with os.fdopen(fd, 'rb') as f:
                    self.svm_model_elem.set_bytes(f.read())
            finally:
                os.remove(fp)
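
The geometric-mean weighting above gives smaller classes proportionally larger weights; a worked example with made-up counts:

import scipy.stats

train_group_sizes = [50.0, 100.0, 200.0]
gmean = scipy.stats.gmean(train_group_sizes)            # ~100.0
weights = {1 + i: gmean / n for i, n in enumerate(train_group_sizes)}
print(weights)   # approximately {1: 2.0, 2: 1.0, 3: 0.5}
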
Ejemplo n.º 27
0
    def _build_multiple_trees(self, chunk_size=CHUNK_SIZE):
        """
        Build an MRPT structure
        """
        sample = next(self._descriptor_set.iterdescriptors())
        sample_v = sample.vector()
        n = self.count()
        d = sample_v.size
        leaf_size = n / (1 << self._depth)

        self._log.debug(
            "Building %d trees (T) of depth %d (l) from %g descriptors (N) "
            "of length %g", self._num_trees, self._depth, n, d)
        self._log.debug("Leaf size             (L = N/2^l)  ~ %g/2^%d = %g", n,
                        self._depth, leaf_size)
        self._log.debug("UUIDs stored                (T*N)  = %g * %g = %g",
                        self._num_trees, n, self._num_trees * n)
        self._log.debug("Examined UUIDs              (T*L)  ~ %g * %g = %g",
                        self._num_trees, leaf_size,
                        self._num_trees * leaf_size)
        self._log.debug("Examined/DB size  (T*L/N = T/2^l)  ~ %g/%g = %.3f",
                        self._num_trees * leaf_size, n,
                        self._num_trees * leaf_size / n)

        if (1 << self._depth) > n:
            self._log.warn(
                "There are insufficient elements (%d < 2^%d) to populate "
                "all the leaves of the tree. Consider lowering the depth "
                "parameter.", n, self._depth)

        self._log.debug("Projecting onto random bases")
        # Build all the random bases and the projections at the same time
        # (_num_trees * _depth shouldn't really be that high -- if it is,
        # you're a monster)
        if self._rand_seed is not None:
            np.random.seed(self._rand_seed)
        random_bases = np.random.randn(self._num_trees, d, self._depth)
        projs = np.empty((n, self._num_trees, self._depth), dtype=np.float64)
        # Load the data in chunks (because n * d IS high)
        pts_array = np.empty((chunk_size, d), sample_v.dtype)
        # Enumerate the descriptors and div the index by the chunk size
        # (causes each loop to only deal with at most chunk_size descriptors at
        # a time).
        for k, g in groupby(enumerate(self._descriptor_set.iterdescriptors()),
                            lambda pair: pair[0] // chunk_size):
            # Items are still paired so extract the descriptors
            chunk = list(desc for (i, desc) in g)
            # Take care of dangling end piece
            k_beg = k * chunk_size
            k_end = min((k + 1) * chunk_size, n)
            k_len = k_end - k_beg
            # Run the descriptors through elements_to_matrix
            elements_to_matrix(chunk,
                               mat=pts_array,
                               report_interval=1.0,
                               use_multiprocessing=self._use_multiprocessing)
            # Insert into projection matrix
            projs[k_beg:k_end] = pts_array[:k_len].dot(random_bases)
        del pts_array

        self._log.debug("Constructing trees")
        desc_ids = list(self._descriptor_set.keys())
        # Start with no trees
        self._trees = []
        for t in range(self._num_trees):
            # Array of splits is a packed tree
            splits = np.empty(((1 << self._depth) - 1, ), np.float64)

            self._log.debug("Constructing tree #%d", t + 1)

            # Build the tree & store it
            leaves = self._build_single_tree(projs[:, t], splits)
            leaves = [[desc_ids[idx] for idx in leaf] for leaf in leaves]
            self._trees.append({
                'random_basis': (random_bases[t]),
                'splits': splits,
                'leaves': leaves
            })
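
The debug messages above encode the basic MRPT sizing trade-offs; plugging in made-up values makes them concrete:

n, num_trees, depth = 1000000, 10, 10
leaf_size = float(n) / (1 << depth)     # ~976.6 descriptors per leaf
uuids_stored = num_trees * n            # 10000000 UUID references kept
examined = num_trees * leaf_size        # ~9766 candidates examined per query
fraction = examined / n                 # ~0.0098 of the database
print(leaf_size, uuids_stored, examined, fraction)
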
Ejemplo n.º 28
0
    def _nn(self, d, n=1):
        """
        Internal method to be implemented by sub-classes to return the nearest
        `N` neighbors to the given descriptor element.

        When this internal method is called, we have already checked that there
        is a vector in ``d`` and our index is not empty.

        :param d: Descriptor element to compute the neighbors of.
        :type d: smqtk.representation.DescriptorElement

        :param n: Number of nearest neighbors to find.
        :type n: int

        :return: Tuple of nearest N DescriptorElement instances, and a tuple of
            the distance values to those neighbors.
        :rtype: (tuple[smqtk.representation.DescriptorElement], tuple[float])

        """
        self._log.debug("generating hash for descriptor")
        d_v = d.vector()
        d_h = self.lsh_functor.get_hash(d_v)

        def comp_descr_dist(d2_v):
            return self._distance_function(d_v, d2_v)

        with self._model_lock:
            self._log.debug("getting near hashes")
            hi = self.hash_index
            if hi is None:
                # Make on-the-fly linear index
                hi = LinearHashIndex()
                # not calling ``build_index`` because we already have the int
                # hashes.
                hi.index = set(self.hash2uuids_kvstore.keys())
            near_hashes, _ = hi.nn(d_h, n)

            self._log.debug("getting UUIDs of descriptors for nearby hashes")
            neighbor_uuids = []
            for h_int in map(bit_vector_to_int_large, near_hashes):
                # If descriptor hash not in our map, we effectively skip it.
                # Get set of descriptor UUIDs for a hash code.
                #: :type: set[collections.Hashable]
                near_uuids = self.hash2uuids_kvstore.get(h_int, set())
                # Accumulate matching descriptor UUIDs to a list.
                neighbor_uuids.extend(near_uuids)
            self._log.debug("-- matched %d UUIDs", len(neighbor_uuids))

            self._log.debug("getting descriptors for neighbor_uuids")
            neighbors = \
                list(self.descriptor_index.get_many_descriptors(neighbor_uuids))

        # Done with model parts at this point, so releasing lock.

        self._log.debug("ordering descriptors via distance method '%s'",
                        self.distance_method)
        self._log.debug('-- getting element vectors')
        neighbor_vectors = elements_to_matrix(neighbors,
                                              report_interval=1.0)
        self._log.debug('-- calculating distances')
        distances = list(map(comp_descr_dist, neighbor_vectors))
        self._log.debug('-- ordering')
        ordered = sorted(zip(neighbors, distances),
                         key=lambda p: p[1])
        self._log.debug('-- slicing top n=%d', n)
        return list(zip(*(ordered[:n])))
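
The tail of the method is a plain exact-distance re-rank of the hash-bucket candidates. A self-contained sketch of that step with toy data; the names are hypothetical, and Euclidean distance stands in for the configured distance_method:

import numpy as np

def rerank(query_vec, neighbor_vectors, neighbors, n, distance_function):
    # Compute exact distances for the candidates gathered via the hash index
    # and keep the n closest, mirroring the ordering/slicing above.
    distances = [distance_function(query_vec, v) for v in neighbor_vectors]
    ordered = sorted(zip(neighbors, distances), key=lambda p: p[1])
    return tuple(zip(*ordered[:n]))

q = np.array([0.0, 0.0])
cand_vecs = np.array([[1.0, 0.0], [3.0, 4.0], [0.5, 0.5]])
cand_ids = ['a', 'b', 'c']
euclid = lambda a, b: float(np.linalg.norm(a - b))
print(rerank(q, cand_vecs, cand_ids, 2, euclid))
# (('c', 'a'), (0.7071..., 1.0))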