Example #1
    def _load_flann_model(self):
        if not self._descr_cache and not self._descr_cache_elem.is_empty():
            # Load descriptor cache
            # - is copied on fork, so only need to load here.
            self._log.debug("Loading cached descriptors")
            self._descr_cache = \
                cPickle.loads(self._descr_cache_elem.get_bytes())

        # Params pickle includes the build params + our local state params
        if self._index_param_elem and not self._index_param_elem.is_empty():
            state = cPickle.loads(self._index_param_elem.get_bytes())
            self._build_autotune = state['b_autotune']
            self._build_target_precision = state['b_target_precision']
            self._build_sample_frac = state['b_sample_frac']
            self._distance_method = state['distance_method']
            self._flann_build_params = state['flann_build_params']

        # Load the binary index
        if self._index_elem and not self._index_elem.is_empty():
            # make numpy matrix of descriptor vectors for FLANN
            pts_array = [d.vector() for d in self._descr_cache]
            pts_array = numpy.array(pts_array, dtype=pts_array[0].dtype)
            pyflann.set_distance_type(self._distance_method)
            self._flann = pyflann.FLANN()
            tmp_fp = self._index_elem.write_temp()
            self._flann.load_index(tmp_fp, pts_array)
            self._index_elem.clean_temp()
            del pts_array, tmp_fp

        # Set the recorded PID to the current process ID
        self._pid = multiprocessing.current_process().pid
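
A note on the load path above: FLANN's saved index file contains only the index structure, not the data vectors, so load_index() must be given the same point matrix the index was built from. That is why this and the following examples first reassemble pts_array from the descriptor cache. Below is a minimal, self-contained sketch of the round trip; the file name and data are made up for illustration and are not part of any project above.

import numpy
import pyflann

# Hypothetical data: 1000 random 32-dimensional float32 vectors.
pts = numpy.random.rand(1000, 32).astype(numpy.float32)

pyflann.set_distance_type('euclidean')
flann = pyflann.FLANN()
flann.build_index(pts, algorithm='kdtree', trees=4)
flann.save_index('example.flann')  # writes only the index structure

# Later, possibly in another process: the original vectors must be
# supplied again alongside the saved index file.
flann2 = pyflann.FLANN()
flann2.load_index('example.flann', pts)
idxs, dists = flann2.nn_index(pts[:5], num_neighbors=3)
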
Example #2
    def load_index(self, dir_path):
        """
        Load a saved index state based on the current configuration.

        :raises SimilarityIndexStateLoadError: Could not load index state.

        :param dir_path: Path to the directory to load the index from.
        :type dir_path: str

        """
        self._restore_index()

        dir_path = osp.abspath(osp.expanduser(dir_path))

        if not (osp.isfile(osp.join(dir_path, self._sf_flann_index))
                and osp.isfile(osp.join(dir_path, self._sf_state))):
            raise SimilarityIndexStateLoadError("Incomplete index save state")

        with open(osp.join(dir_path, self._sf_state), 'rb') as f:
            state = cPickle.load(f)

        self._distance_method = state['distance_method']
        self._rand_seed = state['rand_seed']
        self._descr_cache = state['descr_cache']
        self._flann_build_params = state['flann_params']

        pts_array = [d.vector() for d in self._descr_cache]
        pts_array = numpy.array(pts_array, dtype=pts_array[0].dtype)

        pyflann.set_distance_type(self._distance_method)
        self._flann = pyflann.FLANN()
        self._flann.load_index(osp.join(dir_path, self._sf_flann_index),
                               pts_array)
Example #3
    def _load_flann_model(self):
        if not self._descr_cache and self._descr_cache_filepath:
            # Load descriptor cache
            # - is copied on fork, so only need to load here.
            self._log.debug("Loading cached descriptors")
            with open(self._descr_cache_filepath, "rb") as f:
                self._descr_cache = cPickle.load(f)

        # Params pickle includes the build params + our local state params
        if self._index_param_filepath:
            with open(self._index_param_filepath, 'rb') as f:
                state = cPickle.load(f)
            self._build_autotune = state["b_autotune"]
            self._build_target_precision = state["b_target_precision"]
            self._build_sample_frac = state["b_sample_frac"]
            self._distance_method = state["distance_method"]
            self._flann_build_params = state["flann_build_params"]

        # Load the binary index
        if self._index_filepath:
            # make numpy matrix of descriptor vectors for FLANN
            pts_array = [d.vector() for d in self._descr_cache]
            pts_array = numpy.array(pts_array, dtype=pts_array[0].dtype)
            pyflann.set_distance_type(self._distance_method)
            self._flann = pyflann.FLANN()
            self._flann.load_index(self._index_filepath, pts_array)
            del pts_array

        # Set the recorded PID to the current process ID
        self._pid = multiprocessing.current_process().pid
Example #4
def build_geom_neighbor_graph(geoms, n_neighbors):
    """ Computes the sparse CSR geometrical adjacency matrix gadj

    Parameters
    ----------
    geoms: (n_pts, d) array,
           the geometrical info

    n_neighbors: int,
                 number of neighbors

    Returns
    -------
    gadj: (n_pts, n_pts) sparse CSR array,
          the adjacency matrix
          gadj[i,j] == 1 iff i and j are geometrical neighbors

    Notes
    -----
    gadj might not be symmetric!
    """
    n_pts = geoms.shape[0]
    pyflann.set_distance_type('euclidean')  # squared euclidean actually
    fli = pyflann.FLANN()
    build_params = dict(algorithm='kdtree', num_neighbors=n_neighbors)
    gneighbs, _ = fli.nn(geoms, geoms, **build_params)
    data = np.ones((n_pts, n_neighbors), dtype='u1')
    indptr = np.arange(0, n_pts * n_neighbors + 1, n_neighbors, dtype=int)
    gadj = sparse.csr_matrix(
        (data.ravel(), gneighbs.ravel(), indptr), shape=(n_pts, n_pts))
    return gadj
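
A quick usage sketch for the function above (synthetic inputs, not from the original project): build the neighbor graph over random 2-D points and, if an undirected adjacency is needed, symmetrize it, since as the note says gadj need not be symmetric.

import numpy as np

geoms = np.random.rand(500, 2)                  # hypothetical point locations
gadj = build_geom_neighbor_graph(geoms, n_neighbors=5)

# k-NN relations are not mutual, so symmetrize for an undirected graph.
gadj_sym = ((gadj + gadj.T) > 0).astype('u1')
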
Example #5
    def rank_model(self, pos_ids, neg_ids=()):
        """
        Rank the current model, returning a mapping of element IDs to a
        ranking valuation. This valuation should be a probability in the range
        of [0, 1], where 1.0 is the highest rank and 0.0 is the lowest rank.

        :raises RuntimeError: No current model.

            See implementation for other possible RuntimeError causes.

        :param pos_ids: List of positive data IDs
        :type pos_ids: collections.Iterable of int

        :param neg_ids: List of negative data IDs
        :type neg_ids: collections.Iterable of int

        :return: Mapping of ingest ID to a rank.
        :rtype: dict of (int, float)

        """
        super(NearestNeighbor_HIK_Base, self).rank_model(pos_ids, neg_ids)

        self.log.debug("ND_HIK source exemplars:\n"
                       "Pos: %s\n"
                       "Neg: %s",
                       pos_ids, neg_ids)
        # TODO: add auto-negative selection?

        # Construct / use cached FLANN index from feature data
        pyflann.set_distance_type('cs')  # chi squared
        flann = pyflann.FLANN()
        flann.build_index(self._feature_mat, **{
            "log_level": "info",
        })

        # Find positive/negative centroids
        pos_centroids = self._feature_mat[[self._uid_idx_map[pid]
                                           for pid in pos_ids]]
        #: :type: numpy.core.multiarray.ndarray
        pos_avg_c = pos_centroids.sum(axis=0) / float(len(pos_ids))
        idxs, dists = flann.nn_index(pos_avg_c, self._feature_mat.shape[0])
        pos_dists = numpy.array([v[1] for v in sorted(zip(idxs[0], dists[0]),
                                                      key=lambda e: e[0])])

        if neg_ids:
            neg_centroids = self._feature_mat[[self._uid_idx_map[nid]
                                               for nid in neg_ids]]
            #: :type: numpy.core.multiarray.ndarray
            neg_avg_c = neg_centroids.sum(axis=0) / float(len(neg_ids))
            idxs, dists = flann.nn_index(neg_avg_c, self._feature_mat.shape[0])
            neg_dists = numpy.array([v[1] for v in sorted(zip(idxs[0], dists[0]),
                                                          key=lambda e: e[0])])
            idx_rank = pos_dists / neg_dists
        else:
            idx_rank = pos_dists

        # Constrain to [0,1] range and associate to UIDs
        idx_rank = 1.0 - (idx_rank / idx_rank.max())
        d = dict(zip(self._uid_array, idx_rank))
        return d
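
The final normalization above maps the smallest distance ratio to a rank of 1.0 and the largest to 0.0. A tiny worked example with made-up ratios shows the direction of the mapping:

import numpy

idx_rank = numpy.array([0.2, 1.0, 0.5])    # pretend pos/neg distance ratios
ranks = 1.0 - (idx_rank / idx_rank.max())  # -> array([0.8, 0.0, 0.5])
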
Example #6
 def __init__(self, update_frequency, send_socket, video_source_name, distance_type='euclidean', vision_window_name='VISION', detection_window_name='DETECTION', surf_params=(0, 300, 3, 4), filter=False):
     """Initialize the vision system with its update frequency, send socket, and video source."""
     self.logger = logging.getLogger("Borg.Brain.Vision.SurfDetect")
     self.update_frequency = update_frequency
     self.surf_params = surf_params
     self.n_octaves = self.surf_params[2]
     self.names = []
     self.sizes = {}
     self.images = {}
     self.keypoints = []
     self.descriptors = []
     self.displaynames = {}
     self.modelKeypoints = {}
     self.destination = 'default'
     self.vid_mem_reader = util.vidmemreader.VidMemReader([video_source_name])
     self.send_socket = send_socket
     pyflann.set_distance_type(distance_type)
     self.vision_window = vision_window_name
     self.detection_window = detection_window_name
     self.imageSize = ()
     self.filter = filter  # If False, SURF detections won't be filtered by get_ratio()
     self.reasonable_ratio = True
     self.min_ratio = .05
     self.max_ratio = 3
     self.min_angle = math.radians(70)
     self.max_angle = math.radians(110)
     # Object with the update() function to be called (TODO: clean up this code):
     self.update_object = None
Example #7
 def build_index(self, index_filename=None):
     tt.tic('build_index')
     pyflann.set_distance_type(self.distance_metric)
     self.flann = pyflann.FLANN()
     if index_filename:
         self.flann.load_index(index_filename, self.hists_reduced)
     else:
         self.params = self.flann.build_index(
             self.hists_reduced, algorithm='autotuned',
             sample_fraction=0.3, target_precision=.8,
             build_weight=0.01, memory_weight=0.)
         print(self.params)
     tt.toc('build_index')
     return self
Example #8
    def _restore_index(self):
        """
        If we think we're supposed to have an index, check the recorded PID
        against the current PID, reloading the index from cache if they differ.

        If there is a loaded index and we're on the same process that created
        it, this does nothing.
        """
        if self._flann_index_cache and os.path.isfile(self._flann_index_cache) \
                and self._pid != multiprocessing.current_process().pid:
            pyflann.set_distance_type(self._distance_method)
            self._flann = pyflann.FLANN()

            pts_array = [d.vector() for d in self._descr_cache]
            pts_array = numpy.array(pts_array, dtype=pts_array[0].dtype)
            self._flann.load_index(self._flann_index_cache, pts_array)
            self._pid = multiprocessing.current_process().pid
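
The PID comparison above is the fork-recovery pattern used throughout these examples: the FLANN index lives in C memory that a forked child process cannot rely on, so each call site compares a recorded PID against the current one and reloads the index from cache when they differ. A minimal sketch of that guard as it might sit on such an index class (names are placeholders, not from any of the projects above):

import multiprocessing

def _ensure_index_loaded(self):
    # Reload the FLANN index if we are in a different process than the one
    # that last built or loaded it.
    cur_pid = multiprocessing.current_process().pid
    if self._flann is None or self._pid != cur_pid:
        self._load_flann_model()  # rebuilds self._flann from cached state
        self._pid = cur_pid
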
Example #9
 def __init__(self, C, Q, cell, outfile, barrier=None):
   assert len(C.params) == len(PARAMS_DEFAULT)
   threading.Thread.__init__(self)
   self.qpath = Q.siftpath
   if type(cell) is list:
     self.cellpath = [os.path.join(C.dbdir, c) for c in cell]
   else:
     self.cellpath = os.path.join(C.dbdir, cell)
   self.infodir = C.infodir
   self.celldir = C.dbdir
   self.outfile = outfile
   self.params = C.params
   self.criteria = C.criteria
   self.barrier = barrier
   self.dump = self.outfile + ('-detailed%s.npy' % DETAIL_VERSION)
   pyflann.set_distance_type(C.params['distance_type'])
   self.reader = reader.get_reader(C.params['descriptor'])
Example #10
File: flann.py Project: mrG7/SMQTK
    def _load_flann_model(self):
        # Params pickle includes the build params + our local state params
        with open(self._index_param_filepath, 'rb') as f:
            state = cPickle.load(f)
        self._build_autotune = state['b_autotune']
        self._build_target_precision = state['b_target_precision']
        self._build_sample_frac = state['b_sample_frac']
        self._distance_method = state['distance_method']
        self._flann_build_params = state['flann_build_params']

        # make numpy matrix of descriptor vectors for FLANN
        pts_array = [d.vector() for d in self._descr_cache]
        pts_array = numpy.array(pts_array, dtype=pts_array[0].dtype)
        pyflann.set_distance_type(self._distance_method)
        self._flann = pyflann.FLANN()
        self._flann.load_index(self._index_filepath, pts_array)
        del pts_array

        # Set the recorded PID to the current process ID
        self._pid = multiprocessing.current_process().pid
Example #11
def parse_args():
    parser = argparse.ArgumentParser(description='Mostitch!')
    parser.add_argument('--buffsize', default=1024, help='Buffer Size')
    parser.add_argument('--csound', default=False, help='Print Csound Stuff')
    parser.add_argument('--distance', default='euclidean', help='What Distance type to use: euclidean kl manhattan minkowski hik hellinger cs')
    parser.add_argument('--learn', default=False, help='Turn Learning on or Off')
    parser.add_argument('--window', default="hann", help='Which window function to use: hann saw flat triangle')
    parser.add_argument('--mingrains', default=10, help='Minimum number of grains')
    parser.add_argument('--maxgrains', default=100, help='Maximum number of grains')
    parser.add_argument('--topn', default=20, help='Top N from NN')
    parser.add_argument('files', help='Filenames', nargs='+')
    args = parser.parse_args()
    buffsize = int(args.buffsize)
    csound = args.csound
    learning = args.learn
    myfiles = args.files
    pyflann.set_distance_type(args.distance)
    topn = int(args.topn)
    window_name = args.window
    maxgrains = int(args.maxgrains)
    mingrains = int(args.mingrains)
    state = {
        "maxgrains": maxgrains,
        "mingrains": mingrains,
        "amp": 0.2,
        "topn": topn,
        "delay": 3 * buffsize,
        "learning": learning
    }
    settings = {
        "files": myfiles,
        "window_name": window_name,
        "csound": csound,
        "state": state,
        "buffsize": buffsize
    }
    return settings
Example #12
    def build_index(self, descriptors):
        """
        Build the index over the descriptors data elements.

        Subsequent calls to this method should rebuild the index, not add to it.

        Implementation Notes:
            - We keep a cache file serialization around for our index in case
                sub-processing occurs so as to be able to recover from the
                underlying C data not being there. This could cause issues if
                a main or child process rebuilds the index, as we clear the old
                cache away.

        :raises ValueError: No data available in the given iterable.

        :param descriptors: Iterable of descriptors elements to build index
            over.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        """
        # Not caring about restoring the index because we're just making a new
        # one
        self._log.info("Building new FLANN index")

        self._log.debug("Storing descriptors")
        self._descr_cache = list(descriptors)
        if not self._descr_cache:
            raise ValueError("No data provided in given iterable.")
        # Cache descriptors if we have a path
        if self._descr_cache_filepath:
            self._log.debug("Caching descriptors: %s", self._descr_cache_filepath)
            safe_create_dir(osp.dirname(self._descr_cache_filepath))
            with open(self._descr_cache_filepath, "wb") as f:
                cPickle.dump(self._descr_cache, f)

        params = {
            "target_precision": self._build_target_precision,
            "sample_fraction": self._build_sample_frac,
            "log_level": ("info" if self._log.getEffectiveLevel() <= logging.DEBUG else "warning"),
        }
        if self._build_autotune:
            params["algorithm"] = "autotuned"
        if self._rand_seed is not None:
            params["random_seed"] = self._rand_seed
        pyflann.set_distance_type(self._distance_method)

        self._log.debug("Accumulating descriptor vectors into matrix for FLANN")
        pts_array = [d.vector() for d in self._descr_cache]
        pts_array = numpy.array(pts_array, dtype=pts_array[0].dtype)
        self._flann = pyflann.FLANN()
        self._flann_build_params = self._flann.build_index(pts_array, **params)
        del pts_array

        self._log.debug("Caching index and state: %s, %s", self._index_filepath, self._index_param_filepath)
        if self._index_filepath:
            self._log.debug("Caching index: %s", self._index_filepath)
            safe_create_dir(osp.dirname(self._index_filepath))
            self._flann.save_index(self._index_filepath)
        if self._index_param_filepath:
            self._log.debug("Caching index params: %s", self._index_param_filepath)
            state = {
                "b_autotune": self._build_autotune,
                "b_target_precision": self._build_target_precision,
                "b_sample_frac": self._build_sample_frac,
                "distance_method": self._distance_method,
                "flann_build_params": self._flann_build_params,
            }
            safe_create_dir(osp.dirname(self._index_param_filepath))
            with open(self._index_param_filepath, "w") as f:
                cPickle.dump(state, f)

        self._pid = multiprocessing.current_process().pid
Example #13
    def generate_model(self, data_set):
        """
        Generate this feature detector's data-model given a file ingest. This
        saves the generated model to the currently configured data directory.

        For colorDescriptor, we generate raw features over the ingest data,
        compute a codebook via kmeans, and then create an index with FLANN via
        the "autotune" or linear algorithm to intelligently pick the fastest
        indexing method.

        :param data_set: Set of input data elements to generate the model
            with.
        :type data_set: collections.Set[smqtk.representation.DataElement]

        """
        if self.has_model:
            self._log.warn("ColorDescriptor model for descriptor type '%s' "
                           "already generated!", self.descriptor_type())
            return

        # Check that input data is valid for processing through colorDescriptor
        valid_types = self.valid_content_types()
        invalid_types_found = set()
        for di in data_set:
            if di.content_type() not in valid_types:
                invalid_types_found.add(di.content_type())
        if invalid_types_found:
            self._log.error("Found one or more invalid content types among "
                            "input:")
            for t in sorted(invalid_types_found):
                self._log.error("\t- '%s", t)
            raise ValueError("Discovered invalid content type among input "
                             "data: %s" % sorted(invalid_types_found))

        if not osp.isfile(self.codebook_filepath):
            self._log.info("Did not find existing ColorDescriptor codebook for "
                           "descriptor '%s'.", self.descriptor_type())

            # generate descriptors
            with SimpleTimer("Generating descriptor matrices...",
                             self._log.info):
                descriptors_checkpoint = osp.join(self._work_dir,
                                                  "model_descriptors.npy")

                if osp.isfile(descriptors_checkpoint):
                    self._log.debug("Found existing computed descriptors work "
                                    "file for model generation.")
                    descriptors = numpy.load(descriptors_checkpoint)
                else:
                    self._log.debug("Computing model descriptors")
                    _, descriptors = \
                        self._generate_descriptor_matrices(
                            data_set,
                            limit=self._model_gen_descriptor_limit
                        )
                    _, tmp = tempfile.mkstemp(dir=self._work_dir,
                                              suffix='.npy')
                    self._log.debug("Saving model-gen info/descriptor matrix")
                    numpy.save(tmp, descriptors)
                    os.rename(tmp, descriptors_checkpoint)

            # Compute centroids (codebook) with kmeans
            with SimpleTimer("Computing sklearn.cluster.MiniBatchKMeans...",
                             self._log.info):
                kmeans_verbose = self._log.getEffectiveLevel() <= logging.DEBUG
                kmeans = sklearn.cluster.MiniBatchKMeans(
                    n_clusters=self._kmeans_k,
                    init_size=self._kmeans_k*3,
                    random_state=self._rand_seed,
                    verbose=kmeans_verbose,
                    compute_labels=False,
                )
                kmeans.fit(descriptors)
                codebook = kmeans.cluster_centers_
            with SimpleTimer("Saving generated codebook...", self._log.debug):
                numpy.save(self.codebook_filepath, codebook)
        else:
            self._log.info("Found existing codebook file.")
            codebook = numpy.load(self.codebook_filepath)

        # create FLANN index
        # - autotune will force select linear search if there are < 1000 words
        #   in the codebook vocabulary.
        pyflann.set_distance_type(self._flann_distance_metric)
        flann = pyflann.FLANN()
        if self._log.getEffectiveLevel() <= logging.DEBUG:
            log_level = 'info'
        else:
            log_level = 'warning'
        with SimpleTimer("Building FLANN index...", self._log.info):
            p = {
                "target_precision": self._flann_target_precision,
                "sample_fraction": self._flann_sample_fraction,
                "log_level": log_level,
            }
            if self._flann_autotune:
                p['algorithm'] = "autotuned"
            if self._rand_seed is not None:
                p['random_seed'] = self._rand_seed
            flann_params = flann.build_index(codebook, **p)
        with SimpleTimer("Saving FLANN index to file...", self._log.debug):
            # Save FLANN index data binary
            flann.save_index(self.flann_index_filepath)
            # Save out log of parameters
            with open(self.flann_params_filepath, 'w') as ofile:
                json.dump(flann_params, ofile, indent=4, sort_keys=True)

        # save generation results to class for immediate feature computation use
        self._codebook = codebook
Example #14
    def compute_feature(self, data, no_checkpoint=False):
        """
        Given some kind of data, process and return a feature vector as a Numpy
        array.

        :raises RuntimeError: Feature extraction failure of some kind.

        :param data: Some kind of input data for the feature descriptor. This is
            descriptor dependent.
        :type data:
            SMQTK.utils.DataFile.DataFile or SMQTK.utils.VideoFile.VideoFile

        :param no_checkpoint: Normally, we produce a checkpoint file, which
            contains the numpy feature vector for a given video so that it may
            be loaded instead of re-computed if the same video is visited again.
            If this is True, we do not save such a file to our work directory.

        :return: Feature vector. This is a histogram of N bins where N is the
            number of centroids in the codebook. Bin values is percent
            composition, not absolute counts.
        :rtype: numpy.ndarray

        """
        checkpoint_filepath = self._get_checkpoint_feature_file(data)
        if osp.isfile(checkpoint_filepath):
            # self.log.debug("Found checkpoint feature vector file, loading and "
            #                "returning.")
            return numpy.load(checkpoint_filepath)

        if not self.has_model:
            raise RuntimeError("No model currently loaded! Check the existence "
                               "or, or generate, model files!\n"
                               "Codebook path: %s\n"
                               "FLANN Index path: %s"
                               % (self.codebook_filepath,
                                  self.flann_index_filepath))

        self.log.debug("Computing descriptors...")
        info, descriptors = self._generate_descriptor_matrices(data)

        # Quantization
        # - loaded the model at class initialization if we had one
        self.log.debug("Quantizing descriptors")
        pyflann.set_distance_type(self.FLANN_DISTANCE_FUNCTION)
        flann = pyflann.FLANN()
        flann.load_index(self.flann_index_filepath, self._codebook)
        idxs, dists = flann.nn_index(descriptors)

        # Create histogram
        # - Using explicit bin slots to prevent numpy from automatically
        #   creating tightly constrained bins. This would otherwise cause
        #   histograms between two inputs to be non-comparable (unaligned bins).
        # - See numpy note about ``bins`` to understand why the +1 is necessary
        h, b = numpy.histogram(idxs,  # indices are all integers
                               bins=numpy.arange(self._codebook.shape[0] + 1))
        # self.log.debug("Quantization histogram: %s", h)
        # Normalize histogram into relative frequencies
        # - Not using /= on purpose. h is originally int32 coming out of
        #   histogram. /= would keep int32 type when we want it to be
        #   transformed into a float type by the division.
        if h.sum():
            h = h / float(h.sum())
        else:
            h = numpy.zeros(h.shape, h.dtype)
        # self.log.debug("Normalized histogram: %s", h)

        if not no_checkpoint:
            self.log.debug("Saving checkpoint feature file")
            if not osp.isdir(osp.dirname(checkpoint_filepath)):
                safe_create_dir(osp.dirname(checkpoint_filepath))
            numpy.save(checkpoint_filepath, h)

        return h
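
The explicit bin edges passed to numpy.histogram above are what keep histograms from different inputs aligned to the codebook. A small sketch of the same quantization-to-histogram step with a toy codebook size and made-up nearest-centroid indices:

import numpy

codebook_size = 4                        # toy codebook with 4 centroids
idxs = numpy.array([0, 0, 2, 3, 3, 3])   # pretend nn_index() results

# arange(codebook_size + 1) gives bin edges [0, 1, 2, 3, 4]: one bin per
# centroid, regardless of which indices actually occur in this input.
h, _ = numpy.histogram(idxs, bins=numpy.arange(codebook_size + 1))
# h == [2, 0, 1, 3]
h = h / float(h.sum())                   # relative frequencies: [1/3, 0, 1/6, 1/2]
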
Example #15
    def generate_model(self, data_list, parallel=None, **kwargs):
        """
        Generate this feature detector's data-model given a file ingest. This
        saves the generated model to the currently configured data directory.

        For colorDescriptor, we generate raw features over the ingest data,
        compute a codebook via kmeans, and then create an index with FLANN via
        the "autotune" algorithm to intelligently pick the fastest indexing
        method.

        :param data_list: List of input data elements to generate model with.
        :type data_list: list of SMQTK.utils.DataFile.DataFile
            or tuple of SMQTK.utils.DataFile.DataFile

        :param parallel: Optional specification of how many processors to use
            when pooling sub-tasks. If None, we attempt to use all available
            cores.
        :type parallel: int


        Additional optional key-word arguments
        ======================================

        :param kmeans_k: Centroids to generate. Default of 1024
        :type kmeans_k: int

        :param kmeans_iter: Number of times to run the kmeans algorithms, using
            the centroids from the best run. Default of 5.
        :type kmeans_iter: int

        :param kmeans_threshold: Distortion difference termination threshold.
            KMeans algorithm terminates during a run if the centroid distortion
            since the last iteration is less than this threshold. Default of
            1e-5.
        :type kmeans_threshold: float

        :param flann_target_precision: Target precision percent to tune index
            for. Default is 0.99 (99% accuracy).
        :type flann_target_precision: float

        :param flann_sample_fraction: Fraction of input data to use for index
            auto tuning. Default is 1.0 (100%).
        :type flann_sample_fraction: float

        """
        if self.has_model:
            self.log.warn("ColorDescriptor model for descriptor type '%s' "
                          "already generated!", self.descriptor_type())
            return

        pyflann.set_distance_type(self.FLANN_DISTANCE_FUNCTION)
        flann = pyflann.FLANN()

        if not osp.isfile(self.codebook_filepath):
            self.log.info("Did not find existing ColorDescriptor codebook for "
                          "descriptor '%s'.", self.descriptor_type())

            # generate descriptors
            with SimpleTimer("Generating descriptor matrices...", self.log.debug):
                descriptors_checkpoint = osp.join(self.work_directory,
                                                  "model_descriptors.npy")

                if osp.isfile(descriptors_checkpoint):
                    self.log.debug("Found existing computed descriptors work "
                                   "file for model generation.")
                    descriptors = numpy.load(descriptors_checkpoint)
                else:
                    self.log.debug("Computing model descriptors")
                    _, descriptors = \
                        self._generate_descriptor_matrices(
                            *data_list,
                            limit=self.CODEBOOK_DESCRIPTOR_LIMIT
                        )
                    _, tmp = tempfile.mkstemp(dir=self.work_directory,
                                              suffix='.npy')
                    self.log.debug("Saving model-gen info/descriptor matrix")
                    numpy.save(tmp, descriptors)
                    os.rename(tmp, descriptors_checkpoint)

            # Compute centroids (codebook) with kmeans
            # - NOT performing whitening, as this transforms the feature space
            #   in such a way that newly computed features cannot be applied to
            #   the generated codebook as the same exact whitening
            #   transformation would need to be applied in order for the
            #   comparison to the codebook centroids to be valid.
            # - Alternate kmeans implementations: OpenCV, sklearn, pyflann
            with SimpleTimer("Computing scipy.cluster.vq.kmeans...",
                             self.log.debug):
                codebook, distortion = scipy.cluster.vq.kmeans(
                    descriptors,
                    kwargs.get('kmeans_k', 1024),
                    kwargs.get('kmeans_iter', 5),
                    kwargs.get('kmeans_threshold', 1e-5)
                )
                self.log.debug("KMeans result distortion: %f", distortion)
            # with SimpleTimer("Computing pyflann.FLANN.hierarchical_kmeans...",
            #                  self.log.debug):
            #     # results in 1009 clusters (should, anyway, given the
            #     # function's comment)
            #     codebook2 = flann.hierarchical_kmeans(descriptors, 64, 16, 5)
            with SimpleTimer("Saving generated codebook...", self.log.debug):
                numpy.save(self.codebook_filepath, codebook)
        else:
            self.log.info("Found existing codebook file.")
            codebook = numpy.load(self.codebook_filepath)

        # create FLANN index
        # - autotune will force select linear search if there are < 1000 words
        #   in the codebook vocabulary.
        if self.log.getEffectiveLevel() <= logging.DEBUG:
            log_level = 'info'
        else:
            log_level = 'warning'
        with SimpleTimer("Building FLANN index...", self.log.debug):
            params = flann.build_index(codebook, **{
                "target_precision": kwargs.get("flann_target_precision", 0.99),
                "sample_fraction": kwargs.get("flann_sample_fraction", 1.0),
                "log_level": log_level,
                "algorithm": "autotuned"
            })
            # TODO: Save params dict as JSON?
        with SimpleTimer("Saving FLANN index to file...", self.log.debug):
            flann.save_index(self.flann_index_filepath)

        # save generation results to class for immediate feature computation use
        self._codebook = codebook
Example #16
    def build_index(self, descriptors):
        """
        Build the index over the descriptors data elements.

        Subsequent calls to this method should rebuild the index, not add to it.

        Implementation Notes:
            - We keep a cache file serialization around for our index in case
                sub-processing occurs so as to be able to recover from the
                underlying C data not being there. This could cause issues if
                a main or child process rebuilds the index, as we clear the old
                cache away.

        :raises ValueError: No data available in the given iterable.

        :param descriptors: Iterable of descriptors elements to build index
            over.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        """
        # Not caring about restoring the index because we're just making a new
        # one
        self._log.info("Building new FLANN index")

        self._log.debug("Storing descriptors")
        self._descr_cache = list(descriptors)
        if not self._descr_cache:
            raise ValueError("No data provided in given iterable.")
        # Cache descriptors if we have an element
        if self._descr_cache_elem and self._descr_cache_elem.writable():
            self._log.debug("Caching descriptors: %s", self._descr_cache_elem)
            self._descr_cache_elem.set_bytes(
                cPickle.dumps(self._descr_cache, -1))

        params = {
            "target_precision": self._build_target_precision,
            "sample_fraction": self._build_sample_frac,
            "log_level": ("info"
                          if self._log.getEffectiveLevel() <= logging.DEBUG
                          else "warning")
        }
        if self._build_autotune:
            params['algorithm'] = "autotuned"
        if self._rand_seed is not None:
            params['random_seed'] = self._rand_seed
        pyflann.set_distance_type(self._distance_method)

        self._log.debug(
            "Accumulating descriptor vectors into matrix for FLANN")
        pts_array = elements_to_matrix(self._descr_cache, report_interval=1.0)

        self._log.debug('Building FLANN index')
        self._flann = pyflann.FLANN()
        self._flann_build_params = self._flann.build_index(pts_array, **params)
        del pts_array

        if self._index_elem and self._index_elem.writable():
            self._log.debug("Caching index: %s", self._index_elem)
            # FLANN wants to write to a file, so make a temp file, then read it
            # in, putting bytes into element.
            fd, fp = tempfile.mkstemp()
            try:
                self._flann.save_index(fp)
                self._index_elem.set_bytes(os.read(fd, os.path.getsize(fp)))
            finally:
                os.close(fd)
                os.remove(fp)
        if self._index_param_elem and self._index_param_elem.writable():
            self._log.debug("Caching index params: %s", self._index_param_elem)
            state = {
                'b_autotune': self._build_autotune,
                'b_target_precision': self._build_target_precision,
                'b_sample_frac': self._build_sample_frac,
                'distance_method': self._distance_method,
                'flann_build_params': self._flann_build_params,
            }
            self._index_param_elem.set_bytes(cPickle.dumps(state, -1))

        self._pid = multiprocessing.current_process().pid
Example #17
    def build_index(self, descriptors):
        """
        Build the index over the descriptors data elements.

        Subsequent calls to this method should rebuild the index, not add to it.

        Implementation Notes:
            - We keep a cache file serialization around for our index in case
                sub-processing occurs so as to be able to recover from the
                underlying C data not being there. This could cause issues if
                a main or child process rebuilds the index, as we clear the old
                cache away.

        :raises ValueError: No data available in the given iterable.

        :param descriptors: Iterable of descriptors elements to build index
            over.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        """
        # Not caring about restoring the index because we're just making a new
        # one
        self._log.info("Building new FLANN index")

        self._log.debug("Storing descriptors")
        self._descr_cache = list(descriptors)
        if not self._descr_cache:
            raise ValueError("No data provided in given iterable.")
        # Cache descriptors if we have a path
        if self._descr_cache_filepath:
            self._log.debug("Caching descriptors: %s",
                            self._descr_cache_filepath)
            safe_create_dir(osp.dirname(self._descr_cache_filepath))
            with open(self._descr_cache_filepath, 'wb') as f:
                cPickle.dump(self._descr_cache, f)

        params = {
            "target_precision": self._build_target_precision,
            "sample_fraction": self._build_sample_frac,
            "log_level": ("info"
                          if self._log.getEffectiveLevel() <= logging.DEBUG
                          else "warning")
        }
        if self._build_autotune:
            params['algorithm'] = "autotuned"
        if self._rand_seed is not None:
            params['random_seed'] = self._rand_seed
        pyflann.set_distance_type(self._distance_method)

        self._log.debug("Accumulating descriptor vectors into matrix for FLANN")
        pts_array = [d.vector() for d in self._descr_cache]
        pts_array = numpy.array(pts_array, dtype=pts_array[0].dtype)
        self._flann = pyflann.FLANN()
        self._flann_build_params = self._flann.build_index(pts_array, **params)
        del pts_array

        self._log.debug("Caching index and state: %s, %s",
                        self._index_filepath, self._index_param_filepath)
        if self._index_filepath:
            self._log.debug("Caching index: %s", self._index_filepath)
            safe_create_dir(osp.dirname(self._index_filepath))
            self._flann.save_index(self._index_filepath)
        if self._index_param_filepath:
            self._log.debug("Caching index params: %s",
                            self._index_param_filepath)
            state = {
                'b_autotune': self._build_autotune,
                'b_target_precision': self._build_target_precision,
                'b_sample_frac': self._build_sample_frac,
                'distance_method': self._distance_method,
                'flann_build_params': self._flann_build_params,
            }
            safe_create_dir(osp.dirname(self._index_param_filepath))
            with open(self._index_param_filepath, 'wb') as f:
                cPickle.dump(state, f)

        self._pid = multiprocessing.current_process().pid
Example #18
    def _build_index(self, descriptors: Iterable[DescriptorElement]) -> None:
        """
        Internal method to be implemented by sub-classes to build the index
        with the given descriptor data elements.

        Subsequent calls to this method should rebuild the current index.  This
        method shall not add to the existing index nor raise an exception, so
        as to protect the current index.

        Implementation Notes:
            - We keep a cache file serialization around for our index in case
                sub-processing occurs so as to be able to recover from the
                underlying C data not being there. This could cause issues if
                a main or child process rebuilds the index, as we clear the
                old cache away.

        :param descriptors: Iterable of descriptor elements to build index
            over.
        :type descriptors:
            collections.abc.Iterable[smqtk.representation.DescriptorElement]

        """
        with self._model_lock:
            # Not caring about restoring the index because we're just making a
            # new one.
            LOG.info("Building new FLANN index")

            LOG.debug("Caching descriptor elements")
            self._descr_cache = list(descriptors)
            # Cache descriptors if we have an element
            if self._descr_cache_elem and self._descr_cache_elem.writable():
                LOG.debug(f"Caching descriptors: {self._descr_cache_elem}")
                self._descr_cache_elem.set_bytes(
                    pickle.dumps(self._descr_cache, -1))

            params = {
                "target_precision": self._build_target_precision,
                "sample_fraction": self._build_sample_frac,
                "log_level": ("info"
                              if LOG.getEffectiveLevel() <= logging.DEBUG
                              else "warning")
            }
            if self._build_autotune:
                params['algorithm'] = "autotuned"
            if self._rand_seed is not None:
                params['random_seed'] = self._rand_seed
            pyflann.set_distance_type(self._distance_method)

            LOG.debug("Accumulating descriptor vectors into matrix for FLANN")
            pts_array = numpy.asarray(
                list(parallel_map(lambda d_: d_.vector(), self._descr_cache)))

            LOG.debug('Building FLANN index')
            self._flann = pyflann.FLANN()
            self._flann_build_params = self._flann.build_index(
                pts_array, **params)
            del pts_array

            if self._index_elem and self._index_elem.writable():
                LOG.debug("Caching index: %s", self._index_elem)
                # FLANN wants to write to a file, so make a temp file, then
                # read it in, putting bytes into element.
                fd, fp = tempfile.mkstemp()
                try:
                    self._flann.save_index(fp)
                    # Use the file descriptor to create the file object.
                    # This avoids reopening the file and will automatically
                    # close the file descriptor on exiting the with block.
                    # fdopen() is required because in Python 2 open() does
                    # not accept a file descriptor.
                    with os.fdopen(fd, 'rb') as f:
                        self._index_elem.set_bytes(f.read())
                finally:
                    os.remove(fp)
            if self._index_param_elem and self._index_param_elem.writable():
                LOG.debug(f"Caching index params: {self._index_param_elem}")
                state = {
                    'b_autotune': self._build_autotune,
                    'b_target_precision': self._build_target_precision,
                    'b_sample_frac': self._build_sample_frac,
                    'distance_method': self._distance_method,
                    'flann_build_params': self._flann_build_params,
                }
                self._index_param_elem.set_bytes(pickle.dumps(state, -1))

            self._pid = multiprocessing.current_process().pid
Example #19
    def build_index(self, descriptors):
        """
        Build the index over the descriptors data elements.

        Subsequent calls to this method should rebuild the index, not add to it.

        Implementation Notes:
            - We keep a cache file serialization around for our index in case
                sub-processing occurs so as to be able to recover from the
                underlying C data not being there. This could cause issues if
                a main or child process rebuilds the index, as we clear the old
                cache away.

        :raises ValueError: No data available in the given iterable.

        :param descriptors: Iterable of descriptors elements to build index over.
        :type descriptors: collections.Iterable[smqtk.data_rep.DescriptorElement]

        """
        # If there is already an index, clear the cache file if we are in the
        # same process that created our current index.
        if self._flann_index_cache and os.path.isfile(self._flann_index_cache) \
                and self._pid == multiprocessing.current_process().pid:
            self._log.debug('removing old index cache file')
            os.remove(self._flann_index_cache)

        self._log.debug("Building new index")

        # Compute descriptors for data elements
        self._log.debug("Computing descriptors for data")
        # uid2vec = \
        #     self._content_descriptor.compute_descriptor_async(data)
        # Translate returned mapping into cache lists
        self._descr_cache = [d for d in sorted(descriptors,
                                               key=lambda e: e.uuid())]
        if not self._descr_cache:
            raise ValueError("No data provided in given iterable.")

        # numpy array version for FLANN
        pts_array = [d.vector() for d in self._descr_cache]
        pts_array = numpy.array(pts_array, dtype=pts_array[0].dtype)

        # Reset PID/FLANN/saved cache
        self._pid = multiprocessing.current_process().pid
        safe_create_dir(self._temp_dir)
        fd, self._flann_index_cache = tempfile.mkstemp(".flann",
                                                       dir=self._temp_dir)
        os.close(fd)
        self._log.debug("Building FLANN index")
        params = {
            "algorithm": self._build_autotune,
            "target_precision": self._build_target_precision,
            "sample_fraction": self._build_sample_frac,
            "log_level": ("info"
                          if self._log.getEffectiveLevel() <= logging.DEBUG
                          else "warn")
        }
        if self._rand_seed is not None:
            params['random_seed'] = self._rand_seed
        pyflann.set_distance_type(self._distance_method)
        self._flann = pyflann.FLANN()
        self._flann_build_params = self._flann.build_index(pts_array, **params)

        # Saving out index cache
        self._log.debug("Saving index to cache file: %s",
                        self._flann_index_cache)
        self._flann.save_index(self._flann_index_cache)
Example #20
def tsdf2chamfer_fortex2shape(target_v, tsdf_gt, param_reg, param_gt,
                              savepath1, savepath2):

    # Generate deformed coarse model
    voxelpath = "./models/TshapeCoarseTetraD.ply"
    v_mesh = pymesh.load_mesh(voxelpath)

    VoxCnt = v_mesh.num_voxels

    voxelArray = np.array(v_mesh.voxels)
    vertexArray = np.array(v_mesh.vertices, dtype=np.float32)
    vertexvoxelArray = vertexArray[voxelArray].reshape(
        voxelArray.shape[0], 1, 12)[:, 0].astype(np.float32)

    weights = np.load(
        './models/coarseweights.npy'
    )  #weights of the coarse tetrahedra volume interpolated by SMPL body
    J = np.load('./models/Tshapecoarsejoints.npy')
    J_shapedir = np.load("./models/J_shapedir.npy"
                         )  # J_shapedir (can deform joints depend on 10 betas)

    # Deform coarse model
    kintree_table = np.array([[
        4294967295, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 12, 13, 14, 16,
        17, 18, 19, 20, 21
    ],
                              [
                                  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                                  14, 15, 16, 17, 18, 19, 20, 21, 22, 23
                              ]])

    vertexArrayD_gt, jointArrayD_gt = Warp.warpVolume(vertexArray, J,
                                                      J_shapedir,
                                                      param_gt["pose"],
                                                      param_gt["betas"],
                                                      kintree_table, weights)

    Size = np.array([1, 4, VoxCnt])
    GPUManager = GPU.GPUManager()

    MarchingTetrahedra = My_MT.MarchingTetrahedra(Size, tsdf_gt, 0.0,
                                                  vertexvoxelArray, voxelArray,
                                                  vertexArrayD_gt, GPUManager)
    Vertices_gt = MarchingTetrahedra.run_CPU(tsdf_gt, 0.0, voxelArray,
                                             vertexArrayD_gt)
    MarchingTetrahedra.SaveToPly_CPU(savepath2)

    pyflann.set_distance_type("euclidean")
    flann = FLANN()
    flann.build_index(Vertices_gt,
                      algorithm='kmeans',
                      centers_init='kmeanspp',
                      random_seed=1984)
    vertIds, dists = flann.nn_index(target_v, num_neighbors=1)
    chamfer_pyflann = np.average(dists)

    sourcepc = o3d.geometry.PointCloud()
    sourcepc.points = o3d.utility.Vector3dVector(Vertices_gt)
    targetpc = o3d.geometry.PointCloud()
    targetpc.points = o3d.utility.Vector3dVector(target_v)

    dists = np.array(targetpc.compute_point_cloud_distance(sourcepc))
    chamfer_o3d = np.average(dists)

    return chamfer_o3d, chamfer_pyflann
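
The pyflann portion above is a one-sided nearest-neighbour average. Below is a standalone sketch of just that piece with random point sets; note that with the 'euclidean' distance type FLANN returns squared distances, so a square root may be wanted depending on which chamfer definition is being matched. The array names and sizes are made up for illustration.

import numpy as np
import pyflann

source = np.random.rand(2000, 3).astype(np.float32)   # e.g. reconstructed vertices
target = np.random.rand(1500, 3).astype(np.float32)   # e.g. scanned vertices

pyflann.set_distance_type("euclidean")   # squared euclidean, as noted earlier
flann = pyflann.FLANN()
flann.build_index(source, algorithm="kmeans", centers_init="kmeanspp",
                  random_seed=1984)
_, sq_dists = flann.nn_index(target, num_neighbors=1)
chamfer_one_sided = np.average(np.sqrt(sq_dists))
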
Example #21
    def _compute_descriptor(self, data):
        """
        Given some kind of data, process and return a feature vector as a Numpy
        array.

        :raises RuntimeError: Feature extraction failure of some kind.

        :param data: Some kind of input data for the feature descriptor. This is
            descriptor dependent.
        :type data: smqtk.representation.DataElement

        :return: Feature vector. This is a histogram of N bins where N is the
            number of centroids in the codebook. Bin values is percent
            composition, not absolute counts.
        :rtype: numpy.ndarray

        """
        super(ColorDescriptor_Base, self)._compute_descriptor(data)

        checkpoint_filepath = self._get_checkpoint_feature_file(data)
        # if osp.isfile(checkpoint_filepath):
        #     return numpy.load(checkpoint_filepath)

        if not self.has_model:
            raise RuntimeError("No model currently loaded! Check the existence "
                               "or, or generate, model files!\n"
                               "Codebook path: %s\n"
                               "FLANN Index path: %s"
                               % (self.codebook_filepath,
                                  self.flann_index_filepath))

        self._log.debug("Computing descriptors for data UID[%s]...",
                        data.uuid())
        info, descriptors = self._generate_descriptor_matrices({data})

        # Load FLANN components
        pyflann.set_distance_type(self._flann_distance_metric)
        flann = pyflann.FLANN()
        flann.load_index(self.flann_index_filepath, self._codebook)

        if not self._use_sp:
            ###
            # Codebook Quantization
            #
            # - loaded the model at class initialization if we had one
            self._log.debug("Quantizing descriptors")

            try:
                # If the distance method is HIK, we need to treat it special
                # since that method produces a similarity score, not a distance
                # score.
                #
                if self._flann_distance_metric == 'hik':
                    # This searches for all NN instead of minimum between n and
                    # the number of descriptors and keeps the last one because
                    # hik is a similarity score and not a distance, which is
                    # also why the values in dists is flipped below.
                    #: :type: numpy.core.multiarray.ndarray, numpy.core.multiarray.ndarray
                    idxs = flann.nn_index(descriptors,
                                          self._codebook.shape[0])[0]
                    # Only keep the last index for each descriptor return
                    idxs = numpy.array([i_array[-1] for i_array in idxs])
                else:
                    # :type: numpy.core.multiarray.ndarray, numpy.core.multiarray.ndarray
                    idxs = flann.nn_index(descriptors, 1)[0]
            except AssertionError:

                self._log.error("Codebook shape  : %s", self._codebook.shape)
                self._log.error("Descriptor shape: %s", descriptors.shape)

                raise

            # Create histogram
            # - Using explicit bin slots to prevent numpy from automatically
            #   creating tightly constrained bins. This would otherwise cause
            #   histograms between two inputs to be non-comparable (unaligned
            #   bins).
            # - See numpy note about ``bins`` to understand why the +1 is
            #   necessary
            # - Learned from spatial implementation that we could feed multiple
            #   neighbors per descriptor into here, leading to a more populated
            #   histogram.
            #   - Could also possibly weight things based on dist from
            #     descriptor?
            #: :type: numpy.core.multiarray.ndarray
            h = numpy.histogram(idxs,  # indices are all integers
                                bins=numpy.arange(self._codebook.shape[0]+1))[0]
            # self._log.debug("Quantization histogram: %s", h)
            # Normalize histogram into relative frequencies
            # - Not using /= on purpose. h is originally int32 coming out of
            #   histogram. /= would keep int32 type when we want it to be
            #   transformed into a float type by the division.
            if h.sum():
                # noinspection PyAugmentAssignment
                h = h / float(h.sum())
            else:
                h = numpy.zeros(h.shape, h.dtype)
            # self._log.debug("Normalized histogram: %s", h)

        else:
            ###
            # Spatial Pyramid Quantization
            #
            self._log.debug("Quantizing descriptors using spatial pyramid")
            ##
            # Quantization factor - number of nearest codes to be saved
            q_factor = 10
            ##
            # Concatenating spatial information to descriptor vectors to format:
            #   [ x y <descriptor> ]
            self._log.debug("Creating combined descriptor matrix")
            m = numpy.concatenate((info[:, :2],
                                   descriptors), axis=1)
            ##
            # Creating quantized vectors, consisting vector:
            #   [ x y c_1 ... c_qf dist_1 ... dist_qf ]
            # which has a total size of 2+(qf*2)
            #
            # Sangmin's code included the distances in the quantized vector, but
            # then also passed this vector into numpy's histogram function with
            # integral bins, causing the [0,1] to be heavily populated, which
            # doesn't make sense to do.
            #   idxs, dists = flann.nn_index(m[:, 2:], q_factor)
            #   q = numpy.concatenate([m[:, :2], idxs, dists], axis=1)
            self._log.debug("Computing nearest neighbors")
            if self._flann_distance_metric == 'hik':
                # Query full ordering of code indices
                idxs = flann.nn_index(m[:, 2:], self._codebook.shape[0])[0]
                # Extract the right-side block for use in building histogram
                # Order doesn't actually matter in the current implementation
                #   because index relative position is not being weighted.
                idxs = idxs[:, -q_factor:]
            else:
                idxs = flann.nn_index(m[:, 2:], q_factor)[0]
            self._log.debug("Creating quantization matrix")
            # This matrix consists of descriptor (x,y) position + near code
            #   indices.
            q = numpy.concatenate([m[:, :2], idxs], axis=1)
            ##
            # Build spatial pyramid from quantized matrix
            self._log.debug("Building spatial pyramid histograms")
            hist_sp = self._build_sp_hist(q, self._codebook.shape[0])
            ##
            # Combine each quadrants into single vector
            self._log.debug("Combining global+thirds into final histogram.")
            f = sys.float_info.min  # so we don't divide by 0 accidentally

            def rf_norm(hist):
                return hist / (float(hist.sum()) + f)
            h = numpy.concatenate([rf_norm(hist_sp[0]),
                                   rf_norm(hist_sp[5]),
                                   rf_norm(hist_sp[6]),
                                   rf_norm(hist_sp[7])],
                                  axis=1)
            # noinspection PyAugmentAssignment
            h /= h.sum()

        self._log.debug("Saving checkpoint feature file")
        if not osp.isdir(osp.dirname(checkpoint_filepath)):
            file_utils.safe_create_dir(osp.dirname(checkpoint_filepath))
        numpy.save(checkpoint_filepath, h)

        return h
Example #22
    def generate_model(self, data_set, **kwargs):
        """
        Generate this feature detector's data-model given a file ingest. This
        saves the generated model to the currently configured data directory.

        For colorDescriptor, we generate raw features over the ingest data,
        compute a codebook via kmeans, and then create an index with FLANN via
        the "autotune" or linear algorithm to intelligently pick the fastest
        indexing method.

        :param data_set: Set of input data elements to generate the model
            with.
        :type data_set: collections.Set[smqtk.representation.DataElement]

        """
        if self.has_model:
            self._log.warn("ColorDescriptor model for descriptor type '%s' "
                           "already generated!", self.descriptor_type())
            return

        # Check that input data is valid for processing through colorDescriptor
        valid_types = self.valid_content_types()
        invalid_types_found = set()
        for di in data_set:
            if di.content_type() not in valid_types:
                invalid_types_found.add(di.content_type())
        if invalid_types_found:
            self._log.error("Found one or more invalid content types among "
                            "input:")
            for t in sorted(invalid_types_found):
                self._log.error("\t- '%s", t)
            raise ValueError("Discovered invalid content type among input "
                             "data: %s" % sorted(invalid_types_found))

        if not osp.isfile(self.codebook_filepath):
            self._log.info("Did not find existing ColorDescriptor codebook for "
                           "descriptor '%s'.", self.descriptor_type())

            # generate descriptors
            with SimpleTimer("Generating descriptor matrices...",
                             self._log.info):
                descriptors_checkpoint = osp.join(self._work_dir,
                                                  "model_descriptors.npy")

                if osp.isfile(descriptors_checkpoint):
                    self._log.debug("Found existing computed descriptors work "
                                    "file for model generation.")
                    descriptors = numpy.load(descriptors_checkpoint)
                else:
                    self._log.debug("Computing model descriptors")
                    _, descriptors = \
                        self._generate_descriptor_matrices(
                            data_set,
                            limit=self._model_gen_descriptor_limit
                        )
                    _, tmp = tempfile.mkstemp(dir=self._work_dir,
                                              suffix='.npy')
                    self._log.debug("Saving model-gen info/descriptor matrix")
                    numpy.save(tmp, descriptors)
                    os.rename(tmp, descriptors_checkpoint)

            # Compute centroids (codebook) with kmeans
            with SimpleTimer("Computing sklearn.cluster.MiniBatchKMeans...",
                             self._log.info):
                kmeans_verbose = self._log.getEffectiveLevel() <= logging.DEBUG
                kmeans = sklearn.cluster.MiniBatchKMeans(
                    n_clusters=self._kmeans_k,
                    init_size=self._kmeans_k*3,
                    random_state=self._rand_seed,
                    verbose=kmeans_verbose,
                    compute_labels=False,
                )
                kmeans.fit(descriptors)
                codebook = kmeans.cluster_centers_
            with SimpleTimer("Saving generated codebook...", self._log.debug):
                numpy.save(self.codebook_filepath, codebook)
        else:
            self._log.info("Found existing codebook file.")
            codebook = numpy.load(self.codebook_filepath)

        # create FLANN index
        # - autotune will force select linear search if there are < 1000 words
        #   in the codebook vocabulary.
        pyflann.set_distance_type(self._flann_distance_metric)
        flann = pyflann.FLANN()
        if self._log.getEffectiveLevel() <= logging.DEBUG:
            log_level = 'info'
        else:
            log_level = 'warning'
        with SimpleTimer("Building FLANN index...", self._log.info):
            p = {
                "target_precision": self._flann_target_precision,
                "sample_fraction": self._flann_sample_fraction,
                "log_level": log_level,
            }
            if self._flann_autotune:
                p['algorithm'] = "autotuned"
            if self._rand_seed is not None:
                p['random_seed'] = self._rand_seed
            flann_params = flann.build_index(codebook, **p)
        with SimpleTimer("Saving FLANN index to file...", self._log.debug):
            # Save FLANN index data binary
            flann.save_index(self.flann_index_filepath)
            # Save out log of parameters
            with open(self.flann_params_filepath, 'w') as ofile:
                json.dump(flann_params, ofile, indent=4, sort_keys=True)

        # save generation results to class for immediate feature computation use
        self._codebook = codebook
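
Stripped of checkpointing and logging, the model-generation flow above is: cluster raw descriptors into a codebook with MiniBatchKMeans, build a FLANN index over that codebook, and persist both alongside the chosen build parameters. A rough, self-contained sketch of that flow; the sizes, distance type, and file names are illustrative assumptions rather than values used by the class:

import json
import numpy
import pyflann
import sklearn.cluster

descriptors = numpy.random.rand(5000, 128)   # stand-in for extracted features
k = 256                                      # illustrative codebook size

# Cluster descriptors into a codebook of k centroids.
kmeans = sklearn.cluster.MiniBatchKMeans(n_clusters=k, init_size=k * 3,
                                         compute_labels=False)
kmeans.fit(descriptors)
codebook = kmeans.cluster_centers_
numpy.save('codebook.npy', codebook)

# Build and persist a FLANN index over the codebook.
pyflann.set_distance_type('euclidean')
flann = pyflann.FLANN()
flann_params = flann.build_index(codebook, algorithm='autotuned',
                                 target_precision=0.95,
                                 sample_fraction=0.75,
                                 log_level='warning')
flann.save_index('codebook.flann')
with open('flann_params.json', 'w') as ofile:
    json.dump(flann_params, ofile, indent=4, sort_keys=True)
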
Example #23
    def _compute_descriptor(self, data):
        """
        Given some kind of data, process and return a feature vector as a Numpy
        array.

        :raises RuntimeError: Feature extraction failure of some kind.

        :param data: Some kind of input data for the feature descriptor. This is
            descriptor dependent.
        :type data: smqtk.data_rep.DataElement

        :return: Feature vector. This is a histogram of N bins where N is the
            number of centroids in the codebook. Bin values is percent
            composition, not absolute counts.
        :rtype: numpy.ndarray

        """
        super(ColorDescriptor_Base, self)._compute_descriptor(data)

        checkpoint_filepath = self._get_checkpoint_feature_file(data)
        # if osp.isfile(checkpoint_filepath):
        #     return numpy.load(checkpoint_filepath)

        if not self.has_model:
            raise RuntimeError("No model currently loaded! Check the existence "
                               "or, or generate, model files!\n"
                               "Codebook path: %s\n"
                               "FLANN Index path: %s"
                               % (self.codebook_filepath,
                                  self.flann_index_filepath))

        self.log.debug("Computing descriptors for data UID[%s]...", data.uuid())
        info, descriptors = self._generate_descriptor_matrices({data})

        if not self._use_sp:
            ###
            # Codebook Quantization
            #
            # - loaded the model at class initialization if we had one
            self.log.debug("Quantizing descriptors")
            pyflann.set_distance_type(self.FLANN_DISTANCE_FUNCTION)
            flann = pyflann.FLANN()
            flann.load_index(self.flann_index_filepath, self._codebook)
            try:
                idxs, dists = flann.nn_index(descriptors)
            except AssertionError:

                self.log.error("Codebook shape  : %s", self._codebook.shape)
                self.log.error("Descriptor shape: %s", descriptors.shape)

                raise

            # Create histogram
            # - Using explicit bin slots to prevent numpy from automatically
            #   creating tightly constrained bins. This would otherwise cause
            #   histograms between two inputs to be non-comparable (unaligned
            #   bins).
            # - See numpy note about ``bins`` to understand why the +1 is
            #   necessary
            # - Learned from spatial implementation that we could feed multiple
            #   neighbors per descriptor into here, leading to a more populated
            #   histogram.
            #   - Could also possibly weight things based on dist from
            #     descriptor?
            #: :type: numpy.core.multiarray.ndarray
            h = numpy.histogram(idxs,  # indices are all integers
                                bins=numpy.arange(self._codebook.shape[0]+1))[0]
            # self.log.debug("Quantization histogram: %s", h)
            # Normalize histogram into relative frequencies
            # - Not using /= on purpose. h is originally int32 coming out of
            #   histogram. /= would keep int32 type when we want it to be
            #   transformed into a float type by the division.
            if h.sum():
                # noinspection PyAugmentAssignment
                h = h / float(h.sum())
            else:
                h = numpy.zeros(h.shape, h.dtype)
            # self.log.debug("Normalized histogram: %s", h)

        else:
            ###
            # Spatial Pyramid Quantization
            #
            self.log.debug("Quantizing descriptors using spatial pyramid")
            ##
            # Quantization factor - number of nearest codes to be saved
            q_factor = 10
            ##
            # Concatenating spatial information onto descriptor vectors in the format:
            #   [ x y <descriptor> ]
            self.log.debug("Creating combined descriptor matrix")
            m = numpy.concatenate((info[:, :2],
                                   descriptors), axis=1)
            ##
            # Creating quantized vectors, each of the form:
            #   [ x y c_1 ... c_qf dist_1 ... dist_qf ]
            # for a total size of 2+(qf*2)
            #
            # Sangmin's code included the distances in the quantized vector, but
            # then also passed this vector into numpy's histogram function with
            # integral bins, causing the [0, 1] bins to be heavily populated,
            # which doesn't make sense to do.
            #   idxs, dists = flann.nn_index(m[:, 2:], q_factor)
            #   q = numpy.concatenate([m[:, :2], idxs, dists], axis=1)
            self.log.debug("Computing nearest neighbors")
            pyflann.set_distance_type(self.FLANN_DISTANCE_FUNCTION)
            flann = pyflann.FLANN()
            flann.load_index(self.flann_index_filepath, self._codebook)
            idxs = flann.nn_index(m[:, 2:], q_factor)[0]
            self.log.debug("Creating quantization matrix")
            q = numpy.concatenate([m[:, :2], idxs], axis=1)
            ##
            # Build spatial pyramid from quantized matrix
            self.log.debug("Building spatial pyramid histograms")
            hist_sp = self._build_sp_hist(q, self._codebook.shape[0])
            ##
            # Combine the region histograms into a single vector
            self.log.debug("Combining global+thirds into final histogram.")
            f = sys.float_info.min  # so we don't accidentally divide by 0

            def rf_norm(hist):
                return hist / (float(hist.sum()) + f)
            h = numpy.concatenate([rf_norm(hist_sp[0]),
                                   rf_norm(hist_sp[5]),
                                   rf_norm(hist_sp[6]),
                                   rf_norm(hist_sp[7])],
                                  axis=1)
            # noinspection PyAugmentAssignment
            h /= h.sum()

        self.log.debug("Saving checkpoint feature file")
        if not osp.isdir(osp.dirname(checkpoint_filepath)):
            safe_create_dir(osp.dirname(checkpoint_filepath))
        numpy.save(checkpoint_filepath, h)

        return h
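
The non-spatial branch above boils down to: find the nearest codebook index for each descriptor, histogram those indices over fixed bins, and normalize to relative frequencies. A minimal sketch of just that flow, with random arrays standing in for a real codebook and real colorDescriptor output:

import numpy
import pyflann

codebook = numpy.random.rand(256, 128)     # hypothetical: 256 codes, 128-D
descriptors = numpy.random.rand(1000, 128)

pyflann.set_distance_type('euclidean')
flann = pyflann.FLANN()
flann.build_index(codebook)

# Nearest code index per descriptor.
idxs, dists = flann.nn_index(descriptors)

# Fixed bin edges (0..N) keep histograms from different inputs aligned.
h = numpy.histogram(idxs, bins=numpy.arange(codebook.shape[0] + 1))[0]

# Relative frequencies; guard against an all-zero histogram.
h = h / float(h.sum()) if h.sum() else numpy.zeros(h.shape, float)
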
Example #24
    def _build_index(self, descriptors):
        """
        Internal method to be implemented by sub-classes to build the index
        with the given descriptor data elements.

        Subsequent calls to this method should rebuild the current index.  This
        method shall not add to the existing index nor raise an exception so as
        to protect the current index.

        Implementation Notes:
            - We keep a cache file serialization around for our index in case
                sub-processing occurs so as to be able to recover from the
                underlying C data not being there. This could cause issues if
                a main or child process rebuilds the index, as we clear the
                old cache away.

        :param descriptors: Iterable of descriptor elements to build index
            over.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        """
        with self._model_lock:
            # Not caring about restoring the index because we're just making a
            # new one.
            self._log.info("Building new FLANN index")

            self._log.debug("Caching descriptor elements")
            self._descr_cache = list(descriptors)
            # Cache descriptors if we have an element
            if self._descr_cache_elem and self._descr_cache_elem.writable():
                self._log.debug("Caching descriptors: %s",
                                self._descr_cache_elem)
                self._descr_cache_elem.set_bytes(
                    cPickle.dumps(self._descr_cache, -1)
                )

            params = {
                "target_precision": self._build_target_precision,
                "sample_fraction": self._build_sample_frac,
                "log_level": ("info"
                              if self._log.getEffectiveLevel() <= logging.DEBUG
                              else "warning")
            }
            if self._build_autotune:
                params['algorithm'] = "autotuned"
            if self._rand_seed is not None:
                params['random_seed'] = self._rand_seed
            pyflann.set_distance_type(self._distance_method)

            self._log.debug("Accumulating descriptor vectors into matrix for "
                            "FLANN")
            pts_array = elements_to_matrix(self._descr_cache,
                                           report_interval=1.0)

            self._log.debug('Building FLANN index')
            self._flann = pyflann.FLANN()
            self._flann_build_params = self._flann.build_index(pts_array,
                                                               **params)
            del pts_array

            if self._index_elem and self._index_elem.writable():
                self._log.debug("Caching index: %s", self._index_elem)
                # FLANN wants to write to a file, so make a temp file, then
                # read it in, putting bytes into element.
                fd, fp = tempfile.mkstemp()
                try:
                    self._flann.save_index(fp)
                    # Use the file descriptor to create the file object.
                    # This avoids reopening the file and will automatically
                    # close the file descriptor on exiting the with block.
                    # fdopen() is required because in Python 2 open() does
                    # not accept a file descriptor.
                    with os.fdopen(fd, 'rb') as f:
                        self._index_elem.set_bytes(f.read())
                finally:
                    os.remove(fp)
            if self._index_param_elem and self._index_param_elem.writable():
                self._log.debug("Caching index params: %s",
                                self._index_param_elem)
                state = {
                    'b_autotune': self._build_autotune,
                    'b_target_precision': self._build_target_precision,
                    'b_sample_frac': self._build_sample_frac,
                    'distance_method': self._distance_method,
                    'flann_build_params': self._flann_build_params,
                }
                self._index_param_elem.set_bytes(cPickle.dumps(state, -1))

            self._pid = multiprocessing.current_process().pid
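
FLANN only saves and loads an index through a file path, which is why the caching above goes through a temporary file. A hedged sketch of the full round trip; ``index_bytes`` stands in for whatever byte store the data element provides:

import os
import tempfile
import numpy
import pyflann

pts = numpy.random.rand(1000, 64)
flann = pyflann.FLANN()
flann.build_index(pts)

# Save: write the index to a temp file, slurp the bytes, discard the file.
fd, fp = tempfile.mkstemp()
try:
    flann.save_index(fp)
    with os.fdopen(fd, 'rb') as f:
        index_bytes = f.read()
finally:
    os.remove(fp)

# Restore: dump the bytes back to disk and reload against the same points.
fd, fp = tempfile.mkstemp()
try:
    with os.fdopen(fd, 'wb') as f:
        f.write(index_bytes)
    flann2 = pyflann.FLANN()
    flann2.load_index(fp, pts)
finally:
    os.remove(fp)
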
Example #25
    def generate_model(self, data_set, **kwargs):
        """
        Generate this feature detector's data-model given a file ingest. This
        saves the generated model to the currently configured data directory.

        For colorDescriptor, we generate raw features over the ingest data,
        compute a codebook via kmeans, and then create an index with FLANN via
        the "autotune" algorithm to intelligently pick the fastest indexing
        method.

        :param num_elements: Number of data elements in the iterator
        :type num_elements: int

        :param data_set: Set of input data elements to generate the model
            with.
        :type data_set: collections.Set[smqtk.data_rep.DataElement]

        """
        super(ColorDescriptor_Base, self).generate_model(data_set, **kwargs)

        if self.has_model:
            self.log.warn("ColorDescriptor model for descriptor type '%s' "
                          "already generated!", self.descriptor_type())
            return

        pyflann.set_distance_type(self.FLANN_DISTANCE_FUNCTION)
        flann = pyflann.FLANN()

        if not osp.isfile(self.codebook_filepath):
            self.log.info("Did not find existing ColorDescriptor codebook for "
                          "descriptor '%s'.", self.descriptor_type())

            # generate descriptors
            with SimpleTimer("Generating descriptor matrices...",
                             self.log.info):
                descriptors_checkpoint = osp.join(self._work_dir,
                                                  "model_descriptors.npy")

                if osp.isfile(descriptors_checkpoint):
                    self.log.debug("Found existing computed descriptors work "
                                   "file for model generation.")
                    descriptors = numpy.load(descriptors_checkpoint)
                else:
                    self.log.debug("Computing model descriptors")
                    _, descriptors = \
                        self._generate_descriptor_matrices(
                            data_set,
                            limit=self.CODEBOOK_DESCRIPTOR_LIMIT
                        )
                    _, tmp = tempfile.mkstemp(dir=self._work_dir,
                                              suffix='.npy')
                    self.log.debug("Saving model-gen info/descriptor matrix")
                    numpy.save(tmp, descriptors)
                    os.rename(tmp, descriptors_checkpoint)

            # Compute centroids (codebook) with kmeans
            with SimpleTimer("Computing sklearn.cluster.MiniBatchKMeans...",
                             self.log.info):
                kmeans_verbose = self.log.getEffectiveLevel() <= logging.DEBUG
                kmeans = sklearn.cluster.MiniBatchKMeans(
                    n_clusters=self._kmeans_k,
                    init_size=self._kmeans_k*3,
                    random_state=self._rand_seed,
                    verbose=kmeans_verbose,
                    compute_labels=False,
                )
                kmeans.fit(descriptors)
                codebook = kmeans.cluster_centers_
            with SimpleTimer("Saving generated codebook...", self.log.debug):
                numpy.save(self.codebook_filepath, codebook)
        else:
            self.log.info("Found existing codebook file.")
            codebook = numpy.load(self.codebook_filepath)

        # create FLANN index
        # - autotune will force select linear search if there are < 1000 words
        #   in the codebook vocabulary.
        if self.log.getEffectiveLevel() <= logging.DEBUG:
            log_level = 'info'
        else:
            log_level = 'warning'
        with SimpleTimer("Building FLANN index...", self.log.info):
            p = {
                "target_precision": self._flann_target_precision,
                "sample_fraction": self._flann_sample_fraction,
                "log_level": log_level,
                "algorithm": "autotuned"
            }
            if self._rand_seed is not None:
                p['random_seed'] = self._rand_seed
            flann_params = flann.build_index(codebook, **p)
        with SimpleTimer("Saving FLANN index to file...", self.log.debug):
            # Save FLANN index data binary
            flann.save_index(self.flann_index_filepath)
            # Save out log of parameters
            with open(self.flann_params_filepath, 'w') as ofile:
                json.dump(flann_params, ofile, indent=4, sort_keys=True)

        # save generation results to class for immediate feature computation use
        self._codebook = codebook
Example #26
    index_gensim.num_best = TOP_N
    logger.info("finished gensim index %s" % index_gensim)

    logger.info("loading mapping between article titles and ids")
    id2title = gensim.utils.unpickle(os.path.join(indir, 'id2title'))
    title2id = dict((title.lower(), pos) for pos, title in enumerate(id2title))
    # print_similar('Anarchism', index_gensim, id2title, title2id)

    if 'gensim' in program:
        # log_precision(gensim_predictions, index_gensim, queries, index_gensim)
        gensim_at_once(index_gensim, queries)
        gensim_1by1(index_gensim, queries)

    if 'flann' in program:
        import pyflann
        pyflann.set_distance_type('euclidean')
        index_flann = pyflann.FLANN()
        flann_fname = sim_prefix + "_flann_%s" % ACC
        if os.path.exists(flann_fname):
            logger.info("loading flann index")
            index_flann.load_index(flann_fname, clipped)
        else:
            logger.info("building FLANN index")
            # flann expects index vectors as a 2d numpy array, features = columns
            params = index_flann.build_index(clipped, **ACC_SETTINGS['flann'][ACC])
            logger.info("built flann index with %s" % params)
            index_flann.save_index(flann_fname)
        logger.info("finished FLANN index")

        log_precision(flann_predictions, index_flann, queries, index_gensim)
        flann_1by1(index_flann, queries)
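
The query helpers referenced above (``flann_predictions``, ``flann_1by1``) are not shown; in general, querying a built or loaded FLANN index is a single ``nn_index`` call. A small illustrative sketch with placeholder sizes and TOP_N:

import numpy
import pyflann

TOP_N = 10
vectors = numpy.random.rand(10000, 200).astype(numpy.float32)  # stand-in corpus
queries = numpy.random.rand(5, 200).astype(numpy.float32)

pyflann.set_distance_type('euclidean')
index_flann = pyflann.FLANN()
index_flann.build_index(vectors)

# Batched query: all query vectors in one call.
idxs, dists = index_flann.nn_index(queries, num_neighbors=TOP_N)
for qi in range(len(queries)):
    print("query %d -> nearest ids %s" % (qi, idxs[qi][:3]))
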
Example #27
import sys
import numpy
#import pylab
import marsyas
import marsyas_util
import pdb
import pyflann
from pyflann import *
from numpy import *
#from numpy.random import *
import cPickle
import random

#PLOT = True
PLOT = False
pyflann.set_distance_type('kl')
flann = FLANN()
topn = 20
buffsize = 512

#texture = ["Rms/rms", "AubioYin/pitcher","ZeroCrossings/zcrs" ,"Series/lspbranch" ,"Series/lpccbranch" ,"MFCC/mfcc" ,"SCF/scf" ,"Rolloff/rf" ,"Flux/flux" ,"Centroid/cntrd" ,"Series/chromaPrSeries"]
texture = ["Rms/rms", "AubioYin/pitcher","ZeroCrossings/zcrs" ,"Rolloff/rf" ,"Flux/flux" ,"Centroid/cntrd","AbsMax/abs","Energy/energy"]

#"AimGammatone/aimgamma"]
detectors = ["Fanout/detectors", texture]

grainuri = "RealvecGrainSource/real_src"


class Slice:
    stats = []
Example #28
    def __init__(self, Xin, NNtype='knn', use_flann=False, center=True,
                 rescale=True, k=10, sigma=None, epsilon=0.01,
                 plotting={}, symmetrize_type='average', dist_type='euclidean',
                 order=0, **kwargs):

        self.Xin = Xin
        self.NNtype = NNtype
        self.use_flann = use_flann
        self.center = center
        self.rescale = rescale
        self.k = k
        self.sigma = sigma
        self.epsilon = epsilon
        self.symmetrize_type = symmetrize_type
        self.dist_type = dist_type
        self.order = order

        N, d = np.shape(self.Xin)
        Xout = self.Xin

        if k >= N:
            raise ValueError('The number of neighbors (k={}) must be smaller '
                             'than the number of nodes ({}).'.format(k, N))

        if self.center:
            Xout = self.Xin - np.kron(np.ones((N, 1)),
                                      np.mean(self.Xin, axis=0))

        if self.rescale:
            bounding_radius = 0.5 * np.linalg.norm(np.amax(Xout, axis=0) -
                                                   np.amin(Xout, axis=0), 2)
            scale = np.power(N, 1. / float(min(d, 3))) / 10.
            Xout *= scale / bounding_radius

        # Translate distance type string to corresponding Minkowski order.
        dist_translation = {"euclidean": 2,
                            "manhattan": 1,
                            "max_dist": np.inf,
                            "minkowski": order
                            }

        if self.NNtype == 'knn':
            spi = np.zeros((N * k))
            spj = np.zeros((N * k))
            spv = np.zeros((N * k))

            if self.use_flann:
                pfl = _import_pfl()
                pfl.set_distance_type(dist_type, order=order)
                flann = pfl.FLANN()

                # Default FLANN parameters (I tried changing the algorithm and
                # testing performance on huge matrices, but the default one
                # seems to work best).
                NN, D = flann.nn(Xout, Xout, num_neighbors=(k + 1),
                                 algorithm='kdtree')

            else:
                kdt = spatial.KDTree(Xout)
                D, NN = kdt.query(Xout, k=(k + 1),
                                  p=dist_translation[dist_type])

            if self.sigma is None:
                self.sigma = np.mean(D[:, 1:])  # Discard distance to self.

            for i in range(N):
                spi[i * k:(i + 1) * k] = np.kron(np.ones((k)), i)
                spj[i * k:(i + 1) * k] = NN[i, 1:]
                spv[i * k:(i + 1) * k] = np.exp(-np.power(D[i, 1:], 2) /
                                                float(self.sigma))

        elif self.NNtype == 'radius':

            kdt = spatial.KDTree(Xout)
            D, NN = kdt.query(Xout, k=None, distance_upper_bound=epsilon,
                              p=dist_translation[dist_type])
            if self.sigma is None:
                # Discard distance to self.
                self.sigma = np.mean([np.mean(d[1:]) for d in D])
            count = 0
            for i in range(N):
                count = count + len(NN[i])

            spi = np.zeros((count))
            spj = np.zeros((count))
            spv = np.zeros((count))

            start = 0
            for i in range(N):
                leng = len(NN[i]) - 1
                spi[start:start + leng] = np.kron(np.ones((leng)), i)
                spj[start:start + leng] = NN[i][1:]
                spv[start:start + leng] = np.exp(-np.power(D[i][1:], 2) /
                                                 float(self.sigma))
                start = start + leng

        else:
            raise ValueError('Unknown NNtype {}'.format(self.NNtype))

        W = sparse.csc_matrix((spv, (spi, spj)), shape=(N, N))

        # Sanity check
        if np.shape(W)[0] != np.shape(W)[1]:
            raise ValueError('Weight matrix W is not square')

        # Enforce symmetry. Note that checking symmetry with
        # np.abs(W - W.T).sum() is as costly as the symmetrization itself.
        W = utils.symmetrize(W, method=symmetrize_type)

        super(NNGraph, self).__init__(W, plotting=plotting,
                                      coords=Xout, **kwargs)
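
The kNN branch above fills the sparse triplets (spi, spj, spv) in a Python loop. The same Gaussian-kernel weight assembly can be written in vectorized form; the sketch below assumes ``NN`` and ``D`` are the (N, k+1) neighbor-index and distance arrays computed above, with column 0 holding each point's match to itself:

import numpy as np
from scipy import sparse

def knn_weights(NN, D, sigma=None):
    """Vectorized Gaussian kNN weight matrix from neighbor indices/distances."""
    N, kp1 = NN.shape
    k = kp1 - 1
    if sigma is None:
        sigma = np.mean(D[:, 1:])                 # discard distance to self
    spi = np.repeat(np.arange(N), k)              # row index of each edge
    spj = NN[:, 1:].ravel()                       # neighbor (column) index
    spv = np.exp(-np.power(D[:, 1:], 2) / float(sigma)).ravel()
    return sparse.csc_matrix((spv, (spi, spj)), shape=(N, N))
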