def _load_flann_model(self):
    """
    Load cached descriptors, build/state parameters and the FLANN index from
    their cache elements, then record the current process ID.

    Each piece is only loaded when its cache element is set and reports
    itself as non-empty. The descriptor cache is additionally only loaded
    when no descriptors are currently held in memory.

    NOTE: cached content is deserialized with ``cPickle``; the cache
    elements are expected to hold data written by this class.
    """
    # Guard against an unset descriptor cache element the same way the
    # parameter and index elements are guarded below.
    if (not self._descr_cache and self._descr_cache_elem
            and not self._descr_cache_elem.is_empty()):
        # Load descriptor cache
        # - is copied on fork, so only need to load here.
        self._log.debug("Loading cached descriptors")
        self._descr_cache = \
            cPickle.loads(self._descr_cache_elem.get_bytes())

    # Params pickle include the build params + our local state params
    if self._index_param_elem and not self._index_param_elem.is_empty():
        state = cPickle.loads(self._index_param_elem.get_bytes())
        self._build_autotune = state['b_autotune']
        self._build_target_precision = state['b_target_precision']
        self._build_sample_frac = state['b_sample_frac']
        self._distance_method = state['distance_method']
        self._flann_build_params = state['flann_build_params']

    # Load the binary index
    if self._index_elem and not self._index_elem.is_empty():
        # make numpy matrix of descriptor vectors for FLANN
        pts_array = [d.vector() for d in self._descr_cache]
        pts_array = numpy.array(pts_array, dtype=pts_array[0].dtype)
        pyflann.set_distance_type(self._distance_method)
        self._flann = pyflann.FLANN()
        tmp_fp = self._index_elem.write_temp()
        try:
            self._flann.load_index(tmp_fp, pts_array)
        finally:
            # Always release the temporary file, even if loading fails.
            self._index_elem.clean_temp()
        del pts_array, tmp_fp

    # Set current PID to the current
    self._pid = multiprocessing.current_process().pid
def load_index(self, dir_path):
    """
    Load a saved index state based on the current configuration.

    The state pickle and the FLANN index file are both looked up inside
    ``dir_path`` using the configured file names (``self._sf_state`` and
    ``self._sf_flann_index``).

    :raises SimilarityIndexStateLoadError: Could not load index state
        because one or both save files are missing from ``dir_path``.

    :param dir_path: Path to the directory to load the index from.
    :type dir_path: str

    """
    self._restore_index()

    dir_path = osp.abspath(osp.expanduser(dir_path))
    index_fp = osp.join(dir_path, self._sf_flann_index)
    state_fp = osp.join(dir_path, self._sf_state)

    # Check for the files where they are actually read from, not relative
    # to the current working directory.
    if not (osp.isfile(index_fp) and osp.isfile(state_fp)):
        raise SimilarityIndexStateLoadError("In complete index save state")

    with open(state_fp, 'rb') as f:
        state = cPickle.load(f)
    self._distance_method = state['distance_method']
    self._rand_seed = state['rand_seed']
    self._descr_cache = state['descr_cache']
    self._flann_build_params = state['flann_params']

    # make numpy matrix of descriptor vectors for FLANN
    pts_array = [d.vector() for d in self._descr_cache]
    pts_array = numpy.array(pts_array, dtype=pts_array[0].dtype)
    pyflann.set_distance_type(self._distance_method)
    self._flann = pyflann.FLANN()
    self._flann.load_index(index_fp, pts_array)
def _load_flann_model(self):
    """
    Restore in-memory state from the configured cache file paths.

    Descriptors are read only when none are held yet and a descriptor cache
    path is configured. Parameters and the FLANN index are read whenever
    their respective paths are configured. The current process ID is always
    recorded at the end.
    """
    no_descriptors = not self._descr_cache
    if no_descriptors and self._descr_cache_filepath:
        # The descriptor cache is copied on fork, so this is the only place
        # it needs loading.
        self._log.debug("Loading cached descriptors")
        with open(self._descr_cache_filepath, "rb") as cache_file:
            self._descr_cache = cPickle.load(cache_file)

    # The parameter pickle carries the build parameters plus local state.
    if self._index_param_filepath:
        with open(self._index_param_filepath) as param_file:
            saved = cPickle.load(param_file)
        for attr_name, key in (
                ("_build_autotune", "b_autotune"),
                ("_build_target_precision", "b_target_precision"),
                ("_build_sample_frac", "b_sample_frac"),
                ("_distance_method", "distance_method"),
                ("_flann_build_params", "flann_build_params"),
        ):
            setattr(self, attr_name, saved[key])

    # The binary index needs the descriptor vectors stacked into a matrix.
    if self._index_filepath:
        vectors = [elem.vector() for elem in self._descr_cache]
        matrix = numpy.array(vectors, dtype=vectors[0].dtype)
        pyflann.set_distance_type(self._distance_method)
        self._flann = pyflann.FLANN()
        self._flann.load_index(self._index_filepath, matrix)

    # Remember which process loaded the model.
    self._pid = multiprocessing.current_process().pid
def build_geom_neighbor_graph(geoms, n_neighbors):
    """
    Build the sparse CSR geometrical adjacency matrix.

    Parameters
    ----------
    geoms: (n_pts, d) array, the geometrical info
    n_neighbors: int, number of neighbors to look up per point

    Returns
    -------
    gadj: (n_pts, n_pts) sparse CSR array, the adjacency matrix
        gadj[i,j] == 1 iff i and j are geometrical neighbors

    Notes
    -----
    gadj might not be symmetric!
    """
    num_points = geoms.shape[0]

    pyflann.set_distance_type('euclidean')  # squared euclidean actually
    searcher = pyflann.FLANN()
    neighbor_idx, _ = searcher.nn(
        geoms, geoms, algorithm='kdtree', num_neighbors=n_neighbors)

    # Every row holds exactly ``n_neighbors`` entries, each of value one.
    ones = np.ones((num_points, n_neighbors), dtype='u1')
    row_starts = np.arange(
        0, num_points * n_neighbors + 1, n_neighbors, dtype=int)

    return sparse.csr_matrix(
        (ones.ravel(), neighbor_idx.ravel(), row_starts),
        shape=(num_points, num_points))
def rank_model(self, pos_ids, neg_ids=()):
    """
    Rank the current model, returning a mapping of element IDs to a ranking
    valuation in the range [0, 1], where 1.0 is the highest rank and 0.0 is
    the lowest rank.

    :raises RuntimeError: No current model. See implementation for other
        possible RuntimeError causes.

    :param pos_ids: List of positive data IDs
    :type pos_ids: collections.Iterable of int

    :param neg_ids: List of negative data IDs
    :type neg_ids: collections.Iterable of int

    :return: Mapping of ingest ID to a rank.
    :rtype: dict of (int, float)

    """
    super(NearestNeighbor_HIK_Base, self).rank_model(pos_ids, neg_ids)

    self.log.debug("ND_HIK source exemplars:\n"
                   "Pos: %s\n"
                   "Neg: %s",
                   pos_ids, neg_ids)

    # TODO: add auto-negative selection?

    # Index the whole feature matrix with the chi squared distance.
    pyflann.set_distance_type('cs')
    index = pyflann.FLANN()
    index.build_index(self._feature_mat, log_level="info")

    def distances_from_centroid(uids):
        # Average the feature rows of the given UIDs, then query distances
        # to every indexed row, returned ordered by row index.
        rows = self._feature_mat[[self._uid_idx_map[uid] for uid in uids]]
        centroid = rows.sum(axis=0) / float(len(uids))
        nn_idxs, nn_dists = index.nn_index(centroid,
                                           self._feature_mat.shape[0])
        by_index = sorted(zip(nn_idxs[0], nn_dists[0]),
                          key=lambda pair: pair[0])
        return numpy.array([pair[1] for pair in by_index])

    idx_rank = distances_from_centroid(pos_ids)
    if neg_ids:
        idx_rank = idx_rank / distances_from_centroid(neg_ids)

    # Constrain to [0,1] range and associate to UIDs
    idx_rank = 1.0 - (idx_rank / idx_rank.max())
    return dict(zip(self._uid_array, idx_rank))
def __init__(self, update_frequency, send_socket, video_source_name,
             distance_type='euclidean', vision_window_name='VISION',
             detection_window_name='DETECTION', surf_params=(0, 300, 3, 4),
             filter=False):
    """
    Set up the SURF detection vision module.

    Stores the given configuration, creates a video memory reader for
    ``video_source_name``, sets the pyflann distance type to
    ``distance_type`` and initializes the empty model containers and
    ratio/angle limits.
    """
    self.logger = logging.getLogger("Borg.Brain.Vision.SurfDetect")
    self.update_frequency = update_frequency

    # SURF configuration; the octave count is the third parameter.
    self.surf_params = surf_params
    self.n_octaves = surf_params[2]

    # Containers for the trained models, initially empty.
    self.names, self.keypoints, self.descriptors = [], [], []
    self.sizes, self.images = {}, {}
    self.displaynames, self.modelKeypoints = {}, {}

    self.destination = 'default'
    self.vid_mem_reader = util.vidmemreader.VidMemReader([video_source_name])
    self.send_socket = send_socket
    pyflann.set_distance_type(distance_type)

    self.vision_window = vision_window_name
    self.detection_window = detection_window_name
    self.imageSize = ()

    # If False SURF detections won't be filtered by get_ratio()
    self.filter = filter
    self.reasonable_ratio = True
    self.min_ratio, self.max_ratio = .05, 3
    self.min_angle, self.max_angle = math.radians(70), math.radians(110)

    # Object with the update() function to be called
    # (TODO: Cleanup this code):
    self.update_object = None
def build_index(self, index_filename=None):
    """
    Create ``self.flann`` over ``self.hists_reduced``.

    When ``index_filename`` is given the index is loaded from that file;
    otherwise an autotuned index is built, and the resulting parameters are
    stored on ``self.params`` and printed. The call is timed with ``tt``.

    :param index_filename: Optional path of a saved FLANN index to load.
    :return: This instance.
    """
    tt.tic('build_index')
    pyflann.set_distance_type(self.distance_metric)
    self.flann = pyflann.FLANN()

    if not index_filename:
        autotune_options = dict(
            algorithm='autotuned',
            sample_fraction=0.3,
            target_precision=.8,
            build_weight=0.01,
            memory_weight=0.,
        )
        self.params = self.flann.build_index(self.hists_reduced,
                                             **autotune_options)
        print(self.params)
    else:
        self.flann.load_index(index_filename, self.hists_reduced)

    tt.toc('build_index')
    return self
def _restore_index(self):
    """
    Reload the FLANN index from its cache file when running in a different
    process than the one recorded in ``self._pid``.

    Nothing happens when no index cache path is set, when that path is not
    an existing file, or when the recorded PID matches the current process.
    """
    cache_path = self._flann_index_cache
    if not (cache_path and os.path.isfile(cache_path)):
        return
    if self._pid == multiprocessing.current_process().pid:
        return

    pyflann.set_distance_type(self._distance_method)
    self._flann = pyflann.FLANN()

    # FLANN needs the original points alongside the saved index.
    vectors = [elem.vector() for elem in self._descr_cache]
    matrix = numpy.array(vectors, dtype=vectors[0].dtype)
    self._flann.load_index(cache_path, matrix)

    self._pid = multiprocessing.current_process().pid
def __init__(self, C, Q, cell, outfile, barrier=None):
    """
    Initialize the thread from a context ``C``, a query ``Q`` and one cell
    name or a list of cell names.

    Cell names are resolved against ``C.dbdir``. The pyflann distance type
    is set from ``C.params['distance_type']`` and a descriptor reader is
    obtained for ``C.params['descriptor']``.
    """
    assert len(C.params) == len(PARAMS_DEFAULT)
    threading.Thread.__init__(self)

    self.qpath = Q.siftpath
    # A list of cells yields a list of paths; anything else a single path.
    self.cellpath = (
        [os.path.join(C.dbdir, name) for name in cell]
        if type(cell) is list
        else os.path.join(C.dbdir, cell)
    )

    self.infodir, self.celldir = C.infodir, C.dbdir
    self.outfile = outfile
    self.params, self.criteria = C.params, C.criteria
    self.barrier = barrier
    self.dump = self.outfile + ('-detailed%s.npy' % DETAIL_VERSION)

    pyflann.set_distance_type(C.params['distance_type'])
    self.reader = reader.get_reader(C.params['descriptor'])
def _load_flann_model(self):
    """
    Restore build/state parameters and the FLANN index from the configured
    parameter and index file paths, then record the current process ID.

    The descriptors already held in ``self._descr_cache`` supply the point
    matrix FLANN needs to load the index.
    """
    # The parameter pickle carries the build parameters plus local state.
    with open(self._index_param_filepath) as param_file:
        saved = cPickle.load(param_file)
    for attr_name, key in (
            ('_build_autotune', 'b_autotune'),
            ('_build_target_precision', 'b_target_precision'),
            ('_build_sample_frac', 'b_sample_frac'),
            ('_distance_method', 'distance_method'),
            ('_flann_build_params', 'flann_build_params'),
    ):
        setattr(self, attr_name, saved[key])

    # Stack the descriptor vectors into a matrix for FLANN.
    vectors = [elem.vector() for elem in self._descr_cache]
    matrix = numpy.array(vectors, dtype=vectors[0].dtype)

    pyflann.set_distance_type(self._distance_method)
    self._flann = pyflann.FLANN()
    self._flann.load_index(self._index_filepath, matrix)

    # Remember which process loaded the model.
    self._pid = multiprocessing.current_process().pid
def _str_to_bool(value):
    """Interpret a command-line value as a boolean; only common "false" spellings are False."""
    return str(value).strip().lower() not in (
        "", "0", "false", "no", "off", "n", "f")


def parse_args():
    """
    Parse the command line and return the application settings.

    Side effect: sets the pyflann distance type from ``--distance``.

    ``--csound`` and ``--learn`` take a value. Values such as ``False``,
    ``0``, ``no`` or ``off`` (case-insensitive) now yield ``False``; any
    other value yields ``True``. Previously every supplied string,
    including ``"False"``, was truthy.

    :return: Settings dictionary with keys ``files``, ``window_name``,
        ``csound``, ``state`` and ``buffsize``. ``state`` holds
        ``maxgrains``, ``mingrains``, ``amp``, ``topn``, ``delay`` and
        ``learning``.
    :rtype: dict
    """
    parser = argparse.ArgumentParser(description='Mostitch!')
    parser.add_argument('--buffsize', default=1024, help='Buffer Size')
    parser.add_argument('--csound', default=False, type=_str_to_bool,
                        help='Print Csound Stuff')
    parser.add_argument('--distance', default='euclidean',
                        help='What Distance type to use: euclidean kl '
                             'manhattan minkowski hik hellinger cs')
    parser.add_argument('--learn', default=False, type=_str_to_bool,
                        help='Turn Learning on or Off')
    parser.add_argument('--window', default="hann",
                        help='Which window function to use: hann saw flat '
                             'triangle')
    parser.add_argument('--mingrains', default=10,
                        help='Minimum number of grains')
    parser.add_argument('--maxgrains', default=100,
                        help='Maximum number of grains')
    parser.add_argument('--topn', default=20, help='Top N from NN')
    parser.add_argument('files', help='Filenames', nargs='+')
    args = parser.parse_args()

    buffsize = int(args.buffsize)
    pyflann.set_distance_type(args.distance)

    state = {
        "maxgrains": int(args.maxgrains),
        "mingrains": int(args.mingrains),
        "amp": 0.2,
        "topn": int(args.topn),
        # Delay is three buffers long.
        "delay": 3 * buffsize,
        "learning": args.learn,
    }
    settings = {
        "files": args.files,
        "window_name": args.window,
        "csound": args.csound,
        "state": state,
        "buffsize": buffsize,
    }
    return settings
def build_index(self, descriptors):
    """
    Build the index over the descriptors data elements.

    Subsequent calls to this method should rebuild the index, not add to it.

    Implementation Notes:
        - We keep a cache file serialization around for our index in case
          sub-processing occurs so as to be able to recover from the
          underlying C data not being there. This could cause issues if a
          main or child process rebuild's the index, as we clear the old
          cache away.
        - Descriptors, the FLANN index and the build/state parameters are
          each written to disk only when their file path is configured.

    :raises ValueError: No data available in the given iterable.

    :param descriptors: Iterable of descriptors elements to build index over.
    :type descriptors: collections.Iterable[smqtk.representation.DescriptorElement]

    """
    # Not caring about restoring the index because we're just making a new
    # one
    self._log.info("Building new FLANN index")

    self._log.debug("Storing descriptors")
    self._descr_cache = list(descriptors)
    if not self._descr_cache:
        raise ValueError("No data provided in given iterable.")
    # Cache descriptors if we have a path
    if self._descr_cache_filepath:
        self._log.debug("Caching descriptors: %s",
                        self._descr_cache_filepath)
        safe_create_dir(osp.dirname(self._descr_cache_filepath))
        with open(self._descr_cache_filepath, "wb") as f:
            cPickle.dump(self._descr_cache, f)

    params = {
        "target_precision": self._build_target_precision,
        "sample_fraction": self._build_sample_frac,
        "log_level": ("info"
                      if self._log.getEffectiveLevel() <= logging.DEBUG
                      else "warning"),
    }
    if self._build_autotune:
        params["algorithm"] = "autotuned"
    if self._rand_seed is not None:
        params["random_seed"] = self._rand_seed
    pyflann.set_distance_type(self._distance_method)

    self._log.debug("Accumulating descriptor vectors into matrix for FLANN")
    pts_array = [d.vector() for d in self._descr_cache]
    pts_array = numpy.array(pts_array, dtype=pts_array[0].dtype)
    self._flann = pyflann.FLANN()
    self._flann_build_params = self._flann.build_index(pts_array, **params)
    del pts_array

    self._log.debug("Caching index and state: %s, %s",
                    self._index_filepath, self._index_param_filepath)
    if self._index_filepath:
        self._log.debug("Caching index: %s", self._index_filepath)
        safe_create_dir(osp.dirname(self._index_filepath))
        self._flann.save_index(self._index_filepath)
    if self._index_param_filepath:
        self._log.debug("Caching index params: %s",
                        self._index_param_filepath)
        state = {
            "b_autotune": self._build_autotune,
            "b_target_precision": self._build_target_precision,
            "b_sample_frac": self._build_sample_frac,
            "distance_method": self._distance_method,
            "flann_build_params": self._flann_build_params,
        }
        safe_create_dir(osp.dirname(self._index_param_filepath))
        # Pickle data is bytes: write in binary mode, as is done for the
        # descriptor cache above.
        with open(self._index_param_filepath, "wb") as f:
            cPickle.dump(state, f)

    # Record which process built the index.
    self._pid = multiprocessing.current_process().pid
def generate_model(self, data_set):
    """
    Generate this feature detector's data-model given a file ingest. This
    saves the generated model to the currently configured data directory.

    For colorDescriptor, we generate raw features over the ingest data,
    compute a codebook via kmeans, and then create an index with FLANN via
    the "autotune" or linear algorithm to intelligently pick the fastest
    indexing method.

    Returns immediately, after logging a warning, when a model already
    exists. An existing codebook file or descriptor checkpoint file is
    reused instead of being recomputed.

    :raises ValueError: One or more elements of ``data_set`` have a content
        type that is not among ``self.valid_content_types()``.

    :param data_set: Set of input data elements to generate the model with.
    :type data_set: collections.Set[smqtk.representation.DataElement]

    """
    if self.has_model:
        self._log.warning("ColorDescriptor model for descriptor type '%s' "
                          "already generated!", self.descriptor_type())
        return

    # Check that input data is valid for processing through colorDescriptor
    valid_types = self.valid_content_types()
    invalid_types_found = set()
    for di in data_set:
        if di.content_type() not in valid_types:
            invalid_types_found.add(di.content_type())
    if invalid_types_found:
        self._log.error("Found one or more invalid content types among "
                        "input:")
        for t in sorted(invalid_types_found):
            self._log.error("\t- '%s'", t)
        raise ValueError("Discovered invalid content type among input "
                         "data: %s" % sorted(invalid_types_found))

    if not osp.isfile(self.codebook_filepath):
        self._log.info("Did not find existing ColorDescriptor codebook for "
                       "descriptor '%s'.", self.descriptor_type())

        # generate descriptors
        with SimpleTimer("Generating descriptor matrices...",
                         self._log.info):
            descriptors_checkpoint = osp.join(self._work_dir,
                                              "model_descriptors.npy")

            if osp.isfile(descriptors_checkpoint):
                self._log.debug("Found existing computed descriptors work "
                                "file for model generation.")
                descriptors = numpy.load(descriptors_checkpoint)
            else:
                self._log.debug("Computing model descriptors")
                _, descriptors = \
                    self._generate_descriptor_matrices(
                        data_set,
                        limit=self._model_gen_descriptor_limit
                    )
                # Write to a temp file first, then rename into place.
                fd, tmp = tempfile.mkstemp(dir=self._work_dir,
                                           suffix='.npy')
                # mkstemp hands back an open descriptor; numpy writes by
                # path, so close it to avoid leaking it.
                os.close(fd)
                self._log.debug("Saving model-gen info/descriptor matrix")
                numpy.save(tmp, descriptors)
                os.rename(tmp, descriptors_checkpoint)

        # Compute centroids (codebook) with kmeans
        with SimpleTimer("Computing sklearn.cluster.MiniBatchKMeans...",
                         self._log.info):
            # ``getEffectiveLevel`` must be called; comparing the bound
            # method itself to an int never yields the intended result.
            kmeans_verbose = \
                self._log.getEffectiveLevel() <= logging.DEBUG
            kmeans = sklearn.cluster.MiniBatchKMeans(
                n_clusters=self._kmeans_k,
                init_size=self._kmeans_k*3,
                random_state=self._rand_seed,
                verbose=kmeans_verbose,
                compute_labels=False,
            )
            kmeans.fit(descriptors)
            codebook = kmeans.cluster_centers_
        with SimpleTimer("Saving generated codebook...", self._log.debug):
            numpy.save(self.codebook_filepath, codebook)
    else:
        self._log.info("Found existing codebook file.")
        codebook = numpy.load(self.codebook_filepath)

    # create FLANN index
    # - autotune will force select linear search if there are < 1000 words
    #   in the codebook vocabulary.
    pyflann.set_distance_type(self._flann_distance_metric)
    flann = pyflann.FLANN()
    if self._log.getEffectiveLevel() <= logging.DEBUG:
        log_level = 'info'
    else:
        log_level = 'warning'
    with SimpleTimer("Building FLANN index...", self._log.info):
        p = {
            "target_precision": self._flann_target_precision,
            "sample_fraction": self._flann_sample_fraction,
            "log_level": log_level,
        }
        if self._flann_autotune:
            p['algorithm'] = "autotuned"
        if self._rand_seed is not None:
            p['random_seed'] = self._rand_seed
        flann_params = flann.build_index(codebook, **p)
    with SimpleTimer("Saving FLANN index to file...", self._log.debug):
        # Save FLANN index data binary
        flann.save_index(self.flann_index_filepath)
        # Save out log of parameters
        with open(self.flann_params_filepath, 'w') as ofile:
            json.dump(flann_params, ofile, indent=4, sort_keys=True)

    # save generation results to class for immediate feature computation use
    self._codebook = codebook
def compute_feature(self, data, no_checkpoint=False):
    """
    Given some kind of data, process and return a feature vector as a Numpy
    array.

    If a checkpoint file for ``data`` already exists, its content is loaded
    and returned without any computation.

    :raises RuntimeError: Feature extraction failure of some kind,
        including no model being currently loaded.

    :param data: Some kind of input data for the feature descriptor. This is
        descriptor dependent.
    :type data:
        SMQTK.utils.DataFile.DataFile or SMQTK.utils.VideoFile.VideoFile

    :param no_checkpoint: Normally, we produce a checkpoint file, which
        contains the numpy feature vector for a given video so that it may
        be loaded instead of re-computed if the same video is visited again.
        If this is True, we do not save such a file to our work directory.

    :return: Feature vector. This is a histogram of N bins where N is the
        number of centroids in the codebook. Bin values is percent
        composition, not absolute counts. The computed vector is always of
        float type, including the all-zero case.
    :rtype: numpy.ndarray

    """
    checkpoint_filepath = self._get_checkpoint_feature_file(data)
    if osp.isfile(checkpoint_filepath):
        return numpy.load(checkpoint_filepath)

    if not self.has_model:
        raise RuntimeError("No model currently loaded! Check the existence "
                           "or, or generate, model files!\n"
                           "Codebook path: %s\n"
                           "FLANN Index path: %s"
                           % (self.codebook_filepath,
                              self.flann_index_filepath))

    self.log.debug("Computing descriptors...")
    _, descriptors = self._generate_descriptor_matrices(data)

    # Quantization
    # - loaded the model at class initialization if we had one
    self.log.debug("Quantizing descriptors")
    pyflann.set_distance_type(self.FLANN_DISTANCE_FUNCTION)
    flann = pyflann.FLANN()
    flann.load_index(self.flann_index_filepath, self._codebook)
    idxs, _ = flann.nn_index(descriptors)

    # Create histogram
    # - Using explicit bin slots to prevent numpy from automatically
    #   creating tightly constrained bins. This would otherwise cause
    #   histograms between two inputs to be non-comparable (unaligned bins).
    # - See numpy note about ``bins`` to understand why the +1 is necessary
    h, _ = numpy.histogram(
        idxs,  # indices are all integers
        bins=numpy.arange(self._codebook.shape[0] + 1))

    # Normalize histogram into relative frequencies
    # - Not using /= on purpose. h is originally an integer type coming out
    #   of histogram. /= would keep the integer type when we want it to be
    #   transformed into a float type by the division.
    total = h.sum()
    if total:
        h = h / float(total)
    else:
        # Keep the result float-typed, consistent with the normalized case.
        h = numpy.zeros(h.shape, dtype=float)

    if not no_checkpoint:
        self.log.debug("Saving checkpoint feature file")
        if not osp.isdir(osp.dirname(checkpoint_filepath)):
            safe_create_dir(osp.dirname(checkpoint_filepath))
        numpy.save(checkpoint_filepath, h)

    return h
def generate_model(self, data_list, parallel=None, **kwargs):
    """
    Generate this feature detector's data-model given a file ingest. This
    saves the generated model to the currently configured data directory.

    For colorDescriptor, we generate raw features over the ingest data,
    compute a codebook via kmeans, and then create an index with FLANN via
    the "autotune" algorithm to intelligently pick the fastest indexing
    method.

    Returns immediately, after logging a warning, when a model already
    exists. An existing codebook file or descriptor checkpoint file is
    reused instead of being recomputed.

    :param data_list: List of input data elements to generate model with.
    :type data_list: list of SMQTK.utils.DataFile.DataFile
        or tuple of SMQTK.utils.DataFile.DataFile

    :param parallel: Optionally specification of how many processors to use
        when pooling sub-tasks. If None, we attempt to use all available
        cores. This value is not referenced within this method body.
    :type parallel: int

    Additional optional key-word arguments
    ======================================

    :param kmeans_k: Centroids to generate. Default of 1024
    :type kmeans_k: int

    :param kmeans_iter: Number of times to run the kmeans algorithms, using
        the centroids from the best run. Default of 5.
    :type kmeans_iter: int

    :param kmeans_threshold: Distortion difference termination threshold.
        KMeans algorithm terminates during a run if the centroid distortion
        since the last iteration is less than this threshold. Default of
        1e-5.
    :type kmeans_threshold: float

    :param flann_target_precision: Target precision percent to tune index
        for. Default is 0.99 (99% accuracy).
    :type flann_target_precision: float

    :param flann_sample_fraction: Fraction of input data to use for index
        auto tuning. Default is 1.0 (100%).
    :type flann_sample_fraction: float

    """
    if self.has_model:
        self.log.warning("ColorDescriptor model for descriptor type '%s' "
                         "already generated!", self.descriptor_type())
        return

    pyflann.set_distance_type(self.FLANN_DISTANCE_FUNCTION)
    flann = pyflann.FLANN()

    if not osp.isfile(self.codebook_filepath):
        self.log.info("Did not find existing ColorDescriptor codebook for "
                      "descriptor '%s'.", self.descriptor_type())

        # generate descriptors
        with SimpleTimer("Generating descriptor matrices...",
                         self.log.debug):
            descriptors_checkpoint = osp.join(self.work_directory,
                                              "model_descriptors.npy")

            if osp.isfile(descriptors_checkpoint):
                self.log.debug("Found existing computed descriptors work "
                               "file for model generation.")
                descriptors = numpy.load(descriptors_checkpoint)
            else:
                self.log.debug("Computing model descriptors")
                _, descriptors = \
                    self._generate_descriptor_matrices(
                        *data_list,
                        limit=self.CODEBOOK_DESCRIPTOR_LIMIT
                    )
                # Write to a temp file first, then rename into place.
                fd, tmp = tempfile.mkstemp(dir=self.work_directory,
                                           suffix='.npy')
                # mkstemp hands back an open descriptor; numpy writes by
                # path, so close it to avoid leaking it.
                os.close(fd)
                self.log.debug("Saving model-gen info/descriptor matrix")
                numpy.save(tmp, descriptors)
                os.rename(tmp, descriptors_checkpoint)

        # Compute centroids (codebook) with kmeans
        # - NOT performing whitening, as this transforms the feature space
        #   in such a way that newly computed features cannot be applied to
        #   the generated codebook as the same exact whitening
        #   transformation would need to be applied in order for the
        #   comparison to the codebook centroids to be valid.
        # - Alternate kmeans implementations: OpenCV, sklearn, pyflann
        with SimpleTimer("Computing scipy.cluster.vq.kmeans...",
                         self.log.debug):
            codebook, distortion = scipy.cluster.vq.kmeans(
                descriptors,
                kwargs.get('kmeans_k', 1024),
                kwargs.get('kmeans_iter', 5),
                kwargs.get('kmeans_threshold', 1e-5)
            )
            self.log.debug("KMeans result distortion: %f", distortion)
        # with SimpleTimer("Computing pyflann.FLANN.hierarchical_kmeans...",
        #                  self.log.debug):
        #     # results in 1009 clusters (should, anyway, given the
        #     # function's comment)
        #     codebook2 = flann.hierarchical_kmeans(descriptors, 64, 16, 5)
        with SimpleTimer("Saving generated codebook...", self.log.debug):
            numpy.save(self.codebook_filepath, codebook)
    else:
        self.log.info("Found existing codebook file.")
        codebook = numpy.load(self.codebook_filepath)

    # create FLANN index
    # - autotune will force select linear search if there are < 1000 words
    #   in the codebook vocabulary.
    if self.log.getEffectiveLevel() <= logging.DEBUG:
        log_level = 'info'
    else:
        log_level = 'warning'
    with SimpleTimer("Building FLANN index...", self.log.debug):
        params = flann.build_index(codebook, **{
            "target_precision": kwargs.get("flann_target_precision", 0.99),
            "sample_fraction": kwargs.get("flann_sample_fraction", 1.0),
            "log_level": log_level,
            "algorithm": "autotuned"
        })
        # TODO: Save params dict as JSON?
    with SimpleTimer("Saving FLANN index to file...", self.log.debug):
        flann.save_index(self.flann_index_filepath)

    # save generation results to class for immediate feature computation use
    self._codebook = codebook
def build_index(self, descriptors):
    """
    Build the index over the descriptors data elements.

    Subsequent calls to this method should rebuild the index, not add to it.

    Implementation Notes:
        - We keep a cache file serialization around for our index in case
          sub-processing occurs so as to be able to recover from the
          underlying C data not being there. This could cause issues if a
          main or child process rebuild's the index, as we clear the old
          cache away.
        - Descriptors, the FLANN index and the build/state parameters are
          each cached only when their element is set and writable.

    :raises ValueError: No data available in the given iterable.

    :param descriptors: Iterable of descriptors elements to build index over.
    :type descriptors: collections.Iterable[smqtk.representation.DescriptorElement]

    """
    # Not caring about restoring the index because we're just making a new
    # one
    self._log.info("Building new FLANN index")

    self._log.debug("Storing descriptors")
    self._descr_cache = list(descriptors)
    if not self._descr_cache:
        raise ValueError("No data provided in given iterable.")
    # Cache descriptors if we have an element
    if self._descr_cache_elem and self._descr_cache_elem.writable():
        self._log.debug("Caching descriptors: %s", self._descr_cache_elem)
        self._descr_cache_elem.set_bytes(
            cPickle.dumps(self._descr_cache, -1))

    params = {
        "target_precision": self._build_target_precision,
        "sample_fraction": self._build_sample_frac,
        "log_level": ("info"
                      if self._log.getEffectiveLevel() <= logging.DEBUG
                      else "warning")
    }
    if self._build_autotune:
        params['algorithm'] = "autotuned"
    if self._rand_seed is not None:
        params['random_seed'] = self._rand_seed
    pyflann.set_distance_type(self._distance_method)

    self._log.debug(
        "Accumulating descriptor vectors into matrix for FLANN")
    pts_array = elements_to_matrix(self._descr_cache, report_interval=1.0)
    self._log.debug('Building FLANN index')
    self._flann = pyflann.FLANN()
    self._flann_build_params = self._flann.build_index(pts_array, **params)
    del pts_array

    if self._index_elem and self._index_elem.writable():
        self._log.debug("Caching index: %s", self._index_elem)
        # FLANN wants to write to a file, so make a temp file, then read it
        # in, putting bytes into element.
        fd, fp = tempfile.mkstemp()
        # FLANN writes by path, so the descriptor from mkstemp is not
        # needed; close it right away.
        os.close(fd)
        try:
            self._flann.save_index(fp)
            # A buffered read returns the whole file, where a single
            # os.read call may legitimately return fewer bytes.
            with open(fp, 'rb') as f:
                self._index_elem.set_bytes(f.read())
        finally:
            os.remove(fp)
    if self._index_param_elem and self._index_param_elem.writable():
        self._log.debug("Caching index params: %s", self._index_param_elem)
        state = {
            'b_autotune': self._build_autotune,
            'b_target_precision': self._build_target_precision,
            'b_sample_frac': self._build_sample_frac,
            'distance_method': self._distance_method,
            'flann_build_params': self._flann_build_params,
        }
        self._index_param_elem.set_bytes(cPickle.dumps(state, -1))

    # Record which process built the index.
    self._pid = multiprocessing.current_process().pid
def build_index(self, descriptors):
    """
    Build the index over the descriptors data elements.

    Subsequent calls to this method should rebuild the index, not add to it.

    Implementation Notes:
        - We keep a cache file serialization around for our index in case
          sub-processing occurs so as to be able to recover from the
          underlying C data not being there. This could cause issues if a
          main or child process rebuild's the index, as we clear the old
          cache away.
        - Descriptors, the FLANN index and the build/state parameters are
          each written to disk only when their file path is configured.

    :raises ValueError: No data available in the given iterable.

    :param descriptors: Iterable of descriptors elements to build index over.
    :type descriptors: collections.Iterable[smqtk.representation.DescriptorElement]

    """
    # Not caring about restoring the index because we're just making a new
    # one
    self._log.info("Building new FLANN index")

    self._log.debug("Storing descriptors")
    self._descr_cache = list(descriptors)
    if not self._descr_cache:
        raise ValueError("No data provided in given iterable.")
    # Cache descriptors if we have a path
    if self._descr_cache_filepath:
        self._log.debug("Caching descriptors: %s",
                        self._descr_cache_filepath)
        safe_create_dir(osp.dirname(self._descr_cache_filepath))
        with open(self._descr_cache_filepath, 'wb') as f:
            cPickle.dump(self._descr_cache, f)

    params = {
        "target_precision": self._build_target_precision,
        "sample_fraction": self._build_sample_frac,
        "log_level": ("info"
                      if self._log.getEffectiveLevel() <= logging.DEBUG
                      else "warning")
    }
    if self._build_autotune:
        params['algorithm'] = "autotuned"
    if self._rand_seed is not None:
        params['random_seed'] = self._rand_seed
    pyflann.set_distance_type(self._distance_method)

    self._log.debug("Accumulating descriptor vectors into matrix for FLANN")
    pts_array = [d.vector() for d in self._descr_cache]
    pts_array = numpy.array(pts_array, dtype=pts_array[0].dtype)
    self._flann = pyflann.FLANN()
    self._flann_build_params = self._flann.build_index(pts_array, **params)
    del pts_array

    self._log.debug("Caching index and state: %s, %s",
                    self._index_filepath, self._index_param_filepath)
    if self._index_filepath:
        self._log.debug("Caching index: %s", self._index_filepath)
        safe_create_dir(osp.dirname(self._index_filepath))
        self._flann.save_index(self._index_filepath)
    if self._index_param_filepath:
        self._log.debug("Caching index params: %s",
                        self._index_param_filepath)
        state = {
            'b_autotune': self._build_autotune,
            'b_target_precision': self._build_target_precision,
            'b_sample_frac': self._build_sample_frac,
            'distance_method': self._distance_method,
            'flann_build_params': self._flann_build_params,
        }
        safe_create_dir(osp.dirname(self._index_param_filepath))
        # Pickle data is bytes: write in binary mode, as is done for the
        # descriptor cache above.
        with open(self._index_param_filepath, 'wb') as f:
            cPickle.dump(state, f)

    # Record which process built the index.
    self._pid = multiprocessing.current_process().pid
def _build_index(self, descriptors: Iterable[DescriptorElement]) -> None:
    """
    Internal method to be implemented by sub-classes to build the index
    with the given descriptor data elements.

    Subsequent calls to this method should rebuild the current index. This
    method shall not add to the existing index nor raise an exception so
    as to protect the current index.

    Implementation Notes:
        - We keep a cache file serialization around for our index in case
          sub-processing occurs so as to be able to recover from the
          underlying C data not being there. This could cause issues if a
          main or child process rebuilds the index, as we clear the old
          cache away.

    :param descriptors: Iterable of descriptor elements to build index
        over.
    :type descriptors:
        collections.abc.Iterable[smqtk.representation.DescriptorElement]

    """
    with self._model_lock:
        # Not caring about restoring the index because we're just making a
        # new one.
        LOG.info("Building new FLANN index")

        LOG.debug("Caching descriptor elements")
        self._descr_cache = list(descriptors)
        # Cache descriptors if we have an element
        if self._descr_cache_elem and self._descr_cache_elem.writable():
            LOG.debug("Caching descriptors: %s", self._descr_cache_elem)
            self._descr_cache_elem.set_bytes(
                pickle.dumps(self._descr_cache, -1))

        params = {
            "target_precision": self._build_target_precision,
            "sample_fraction": self._build_sample_frac,
            # Only let FLANN be chatty when we are debugging ourselves.
            "log_level": ("info"
                          if LOG.getEffectiveLevel() <= logging.DEBUG
                          else "warning"),
        }
        if self._build_autotune:
            params['algorithm'] = "autotuned"
        if self._rand_seed is not None:
            params['random_seed'] = self._rand_seed
        pyflann.set_distance_type(self._distance_method)

        LOG.debug("Accumulating descriptor vectors into matrix for FLANN")
        pts_array = numpy.asarray(
            list(parallel_map(lambda d_: d_.vector(), self._descr_cache)))
        LOG.debug('Building FLANN index')
        self._flann = pyflann.FLANN()
        self._flann_build_params = self._flann.build_index(
            pts_array, **params)
        del pts_array

        if self._index_elem and self._index_elem.writable():
            LOG.debug("Caching index: %s", self._index_elem)
            # FLANN wants to write to a file, so make a temp file, then
            # read it in, putting bytes into element.
            fd, fp = tempfile.mkstemp()
            try:
                # Wrap the descriptor *before* asking FLANN to write so
                # that it is closed even when ``save_index`` raises.
                # Nothing is read until FLANN has finished writing.
                with os.fdopen(fd, 'rb') as f:
                    self._flann.save_index(fp)
                    self._index_elem.set_bytes(f.read())
            finally:
                os.remove(fp)
        if self._index_param_elem and self._index_param_elem.writable():
            LOG.debug("Caching index params: %s", self._index_param_elem)
            state = {
                'b_autotune': self._build_autotune,
                'b_target_precision': self._build_target_precision,
                'b_sample_frac': self._build_sample_frac,
                'distance_method': self._distance_method,
                'flann_build_params': self._flann_build_params,
            }
            self._index_param_elem.set_bytes(pickle.dumps(state, -1))

        # Remember which process owns the in-memory FLANN structure.
        self._pid = multiprocessing.current_process().pid
def build_index(self, descriptors):
    """
    Build the index over the descriptors data elements.

    Subsequent calls to this method should rebuild the index, not add to
    it.

    Implementation Notes:
        - We keep a cache file serialization around for our index in case
          sub-processing occurs so as to be able to recover from the
          underlying C data not being there. This could cause issues if a
          main or child process rebuilds the index, as we clear the old
          cache away.

    :raises ValueError: No data available in the given iterable.

    :param descriptors: Iterable of descriptors elements to build index
        over.
    :type descriptors: collections.Iterable[smqtk.data_rep.DescriptorElement]

    """
    # If there is already an index, clear the cache file if we are in the
    # same process that created our current index.
    if self._flann_index_cache and os.path.isfile(self._flann_index_cache) \
            and self._pid == multiprocessing.current_process().pid:
        self._log.debug('removing old index cache file')
        os.remove(self._flann_index_cache)

    self._log.debug("Building new index")
    self._log.debug("Computing descriptors for data")

    # Keep the descriptors ordered by UUID; ``sorted`` already returns a
    # new list.
    self._descr_cache = sorted(descriptors, key=lambda e: e.uuid())
    if not self._descr_cache:
        raise ValueError("No data provided in given iterable.")

    # numpy array version for FLANN
    pts_array = [d.vector() for d in self._descr_cache]
    pts_array = numpy.array(pts_array, dtype=pts_array[0].dtype)

    # Reset PID/FLANN/saved cache
    self._pid = multiprocessing.current_process().pid
    safe_create_dir(self._temp_dir)
    # Only the path is needed (FLANN writes to it by name), so the
    # descriptor returned by mkstemp is closed right away.
    fd, self._flann_index_cache = tempfile.mkstemp(".flann",
                                                   dir=self._temp_dir)
    os.close(fd)

    self._log.debug("Building FLANN index")
    params = {
        "algorithm": self._build_autotune,
        "target_precision": self._build_target_precision,
        "sample_fraction": self._build_sample_frac,
        # pyflann's log-level names are none/fatal/error/warning/info;
        # "warn" is not one of them.
        "log_level": ("info"
                      if self._log.getEffectiveLevel() <= logging.DEBUG
                      else "warning"),
    }
    if self._rand_seed is not None:
        params['random_seed'] = self._rand_seed
    pyflann.set_distance_type(self._distance_method)
    self._flann = pyflann.FLANN()
    self._flann_build_params = self._flann.build_index(pts_array, **params)

    # Saving out index cache
    self._log.debug("Saving index to cache file: %s",
                    self._flann_index_cache)
    self._flann.save_index(self._flann_index_cache)
def tsdf2chamfer_fortex2shape(target_v, tsdf_gt, param_reg, param_gt,
                              savepath1, savepath2):
    """
    Extract a surface from ``tsdf_gt`` on the warped coarse tetrahedral
    model and measure the mean nearest-neighbour distance from
    ``target_v`` to it.

    The coarse tetrahedral template and its skinning data are loaded from
    ``./models``, warped with ``param_gt["pose"]`` / ``param_gt["betas"]``,
    and run through marching tetrahedra; the extracted mesh is written to
    ``savepath2``. The distance is then computed twice: once with FLANN
    and once with Open3D.

    ``param_reg`` and ``savepath1`` are accepted but not used here.

    :param target_v: Query points (presumably an (M, 3) array -- TODO
        confirm against callers).
    :param tsdf_gt: TSDF values handed to the marching-tetrahedra
        extractor.
    :param param_reg: Unused.
    :param param_gt: Mapping providing the ``"pose"`` and ``"betas"``
        entries.
    :param savepath1: Unused.
    :param savepath2: Output path given to ``SaveToPly_CPU``.

    :return: ``(chamfer_o3d, chamfer_pyflann)``.
        NOTE(review): FLANN's euclidean metric presumably reports squared
        distances while Open3D reports plain ones, so the two values may
        not be on the same scale -- confirm.

    """
    # Coarse tetrahedral template.
    template = pymesh.load_mesh("./models/TshapeCoarseTetraD.ply")
    tetra_count = template.num_voxels
    tetra_idx = np.array(template.voxels)
    rest_vertices = np.array(template.vertices, dtype=np.float32)
    # One row of 12 floats per tetrahedron.
    tetra_coords = rest_vertices[tetra_idx]
    tetra_coords = tetra_coords.reshape(tetra_idx.shape[0], 1, 12)
    tetra_coords = tetra_coords[:, 0].astype(np.float32)

    # Weights of the coarse tetrahedra volume interpolated by SMPL body
    skin_weights = np.load('./models/coarseweights.npy')
    joints = np.load('./models/Tshapecoarsejoints.npy')
    # J_shapedir (can deform joints depend on 10 betas)
    joint_shapedir = np.load("./models/J_shapedir.npy")

    # Deform coarse model: row 0 holds parent ids, row 1 the joint ids.
    parent_ids = [4294967295, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9,
                  12, 13, 14, 16, 17, 18, 19, 20, 21]
    kintree = np.array([parent_ids, list(range(24))])
    warped_vertices, _ = Warp.warpVolume(
        rest_vertices, joints, joint_shapedir,
        param_gt["pose"], param_gt["betas"], kintree, skin_weights)

    # Surface extraction and export.
    grid_size = np.array([1, 4, tetra_count])
    gpu_manager = GPU.GPUManager()
    extractor = My_MT.MarchingTetrahedra(
        grid_size, tsdf_gt, 0.0, tetra_coords, tetra_idx, warped_vertices,
        gpu_manager)
    surface_vertices = extractor.run_CPU(tsdf_gt, 0.0, tetra_idx,
                                         warped_vertices)
    extractor.SaveToPly_CPU(savepath2)

    # Nearest surface vertex for every target point, via FLANN.
    pyflann.set_distance_type("euclidean")
    searcher = FLANN()
    searcher.build_index(surface_vertices, algorithm='kmeans',
                         centers_init='kmeanspp', random_seed=1984)
    _, flann_dists = searcher.nn_index(target_v, num_neighbors=1)
    chamfer_pyflann = np.average(flann_dists)

    # Same measurement through Open3D.
    surface_cloud = o3d.geometry.PointCloud()
    surface_cloud.points = o3d.utility.Vector3dVector(surface_vertices)
    target_cloud = o3d.geometry.PointCloud()
    target_cloud.points = o3d.utility.Vector3dVector(target_v)
    o3d_dists = np.array(
        target_cloud.compute_point_cloud_distance(surface_cloud))
    chamfer_o3d = np.average(o3d_dists)

    return chamfer_o3d, chamfer_pyflann
def _compute_descriptor(self, data):
    """
    Given some kind of data, process and return a feature vector as a Numpy
    array.

    The raw descriptors of ``data`` are quantized against the codebook
    through the saved FLANN index, either into one flat histogram or, when
    ``self._use_sp`` is set, into a spatial-pyramid histogram. The result
    is also saved to the checkpoint file for ``data``.

    :raises RuntimeError: Feature extraction failure of some kind.

    :param data: Some kind of input data for the feature descriptor. This
        is descriptor dependent.
    :type data: smqtk.representation.DataElement

    :return: Feature vector. This is a histogram of N bins where N is the
        number of centroids in the codebook. Bin values is percent
        composition, not absolute counts.
    :rtype: numpy.ndarray

    """
    super(ColorDescriptor_Base, self)._compute_descriptor(data)

    checkpoint_filepath = self._get_checkpoint_feature_file(data)

    if not self.has_model:
        raise RuntimeError("No model currently loaded! Check the existence "
                           "or, or generate, model files!\n"
                           "Codebook path: %s\n"
                           "FLANN Index path: %s"
                           % (self.codebook_filepath,
                              self.flann_index_filepath))

    self._log.debug("Computing descriptors for data UID[%s]...",
                    data.uuid())
    info, descriptors = self._generate_descriptor_matrices({data})

    # Load FLANN components
    pyflann.set_distance_type(self._flann_distance_metric)
    flann = pyflann.FLANN()
    flann.load_index(self.flann_index_filepath, self._codebook)

    # HIK yields a similarity score rather than a distance, so the wanted
    # codes sit at the END of a full neighbor ordering instead of the
    # start.
    metric_is_hik = self._flann_distance_metric == 'hik'

    if self._use_sp:
        ###
        # Spatial Pyramid Quantization
        #
        self._log.debug("Quantizing descriptors using spatial pyramid")

        # Quantization factor - number of nearest codes to be saved
        q_factor = 10

        # Prefix every descriptor with its spatial information:
        #   [ x y <descriptor> ]
        self._log.debug("Creating combined descriptor matrix")
        m = numpy.concatenate((info[:, :2], descriptors), axis=1)

        # Distances are deliberately left out of the quantized rows: only
        # code indices are meaningful to the integral-bin histograms built
        # afterwards.
        self._log.debug("Computing nearest neighbors")
        if metric_is_hik:
            # Query the full ordering of code indices, then keep the
            # right-side block. Order within the block does not matter as
            # relative position is not weighted.
            code_idxs = flann.nn_index(m[:, 2:], self._codebook.shape[0])[0]
            code_idxs = code_idxs[:, -q_factor:]
        else:
            code_idxs = flann.nn_index(m[:, 2:], q_factor)[0]

        # Descriptor (x, y) position + near code indices.
        self._log.debug("Creating quantization matrix")
        q = numpy.concatenate([m[:, :2], code_idxs], axis=1)

        # Build spatial pyramid from quantized matrix
        self._log.debug("Building spatial pyramid histograms")
        hist_sp = self._build_sp_hist(q, self._codebook.shape[0])

        # Combine each quadrants into single vector
        self._log.debug("Combining global+thirds into final histogram.")
        tiny = sys.float_info.min  # so as we don't div by 0 accidentally

        def relative_freq(hist):
            return hist / (float(hist.sum()) + tiny)

        h = numpy.concatenate(
            [relative_freq(hist_sp[i]) for i in (0, 5, 6, 7)], axis=1)
        # noinspection PyAugmentAssignment
        h /= h.sum()
    else:
        ###
        # Codebook Quantization
        #
        # - loaded the model at class initialization if we had one
        self._log.debug("Quantizing descriptors")
        try:
            if metric_is_hik:
                # Search for ALL neighbors and keep only the last index
                # of each descriptor's ordering.
                all_idxs = flann.nn_index(descriptors,
                                          self._codebook.shape[0])[0]
                code_idxs = numpy.array([row[-1] for row in all_idxs])
            else:
                code_idxs = flann.nn_index(descriptors, 1)[0]
        except AssertionError:
            self._log.error("Codebook shape : %s", self._codebook.shape)
            self._log.error("Descriptor shape: %s", descriptors.shape)
            raise

        # Create histogram
        # - Explicit bin edges keep histograms of different inputs
        #   aligned; numpy would otherwise fit the bins tightly to the
        #   data. The +1 is needed because ``bins`` lists bin EDGES.
        # - Multiple neighbors per descriptor could be fed in here for a
        #   more populated histogram, possibly weighted by distance.
        bin_edges = numpy.arange(self._codebook.shape[0]+1)
        h = numpy.histogram(code_idxs, bins=bin_edges)[0]

        # Normalize into relative frequencies. Not using /= on purpose:
        # the histogram comes out integral and must become float.
        total = h.sum()
        if total:
            # noinspection PyAugmentAssignment
            h = h / float(total)
        else:
            h = numpy.zeros(h.shape, h.dtype)

    self._log.debug("Saving checkpoint feature file")
    checkpoint_dir = osp.dirname(checkpoint_filepath)
    if not osp.isdir(checkpoint_dir):
        file_utils.safe_create_dir(checkpoint_dir)
    numpy.save(checkpoint_filepath, h)

    return h
def generate_model(self, data_set, **kwargs):
    """
    Generate this feature detector's data-model given a file ingest. This
    saves the generated model to the currently configured data directory.

    For colorDescriptor, we generate raw features over the ingest data,
    compute a codebook via kmeans, and then create an index with FLANN via
    the "autotune" or linear algorithm to intelligently pick the fastest
    indexing method.

    Nothing is done (besides a warning) when a model already exists.

    :raises ValueError: One or more elements of ``data_set`` have a content
        type this descriptor cannot process.

    :param data_set: Set of input data elements to generate the model
        with.
    :type data_set: collections.Set[smqtk.representation.DataElement]

    """
    if self.has_model:
        self._log.warning("ColorDescriptor model for descriptor type '%s' "
                          "already generated!", self.descriptor_type())
        return

    # Check that input data is valid for processing through colorDescriptor
    valid_types = self.valid_content_types()
    invalid_types_found = set()
    for di in data_set:
        if di.content_type() not in valid_types:
            invalid_types_found.add(di.content_type())
    if invalid_types_found:
        self._log.error("Found one or more invalid content types among "
                        "input:")
        for t in sorted(invalid_types_found):
            self._log.error("\t- '%s'", t)
        raise ValueError("Discovered invalid content type among input "
                         "data: %s" % sorted(invalid_types_found))

    if not osp.isfile(self.codebook_filepath):
        self._log.info("Did not find existing ColorDescriptor codebook for "
                       "descriptor '%s'.", self.descriptor_type())

        # generate descriptors
        with SimpleTimer("Generating descriptor matrices...",
                         self._log.info):
            descriptors_checkpoint = osp.join(self._work_dir,
                                              "model_descriptors.npy")

            if osp.isfile(descriptors_checkpoint):
                self._log.debug("Found existing computed descriptors work "
                                "file for model generation.")
                descriptors = numpy.load(descriptors_checkpoint)
            else:
                self._log.debug("Computing model descriptors")
                _, descriptors = \
                    self._generate_descriptor_matrices(
                        data_set,
                        limit=self._model_gen_descriptor_limit
                    )
                # Write to a temp file first, then rename, so that an
                # interrupted save never leaves a partial checkpoint under
                # the final name. Only the path is needed, so the
                # descriptor mkstemp opened is closed immediately.
                fd, tmp = tempfile.mkstemp(dir=self._work_dir,
                                           suffix='.npy')
                os.close(fd)
                self._log.debug("Saving model-gen info/descriptor matrix")
                numpy.save(tmp, descriptors)
                os.rename(tmp, descriptors_checkpoint)

        # Compute centroids (codebook) with kmeans
        with SimpleTimer("Computing sklearn.cluster.MiniBatchKMeans...",
                         self._log.info):
            # ``getEffectiveLevel`` must be CALLED; comparing the bound
            # method itself to an int is meaningless (and a TypeError on
            # Python 3).
            kmeans_verbose = \
                self._log.getEffectiveLevel() <= logging.DEBUG
            kmeans = sklearn.cluster.MiniBatchKMeans(
                n_clusters=self._kmeans_k,
                init_size=self._kmeans_k*3,
                random_state=self._rand_seed,
                verbose=kmeans_verbose,
                compute_labels=False,
            )
            kmeans.fit(descriptors)
            codebook = kmeans.cluster_centers_
        with SimpleTimer("Saving generated codebook...", self._log.debug):
            numpy.save(self.codebook_filepath, codebook)
    else:
        self._log.info("Found existing codebook file.")
        codebook = numpy.load(self.codebook_filepath)

    # create FLANN index
    # - autotune will force select linear search if there are < 1000 words
    #   in the codebook vocabulary.
    pyflann.set_distance_type(self._flann_distance_metric)
    flann = pyflann.FLANN()
    if self._log.getEffectiveLevel() <= logging.DEBUG:
        log_level = 'info'
    else:
        log_level = 'warning'
    with SimpleTimer("Building FLANN index...", self._log.info):
        p = {
            "target_precision": self._flann_target_precision,
            "sample_fraction": self._flann_sample_fraction,
            "log_level": log_level,
        }
        if self._flann_autotune:
            p['algorithm'] = "autotuned"
        if self._rand_seed is not None:
            p['random_seed'] = self._rand_seed
        flann_params = flann.build_index(codebook, **p)
    with SimpleTimer("Saving FLANN index to file...", self._log.debug):
        # Save FLANN index data binary
        flann.save_index(self.flann_index_filepath)
        # Save out log of parameters
        with open(self.flann_params_filepath, 'w') as ofile:
            json.dump(flann_params, ofile, indent=4, sort_keys=True)

    # save generation results to class for immediate feature computation
    # use
    self._codebook = codebook
def _compute_descriptor(self, data):
    """
    Given some kind of data, process and return a feature vector as a Numpy
    array.

    The raw descriptors of ``data`` are quantized against the codebook
    through the saved FLANN index, either into one flat histogram or, when
    ``self._use_sp`` is set, into a spatial-pyramid histogram. The result
    is also saved to the checkpoint file for ``data``.

    :raises RuntimeError: Feature extraction failure of some kind.

    :param data: Some kind of input data for the feature descriptor. This
        is descriptor dependent.
    :type data: smqtk.data_rep.DataElement

    :return: Feature vector. This is a histogram of N bins where N is the
        number of centroids in the codebook. Bin values is percent
        composition, not absolute counts.
    :rtype: numpy.ndarray

    """
    super(ColorDescriptor_Base, self)._compute_descriptor(data)

    checkpoint_filepath = self._get_checkpoint_feature_file(data)

    if not self.has_model:
        raise RuntimeError("No model currently loaded! Check the existence "
                           "or, or generate, model files!\n"
                           "Codebook path: %s\n"
                           "FLANN Index path: %s"
                           % (self.codebook_filepath,
                              self.flann_index_filepath))

    self.log.debug("Computing descriptors for data UID[%s]...", data.uuid())
    info, descriptors = self._generate_descriptor_matrices({data})

    # Both quantization modes query the same saved index, so load it once
    # up front.
    pyflann.set_distance_type(self.FLANN_DISTANCE_FUNCTION)
    flann = pyflann.FLANN()
    flann.load_index(self.flann_index_filepath, self._codebook)

    if not self._use_sp:
        ###
        # Codebook Quantization
        #
        # - loaded the model at class initialization if we had one
        self.log.debug("Quantizing descriptors")
        try:
            # Only the code indices are needed, not the distances.
            idxs = flann.nn_index(descriptors)[0]
        except AssertionError:
            self.log.error("Codebook shape  : %s", self._codebook.shape)
            self.log.error("Descriptor shape: %s", descriptors.shape)
            raise

        # Create histogram
        # - Using explicit bin slots to prevent numpy from automatically
        #   creating tightly constrained bins. This would otherwise cause
        #   histograms between two inputs to be non-comparable (unaligned
        #   bins).
        # - See numpy note about ``bins`` to understand why the +1 is
        #   necessary
        # - Learned from spatial implementation that we could feed multiple
        #   neighbors per descriptor into here, leading to a more populated
        #   histogram.
        #   - Could also possibly weight things based on dist from
        #     descriptor?
        #: :type: numpy.core.multiarray.ndarray
        h = numpy.histogram(idxs,  # indices are all integers
                            bins=numpy.arange(self._codebook.shape[0]+1))[0]

        # Normalize histogram into relative frequencies
        # - Not using /= on purpose. h is originally int32 coming out of
        #   histogram. /= would keep int32 type when we want it to be
        #   transformed into a float type by the division.
        if h.sum():
            # noinspection PyAugmentAssignment
            h = h / float(h.sum())
        else:
            h = numpy.zeros(h.shape, h.dtype)
    else:
        ###
        # Spatial Pyramid Quantization
        #
        self.log.debug("Quantizing descriptors using spatial pyramid")
        ##
        # Quantization factor - number of nearest codes to be saved
        q_factor = 10
        ##
        # Concatenating spatial information to descriptor vectors to
        # format:
        #   [ x y <descriptor> ]
        self.log.debug("Creating combined descriptor matrix")
        m = numpy.concatenate((info[:, :2], descriptors), axis=1)
        ##
        # Quantized rows are [ x y c_1 ... c_qf ]. Distances are
        # deliberately left out: feeding them into integral-bin histograms
        # would only pile counts into the [0, 1] bin.
        self.log.debug("Computing nearest neighbors")
        idxs = flann.nn_index(m[:, 2:], q_factor)[0]
        self.log.debug("Creating quantization matrix")
        q = numpy.concatenate([m[:, :2], idxs], axis=1)
        ##
        # Build spatial pyramid from quantized matrix
        self.log.debug("Building spatial pyramid histograms")
        hist_sp = self._build_sp_hist(q, self._codebook.shape[0])
        ##
        # Combine each quadrants into single vector
        self.log.debug("Combining global+thirds into final histogram.")
        f = sys.float_info.min  # so as we don't div by 0 accidentally

        def rf_norm(hist):
            # Relative-frequency normalization of a single histogram.
            return hist / (float(hist.sum()) + f)

        h = numpy.concatenate([rf_norm(hist_sp[0]),
                               rf_norm(hist_sp[5]),
                               rf_norm(hist_sp[6]),
                               rf_norm(hist_sp[7])],
                              axis=1)
        # noinspection PyAugmentAssignment
        h /= h.sum()

    self.log.debug("Saving checkpoint feature file")
    if not osp.isdir(osp.dirname(checkpoint_filepath)):
        safe_create_dir(osp.dirname(checkpoint_filepath))
    numpy.save(checkpoint_filepath, h)

    return h
def _build_index(self, descriptors):
    """
    Internal method to be implemented by sub-classes to build the index
    with the given descriptor data elements.

    Subsequent calls to this method should rebuild the current index. This
    method shall not add to the existing index nor raise an exception so
    as to protect the current index.

    Implementation Notes:
        - We keep a cache file serialization around for our index in case
          sub-processing occurs so as to be able to recover from the
          underlying C data not being there. This could cause issues if a
          main or child process rebuilds the index, as we clear the old
          cache away.

    :param descriptors: Iterable of descriptor elements to build index
        over.
    :type descriptors:
        collections.Iterable[smqtk.representation.DescriptorElement]

    """
    with self._model_lock:
        # Not caring about restoring the index because we're just making a
        # new one.
        self._log.info("Building new FLANN index")

        self._log.debug("Caching descriptor elements")
        self._descr_cache = list(descriptors)
        # Cache descriptors if we have an element
        if self._descr_cache_elem and self._descr_cache_elem.writable():
            self._log.debug("Caching descriptors: %s",
                            self._descr_cache_elem)
            self._descr_cache_elem.set_bytes(
                cPickle.dumps(self._descr_cache, -1)
            )

        params = {
            "target_precision": self._build_target_precision,
            "sample_fraction": self._build_sample_frac,
            # Only let FLANN be chatty when we are debugging ourselves.
            "log_level": ("info"
                          if self._log.getEffectiveLevel() <= logging.DEBUG
                          else "warning"),
        }
        if self._build_autotune:
            params['algorithm'] = "autotuned"
        if self._rand_seed is not None:
            params['random_seed'] = self._rand_seed
        pyflann.set_distance_type(self._distance_method)

        self._log.debug("Accumulating descriptor vectors into matrix for "
                        "FLANN")
        pts_array = elements_to_matrix(self._descr_cache,
                                       report_interval=1.0)
        self._log.debug('Building FLANN index')
        self._flann = pyflann.FLANN()
        self._flann_build_params = self._flann.build_index(pts_array,
                                                           **params)
        del pts_array

        if self._index_elem and self._index_elem.writable():
            self._log.debug("Caching index: %s", self._index_elem)
            # FLANN wants to write to a file, so make a temp file, then
            # read it in, putting bytes into element.
            fd, fp = tempfile.mkstemp()
            try:
                # Wrap the descriptor *before* asking FLANN to write so
                # that it is closed even when ``save_index`` raises.
                # fdopen() is used because in Python 2 open() does not
                # accept a file descriptor. Nothing is read until FLANN
                # has finished writing.
                with os.fdopen(fd, 'rb') as f:
                    self._flann.save_index(fp)
                    self._index_elem.set_bytes(f.read())
            finally:
                os.remove(fp)
        if self._index_param_elem and self._index_param_elem.writable():
            self._log.debug("Caching index params: %s",
                            self._index_param_elem)
            state = {
                'b_autotune': self._build_autotune,
                'b_target_precision': self._build_target_precision,
                'b_sample_frac': self._build_sample_frac,
                'distance_method': self._distance_method,
                'flann_build_params': self._flann_build_params,
            }
            self._index_param_elem.set_bytes(cPickle.dumps(state, -1))

        # Remember which process owns the in-memory FLANN structure.
        self._pid = multiprocessing.current_process().pid
def generate_model(self, data_set, **kwargs):
    """
    Generate this feature detector's data-model given a file ingest. This
    saves the generated model to the currently configured data directory.

    For colorDescriptor, we generate raw features over the ingest data,
    compute a codebook via kmeans, and then create an index with FLANN via
    the "autotune" algorithm to intelligently pick the fastest indexing
    method.

    Nothing is done (besides a warning) when a model already exists.

    :param data_set: Set of input data elements to generate the model
        with.
    :type data_set: collections.Set[smqtk.data_rep.DataElement]

    """
    super(ColorDescriptor_Base, self).generate_model(data_set, **kwargs)

    if self.has_model:
        self.log.warning("ColorDescriptor model for descriptor type '%s' "
                         "already generated!", self.descriptor_type())
        return

    pyflann.set_distance_type(self.FLANN_DISTANCE_FUNCTION)
    flann = pyflann.FLANN()

    if not osp.isfile(self.codebook_filepath):
        self.log.info("Did not find existing ColorDescriptor codebook for "
                      "descriptor '%s'.", self.descriptor_type())

        # generate descriptors
        with SimpleTimer("Generating descriptor matrices...",
                         self.log.info):
            descriptors_checkpoint = osp.join(self._work_dir,
                                              "model_descriptors.npy")

            if osp.isfile(descriptors_checkpoint):
                self.log.debug("Found existing computed descriptors work "
                               "file for model generation.")
                descriptors = numpy.load(descriptors_checkpoint)
            else:
                self.log.debug("Computing model descriptors")
                _, descriptors = \
                    self._generate_descriptor_matrices(
                        data_set,
                        limit=self.CODEBOOK_DESCRIPTOR_LIMIT
                    )
                # Write to a temp file first, then rename, so that an
                # interrupted save never leaves a partial checkpoint under
                # the final name. Only the path is needed, so the
                # descriptor mkstemp opened is closed immediately.
                fd, tmp = tempfile.mkstemp(dir=self._work_dir,
                                           suffix='.npy')
                os.close(fd)
                self.log.debug("Saving model-gen info/descriptor matrix")
                numpy.save(tmp, descriptors)
                os.rename(tmp, descriptors_checkpoint)

        # Compute centroids (codebook) with kmeans
        with SimpleTimer("Computing sklearn.cluster.MiniBatchKMeans...",
                         self.log.info):
            # ``getEffectiveLevel`` must be CALLED; comparing the bound
            # method itself to an int is meaningless (and a TypeError on
            # Python 3).
            kmeans_verbose = self.log.getEffectiveLevel() <= logging.DEBUG
            kmeans = sklearn.cluster.MiniBatchKMeans(
                n_clusters=self._kmeans_k,
                init_size=self._kmeans_k*3,
                random_state=self._rand_seed,
                verbose=kmeans_verbose,
                compute_labels=False,
            )
            kmeans.fit(descriptors)
            codebook = kmeans.cluster_centers_
        with SimpleTimer("Saving generated codebook...", self.log.debug):
            numpy.save(self.codebook_filepath, codebook)
    else:
        self.log.info("Found existing codebook file.")
        codebook = numpy.load(self.codebook_filepath)

    # create FLANN index
    # - autotune will force select linear search if there are < 1000 words
    #   in the codebook vocabulary.
    if self.log.getEffectiveLevel() <= logging.DEBUG:
        log_level = 'info'
    else:
        log_level = 'warning'
    with SimpleTimer("Building FLANN index...", self.log.info):
        p = {
            "target_precision": self._flann_target_precision,
            "sample_fraction": self._flann_sample_fraction,
            "log_level": log_level,
            "algorithm": "autotuned"
        }
        if self._rand_seed is not None:
            p['random_seed'] = self._rand_seed
        flann_params = flann.build_index(codebook, **p)
    with SimpleTimer("Saving FLANN index to file...", self.log.debug):
        # Save FLANN index data binary
        flann.save_index(self.flann_index_filepath)
        # Save out log of parameters
        with open(self.flann_params_filepath, 'w') as ofile:
            json.dump(flann_params, ofile, indent=4, sort_keys=True)

    # save generation results to class for immediate feature computation
    # use
    self._codebook = codebook
# NOTE(review): fragment of a larger benchmark script. ``index_gensim``,
# ``TOP_N``, ``indir``, ``program``, ``queries``, ``clipped``, ``sim_prefix``,
# ``ACC`` and ``ACC_SETTINGS`` are all defined outside this view, and the
# original nesting was lost -- confirm the indentation below against the
# full file.

# Limit the gensim index to its TOP_N best matches per query.
index_gensim.num_best = TOP_N
logger.info("finished gensim index %s" % index_gensim)

logger.info("loading mapping between article titles and ids")
id2title = gensim.utils.unpickle(os.path.join(indir, 'id2title'))
# Reverse mapping: lower-cased title -> position in ``id2title``.
title2id = dict((title.lower(), pos) for pos, title in enumerate(id2title))
# print_similar('Anarchism', index_gensim, id2title, title2id)

if 'gensim' in program:
    # log_precision(gensim_predictions, index_gensim, queries, index_gensim)
    gensim_at_once(index_gensim, queries)
    gensim_1by1(index_gensim, queries)

if 'flann' in program:
    # Imported here so pyflann is only needed when the FLANN benchmark is
    # actually requested.
    import pyflann
    pyflann.set_distance_type('euclidean')
    index_flann = pyflann.FLANN()
    # One saved index file per accuracy setting.
    flann_fname = sim_prefix + "_flann_%s" % ACC
    if os.path.exists(flann_fname):
        # Reuse the saved index; FLANN still needs the original points.
        logger.info("loading flann index")
        index_flann.load_index(flann_fname, clipped)
    else:
        logger.info("building FLANN index")
        # flann expects index vectors as a 2d numpy array, features = columns
        params = index_flann.build_index(clipped, **ACC_SETTINGS['flann'][ACC])
        logger.info("built flann index with %s" % params)
        index_flann.save_index(flann_fname)
    logger.info("finished FLANN index")

    # The gensim index is passed along as well (presumably as the reference
    # for measuring precision -- confirm in ``log_precision``).
    log_precision(flann_predictions, index_flann, queries, index_gensim)
    flann_1by1(index_flann, queries)
import sys
import numpy
#import pylab
import marsyas
import marsyas_util
import pdb
import pyflann
# NOTE: the two star imports pull every public pyflann and numpy name into
# this namespace; ``FLANN`` below comes from the pyflann one.
from pyflann import *
from numpy import *
#from numpy.random import *
import cPickle
# Imported AFTER ``from numpy import *`` so that ``random`` names the
# standard-library module rather than ``numpy.random``.
import random

#PLOT = True
PLOT = False

# Module-level FLANN setup: 'kl' distance type, one shared index object.
pyflann.set_distance_type('kl')
flann = FLANN()

topn = 20
buffsize = 512

# Feature extractors, written as "Type/name" strings (presumably marsyas
# system specifications -- confirm against marsyas_util usage).
#texture = ["Rms/rms", "AubioYin/pitcher","ZeroCrossings/zcrs" ,"Series/lspbranch" ,"Series/lpccbranch" ,"MFCC/mfcc" ,"SCF/scf" ,"Rolloff/rf" ,"Flux/flux" ,"Centroid/cntrd" ,"Series/chromaPrSeries"]
texture = ["Rms/rms", "AubioYin/pitcher","ZeroCrossings/zcrs" ,"Rolloff/rf" ,"Flux/flux" ,"Centroid/cntrd","AbsMax/abs","Energy/energy"]
#"AimGammatone/aimgamma"]
# Nested spec: a "Fanout/detectors" entry followed by the texture list.
detectors = ["Fanout/detectors", texture]
grainuri = "RealvecGrainSource/real_src"


class Slice:
    # Class-level list: shared by every instance unless an instance rebinds
    # its own ``stats``.
    stats = []
def __init__(self, Xin, NNtype='knn', use_flann=False, center=True,
             rescale=True, k=10, sigma=None, epsilon=0.01,
             plotting={}, symmetrize_type='average', dist_type='euclidean',
             order=0, **kwargs):
    """
    Build a nearest-neighbour graph over the rows of ``Xin``.

    Every point is connected to its neighbours with weight
    ``exp(-dist**2 / sigma)``; the weight matrix is then symmetrized and
    handed, together with the (optionally centered / rescaled) coordinates,
    to the parent constructor.

    :param Xin: Input points, one row per node; must be 2-D (N x d).
    :param NNtype: ``'knn'`` for k-nearest neighbours or ``'radius'`` for
        all neighbours within ``epsilon``.
    :param use_flann: Use FLANN instead of a scipy KD-tree for the search
        (only consulted for ``'knn'``).
    :param center: Subtract the per-column mean from the points.
    :param rescale: Rescale the points by
        ``N**(1/min(d, 3)) / 10 / bounding_radius``.
    :param k: Number of neighbours for ``'knn'``; must be smaller than N.
    :param sigma: Kernel width. Defaults to the mean distance to the
        found neighbours (self excluded).
    :param epsilon: Search radius for ``'radius'``.
    :param plotting: Passed through to the parent constructor; not
        modified here.
    :param symmetrize_type: Method name passed to ``utils.symmetrize``.
    :param dist_type: One of ``'euclidean'``, ``'manhattan'``,
        ``'max_dist'``, ``'minkowski'``.
    :param order: Minkowski order, used when ``dist_type='minkowski'``.
    :param kwargs: Forwarded to the parent constructor.

    :raises ValueError: ``k >= N``, or ``NNtype`` is unknown.

    """
    self.Xin = Xin
    self.NNtype = NNtype
    self.use_flann = use_flann
    self.center = center
    self.rescale = rescale
    self.k = k
    self.sigma = sigma
    self.epsilon = epsilon
    self.symmetrize_type = symmetrize_type
    self.dist_type = dist_type
    self.order = order

    N, d = np.shape(self.Xin)
    Xout = self.Xin

    if k >= N:
        raise ValueError('The number of neighbors (k={}) must be smaller '
                         'than the number of nodes ({}).'.format(k, N))

    if self.center:
        Xout = self.Xin - np.kron(np.ones((N, 1)),
                                  np.mean(self.Xin, axis=0))

    if self.rescale:
        bounding_radius = 0.5 * np.linalg.norm(np.amax(Xout, axis=0) -
                                               np.amin(Xout, axis=0), 2)
        scale = np.power(N, 1. / float(min(d, 3))) / 10.
        # Not using *= on purpose: without centering, Xout is still the
        # caller's array, and an in-place multiply would rescale the
        # caller's data (and fail outright for integer input).
        Xout = Xout * (scale / bounding_radius)

    # Translate distance type string to corresponding Minkowski order.
    dist_translation = {"euclidean": 2,
                        "manhattan": 1,
                        "max_dist": np.inf,
                        "minkowski": order
                        }

    if self.NNtype == 'knn':
        spi = np.zeros((N * k))
        spj = np.zeros((N * k))
        spv = np.zeros((N * k))

        # k + 1 neighbours are requested because each point is returned
        # as its own nearest neighbour.
        if self.use_flann:
            pfl = _import_pfl()
            pfl.set_distance_type(dist_type, order=order)
            flann = pfl.FLANN()

            # Default FLANN parameters (I tried changing the algorithm and
            # testing performance on huge matrices, but the default one
            # seems to work best).
            # Note the return order: FLANN gives (indices, distances),
            # the KD-tree below gives (distances, indices).
            NN, D = flann.nn(Xout, Xout, num_neighbors=(k + 1),
                             algorithm='kdtree')

        else:
            kdt = spatial.KDTree(Xout)
            D, NN = kdt.query(Xout, k=(k + 1),
                              p=dist_translation[dist_type])

        if self.sigma is None:
            self.sigma = np.mean(D[:, 1:])  # Discard distance to self.

        for i in range(N):
            spi[i * k:(i + 1) * k] = np.kron(np.ones((k)), i)
            spj[i * k:(i + 1) * k] = NN[i, 1:]
            spv[i * k:(i + 1) * k] = np.exp(-np.power(D[i, 1:], 2) /
                                            float(self.sigma))

    elif self.NNtype == 'radius':

        kdt = spatial.KDTree(Xout)
        D, NN = kdt.query(Xout, k=None, distance_upper_bound=epsilon,
                          p=dist_translation[dist_type])

        if self.sigma is None:
            # Discard distance to self.
            self.sigma = np.mean([np.mean(d[1:]) for d in D])

        # The count includes every point's self-match, so the arrays are
        # N entries longer than what is filled below; the leftover
        # entries stay zero.
        count = 0
        for i in range(N):
            count = count + len(NN[i])

        spi = np.zeros((count))
        spj = np.zeros((count))
        spv = np.zeros((count))

        start = 0
        for i in range(N):
            leng = len(NN[i]) - 1
            spi[start:start + leng] = np.kron(np.ones((leng)), i)
            spj[start:start + leng] = NN[i][1:]
            spv[start:start + leng] = np.exp(-np.power(D[i][1:], 2) /
                                             float(self.sigma))
            start = start + leng

    else:
        raise ValueError('Unknown NNtype {}'.format(self.NNtype))

    W = sparse.csc_matrix((spv, (spi, spj)), shape=(N, N))

    # Sanity check
    if np.shape(W)[0] != np.shape(W)[1]:
        raise ValueError('Weight matrix W is not square')

    # Enforce symmetry. Note that checking symmetry with
    # np.abs(W - W.T).sum() is as costly as the symmetrization itself.
    W = utils.symmetrize(W, method=symmetrize_type)

    super(NNGraph, self).__init__(W, plotting=plotting,
                                  coords=Xout, **kwargs)