def main():
    """
    Cluster the descriptors of the configured index with mini-batch KMeans.

    Command line arguments and configuration are loaded, a
    ``MiniBatchKMeans`` instance is fit/applied through
    ``mb_kmeans_build_apply``, the resulting centroids are saved with
    ``numpy.save`` to ``config['centroids_output_filepath_npy']`` and the
    resulting classification map is pickled to ``args.output_map``.

    :raises ValueError: No output map file path was given.

    """
    args = cli_parser().parse_args()
    config = utility_main_helper(default_config, args)
    log = logging.getLogger(__name__)

    output_filepath = args.output_map
    if not output_filepath:
        raise ValueError("No path given for output map file (pickle).")

    #: :type: smqtk.representation.DescriptorIndex
    index = from_plugin_config(config['descriptor_index'],
                               get_descriptor_index_impls())
    mbkm = MiniBatchKMeans(verbose=args.verbose,
                           compute_labels=False,
                           **config['minibatch_kmeans_params'])
    initial_fit_size = int(config['initial_fit_size'])

    d_classes = mb_kmeans_build_apply(index, mbkm, initial_fit_size)

    log.info("Saving KMeans centroids to: %s",
             config['centroids_output_filepath_npy'])
    numpy.save(config['centroids_output_filepath_npy'], mbkm.cluster_centers_)

    log.info("Saving result classification map to: %s", output_filepath)
    safe_create_dir(os.path.dirname(output_filepath))
    # Protocol -1 selects a binary pickle protocol, so the output file has to
    # be opened in binary mode (text mode can corrupt or reject the stream).
    with open(output_filepath, 'wb') as f:
        cPickle.dump(d_classes, f, -1)

    log.info("Done")
def get_current_iqr_session(self):
    """
    Return the UUID of the current IQR session.

    The flask session ID is used as the IQR session UUID. If the configured
    IQR service does not list that UUID a session is created there, and the
    local per-session working directory and example-data map are set up when
    they do not exist yet.

    :return: Current IQR session UUID.
    :rtype: str

    """
    sid = str(flask.session.sid)

    # Ask the service which sessions it knows; create ours when missing.
    listing = self._iqr_service.get('session_ids')
    listing.raise_for_status()
    created_session = sid not in listing.json()['session_uuids']
    if created_session:
        creation = self._iqr_service.post('session', sid=sid)
        creation.raise_for_status()

    if created_session or (sid not in self._iqr_work_dirs):
        # Local state not initialized yet for this UUID.
        session_dir = osp.join(self.work_dir, sid)
        self._iqr_work_dirs[sid] = session_dir
        self._iqr_example_data[sid] = {}
        safe_create_dir(self._iqr_work_dirs[sid])

    return sid
def _write_file_chunks(self, chunk_map, file_extension=''):
    """
    Given a mapping of chunks, write their contents to a temporary file,
    returning the path to that file.

    Chunks are written in ascending order of their integer index. The
    returned file path should be manually removed by the user.

    :param chunk_map: Mapping of integer index to file-like chunk
    :type chunk_map: dict of (int, StringIO)

    :param file_extension: String extension to suffix the temporary file
        with
    :type file_extension: str

    :raises OSError: OS problems creating temporary file or writing it out.

    :return: Path to temporary combined file
    :rtype: str

    """
    # Make sure write dir exists...
    if not os.path.isdir(self.working_dir):
        file_utils.safe_create_dir(self.working_dir)
    tmp_fd, tmp_path = tempfile.mkstemp(file_extension,
                                        dir=self.working_dir)
    self._log.debug("Combining chunks into temporary file: %s", tmp_path)
    # Wrap the descriptor handed back by ``mkstemp`` rather than opening the
    # path a second time; otherwise that descriptor is never closed. The
    # ``with`` block also closes the file if reading/writing a chunk fails.
    with os.fdopen(tmp_fd, 'wb') as tmp_file:
        for idx in sorted(chunk_map):
            tmp_file.write(chunk_map[idx].read())
    return tmp_path
def dl_image(meta):
    """
    Download the image described by a metadata block unless a file already
    exists at its target path.

    The "stored" URL is tried first (with ``stored_http_auth``), falling
    back on the original URL. Downloaded content is written under
    ``<output_dir>/<index>/<doc_type>/<id><ext>`` where the extension is
    guessed from the content type.

    :param meta: Metadata block. The keys read are ``id``, ``index``,
        ``doc_type`` and ``fields`` (which must hold ``content_type``,
        ``obj_stored_url`` and ``obj_original_url`` sequences).
    :type meta: dict

    :raises KeyError: An expected key was missing from ``meta``. The error
        is logged and re-raised.

    :return: Tuple of the element ID, the local file path and the data
        element UUID, or None when no extension could be guessed or when
        both downloads failed.
    :rtype: None | (object, str, object)

    """
    try:
        c_type = meta['fields']['content_type'][0]
        obj_stored_url = meta['fields']['obj_stored_url'][0]
        obj_original_url = meta['fields']['obj_original_url'][0]

        c_ext = m.guess_extension(c_type, strict=False)
        if c_ext is None:
            log.warning("Guessed 'None' extension for content-type '%s', "
                        "skipping.", c_type)
            return None
        save_dir = os.path.abspath(os.path.expanduser(
            os.path.join(output_dir, meta['index'], meta['doc_type'])
        ))
        save_file = meta['id'] + c_ext
        save_path = os.path.join(save_dir, save_file)

        # Save/write file if needed
        if not os.path.isfile(save_path):
            # First try 'stored' url, fallback on original
            # Return None if failed to download anything
            ok, r = try_download(obj_stored_url, stored_http_auth)
            if not ok:
                log.warning("Failed to download stored-data URL \"%s\" "
                            "(error=%s)", obj_stored_url, str(r))

                ok, r = try_download(obj_original_url)
                if not ok:
                    # Report the URL that actually failed here (the original
                    # URL), not the stored-data URL reported just above.
                    log.warning("Failed to download original URL \"%s\" "
                                "(error=%s)", obj_original_url, str(r))
                    return None

            # Assuming OK at this point
            content = r.content

            d = DataMemoryElement(content, c_type)

            safe_create_dir(save_dir)
            with open(save_path, 'wb') as out:
                log.debug("Saving to file: '%s'", save_path)
                out.write(content)
        else:
            d = DataFileElement(save_path)

        return meta['id'], save_path, d.uuid()
    except KeyError as ex:
        log.error("Failed to find key %s in meta block: %s",
                  str(ex), meta)
        raise
def set_vector(self, new_vec):
    """
    Store the given vector in this element's vector file.

    Any vector previously saved at that file path is overwritten. The
    containing directory is created first.

    :param new_vec: New vector to contain.
    :type new_vec: numpy.core.multiarray.ndarray

    """
    vector_dir = osp.dirname(self._vec_filepath)
    file_utils.safe_create_dir(vector_dir)
    numpy.save(self._vec_filepath, new_vec)
def main():
    """
    Query the configured Solr instance for image paths and write them, one
    per line, to the file given by ``args.paths_file``.

    :raises ValueError: No output paths file was specified.

    """
    description = """
    Utility for fetching remotely stored image paths from the JPL Solr index.
    Files will be transferred with their entire containing directories. For
    example, if the file was stored in "/data/things/image.png" remotely, it
    will be transferred locally to "<output_dir>/data/things/image.png".

    Assumptions:
        - JPL MEMEX Solr index key structure
            - `id` == "file:<abs-filepath>"
            - `mainType` is the first component of the MIMETYPE
            - `indexedAt` timestamp

    """
    args = cli_parser().parse_args()
    config = bin_utils.utility_main_helper(default_config, args)
    log = logging.getLogger(__name__)

    paths_file = args.paths_file
    after_time = args.after_time
    before_time = args.before_time

    #
    # Check dir/file locations
    #
    if paths_file is None:
        raise ValueError("Need a file path to to output transferred file "
                         "paths!")
    output_dir = os.path.dirname(paths_file)
    file_utils.safe_create_dir(output_dir)

    #
    # Start collection
    #
    # An unset time bound is passed on as the ``'*'`` wildcard.
    lower_bound = after_time or '*'
    upper_bound = before_time or '*'
    remote_paths = solr_image_paths(
        config['solr_address'],
        lower_bound, upper_bound,
        config['solr_username'], config['solr_password'],
        config['batch_size']
    )

    log.info("Writing file paths")
    progress_state = [0] * 7
    with open(paths_file, 'w') as out_file:
        for remote_path in remote_paths:
            out_file.write(remote_path + '\n')
            bin_utils.report_progress(log.info, progress_state, 1.)

    # Final report
    progress_state[1] -= 1
    bin_utils.report_progress(log.info, progress_state, 0)
def _get_checkpoint_dir(self, data):
    """
    Get (creating it when needed) the directory holding checkpoint material
    for a given data element.

    The directory is nested under the work directory using the pieces
    produced by ``partition_string`` on the element's UUID string.

    :param data: Data element
    :type data: smqtk.representation.DataElement

    :return: directory path
    :rtype: str

    """
    uuid_pieces = partition_string(str(data.uuid()), 10)
    checkpoint_dir = osp.join(self._work_dir, *uuid_pieces)
    file_utils.safe_create_dir(checkpoint_dir)
    return checkpoint_dir
def add_data(self, *elems):
    """
    Add the given data element(s) instance to this data set.

    Each element is pickled to the file path this set maps its UUID to; the
    parent directory of that file is created first.

    :param elems: Data element(s) to add
    :type elems: list[smqtk.representation.DataElement]

    """
    for element in elems:
        assert isinstance(element, DataElement)
        element_path = self._fp_for_uuid(str(element.uuid()))
        file_utils.safe_create_dir(osp.dirname(element_path))
        with open(element_path, 'wb') as out_file:
            cPickle.dump(element, out_file)
        self._log.debug("Wrote out element %s", element)
def write_temp(d):
    """
    Write the element's bytes to a new temporary file.

    Always creates new file.

    :param d: Directory to create the temporary file in. When falsy no
        directory is created and the value is handed to ``tempfile`` as is.
    :type d: None | str

    :return: Path to file written.
    :rtype: str

    """
    if d:
        file_utils.safe_create_dir(d)
    # Neither ``guess_extension`` nor ``mkstemp`` should be handed None:
    # fall back on empty strings when there is no content type or when no
    # extension could be guessed for it.
    ext = MIMETYPES.guess_extension(self.content_type() or '') or ''
    # Exceptions because mimetypes is apparently REALLY OLD
    if ext in {'.jpe', '.jfif'}:
        ext = '.jpg'
    fd, fp = tempfile.mkstemp(suffix=ext, dir=d)
    # Write through the descriptor ``mkstemp`` already opened so it is
    # closed even when writing fails.
    with os.fdopen(fd, 'wb') as f:
        f.write(self.get_bytes())
    return fp
def set_vector(self, new_vec):
    """
    Store the given vector in this element's vector file and return this
    element.

    Any vector previously saved at that file path is overwritten. The
    containing directory is created first.

    :param new_vec: New vector to contain.
    :type new_vec: numpy.core.multiarray.ndarray

    :returns: Self.
    :rtype: DescriptorFileElement

    """
    vector_dir = osp.dirname(self._vec_filepath)
    file_utils.safe_create_dir(vector_dir)
    numpy.save(self._vec_filepath, new_vec)
    return self
def reset_iqr_session():
    """
    Reset the current IQR session, wiping its working directory and the
    example data / descriptors recorded for it.

    :return: JSON response ``{"success": true}``.

    """
    with self.get_current_iqr_session() as iqrs:
        iqrs.reset()

        # Start over with an empty working directory
        session_dir = self._iqr_work_dirs[iqrs.uuid]
        if os.path.isdir(session_dir):
            shutil.rmtree(session_dir)
        safe_create_dir(self._iqr_work_dirs[iqrs.uuid])

        # Forget example data + descriptors
        for per_session_state in (self._iqr_example_data,
                                  self._iqr_example_pos_descr):
            per_session_state[iqrs.uuid].clear()

        return flask.jsonify({"success": True})
def set_classification(self, m=None, **kwds):
    """
    Set the whole classification map for this element. This will strictly
    overwrite the entire label-confidence mapping (vs. updating it)

    Label/confidence values may either be provided via keyword arguments or
    by providing a dictionary mapping labels to confidence values.

    The mapping returned by the parent class implementation is pickled to
    ``self.filepath`` using ``self.pickle_protocol``, creating the
    containing directory first.

    :param m: New labels-to-confidence mapping to set.
    :type m: dict[collections.Hashable, float]

    :raises ValueError: The given label-confidence map was empty.

    """
    m = super(FileClassificationElement, self)\
        .set_classification(m, **kwds)
    file_utils.safe_create_dir(osp.dirname(self.filepath))
    # Pickle output is a byte stream and the configured protocol may be a
    # binary one, so the file has to be opened in binary mode.
    with open(self.filepath, 'wb') as f:
        cPickle.dump(m, f, self.pickle_protocol)
def reset_iqr_session():
    """
    Reset the current IQR session, wiping its working directory and the
    example data / descriptors recorded for it.

    :return: JSON response ``{"success": true}``.

    """
    with self.get_current_iqr_session() as iqrs:
        iqrs.reset()

        # Start over with an empty working directory
        session_dir = self._iqr_work_dirs[iqrs.uuid]
        if os.path.isdir(session_dir):
            shutil.rmtree(session_dir)
        safe_create_dir(self._iqr_work_dirs[iqrs.uuid])

        # Forget example data + descriptors
        for per_session_state in (self._iqr_example_data,
                                  self._iqr_example_pos_descr):
            per_session_state[iqrs.uuid].clear()

        response = {"success": True}
        return flask.jsonify(response)
def get_current_iqr_session(self):
    """
    Get the IQR session instance for the current flask session.

    When the controller has no session registered under the flask session
    ID, a new ``IqrSession`` is created and registered, its working
    directory is created and its example data / descriptor maps are
    initialized.

    :rtype: smqtk.IQR.iqr_session.IqrSession

    """
    with self._iqr_controller:
        sid = flask.session.sid
        is_known = self._iqr_controller.has_session_uuid(sid)
        if not is_known:
            iqr_sess = IqrSession(self._pos_seed_neighbors,
                                  self._rel_index_config,
                                  sid)
            self._iqr_controller.add_session(iqr_sess, sid)

            # Local state is keyed on the new session's own UUID.
            session_dir = osp.join(self.work_dir, sid)
            self._iqr_work_dirs[iqr_sess.uuid] = session_dir
            safe_create_dir(self._iqr_work_dirs[iqr_sess.uuid])
            self._iqr_example_data[iqr_sess.uuid] = {}
            self._iqr_example_pos_descr[iqr_sess.uuid] = {}

        return self._iqr_controller.get_session(sid)
def _save_mrpt_model(self):
    """
    Cache the index trees and the index parameters to their configured
    files.

    Each file is only written when its file path is set; the containing
    directory is created before writing. Both files are pickles written
    with ``self._pickle_protocol``.

    """
    self._log.debug("Caching index and parameters: %s, %s",
                    self._index_filepath, self._index_param_filepath)
    if self._index_filepath:
        self._log.debug("Caching index: %s", self._index_filepath)
        safe_create_dir(osp.dirname(self._index_filepath))
        # noinspection PyTypeChecker
        with open(self._index_filepath, "wb") as f:
            pickle.dump(self._trees, f, self._pickle_protocol)
    if self._index_param_filepath:
        self._log.debug("Caching index params: %s",
                        self._index_param_filepath)
        safe_create_dir(osp.dirname(self._index_param_filepath))
        params = {
            "read_only": self._read_only,
            "num_trees": self._num_trees,
            "depth": self._depth,
        }
        # Pickle output is a byte stream: open in binary mode, same as the
        # index file above.
        # noinspection PyTypeChecker
        with open(self._index_param_filepath, "wb") as f:
            pickle.dump(params, f, self._pickle_protocol)
def get_preview_image(self, elem):
    """
    Get the filepath to the preview image for the given data element.

    Previously generated previews are returned from the preview cache.
    Otherwise a generation method is looked up in ``PREVIEW_GEN_METHOD``,
    first by the element's full content type and then by its content class
    (the part before the first ``/``), and the generated path is cached.

    :raises ValueError: Do not know how to generate a preview image for the
        given element's content type.

    :param elem: Data element to generate a preview image for.
    :type elem: smqtk.representation.DataElement

    :return: Path to the preview image for the given data element.
    :rtype: str

    """
    uuid = elem.uuid()
    if uuid in self._preview_cache:
        return self._preview_cache[uuid]

    # else, generate preview image based on content type / content class
    content_type = elem.content_type()
    if content_type in self.PREVIEW_GEN_METHOD:
        # Log the content type value; passing the bound method itself would
        # print its repr instead.
        self._log.debug("Generating preview image based on content type: "
                        "%s", content_type)
        generator = self.PREVIEW_GEN_METHOD[content_type]
    else:
        content_class = content_type.split('/', 1)[0]
        if content_class not in self.PREVIEW_GEN_METHOD:
            raise ValueError("No preview generation method for the data "
                             "element provided, of content type '%s'."
                             % content_type)
        self._log.debug("Generating preview image based on content "
                        "class: %s", content_class)
        generator = self.PREVIEW_GEN_METHOD[content_class]

    file_utils.safe_create_dir(self._cache_dir)
    fp = generator(elem, self._cache_dir)
    self._preview_cache[uuid] = fp
    return fp
def test_existError_alreadyExists(self, mock_os_makedirs, mock_osp_exists):
    """
    With ``os.makedirs`` raising an EEXIST ``OSError`` and the path
    reported as existing, ``safe_create_dir`` should return the path it was
    given after checking existence exactly once.
    """
    existing_dir = '/existing/dir'
    mock_osp_exists.return_value = True
    mock_os_makedirs.side_effect = OSError(errno.EEXIST,
                                           "Existing directory")

    returned = file_utils.safe_create_dir(existing_dir)

    ntools.assert_true(mock_os_makedirs.called)
    ntools.assert_true(mock_osp_exists.called)
    mock_osp_exists.assert_called_once_with(existing_dir)
    ntools.assert_equal(returned, existing_dir)
def reset_session_local(self, sid):
    """
    Reset the state this server holds locally for a session ID: the
    session's work sub-directory is emptied and its example data cleared.

    A given ``sid`` must have been created first. This happens in the
    ``get_current_iqr_session`` method.

    This does not affect the linked IQR service.

    :param sid: Session ID to reset for.
    :type sid: str

    :raises KeyError: ``sid`` not recognized. Probably not initialized
        first.

    """
    session_dir = self._iqr_work_dirs[sid]
    # Remove and re-create the work sub-directory so it ends up empty.
    if os.path.isdir(session_dir):
        shutil.rmtree(session_dir)
    safe_create_dir(self._iqr_work_dirs[sid])

    self._iqr_example_data[sid].clear()
def get_preview_image(self, elem):
    """
    Get the filepath to the preview image for the given data element.

    Previously generated previews are returned from the preview cache.
    Otherwise a generation method is looked up in ``PREVIEW_GEN_METHOD``,
    first by the element's full content type and then by its content class
    (the part before the first ``/``). The method is called with this
    instance, the element and the cache directory, and the generated path
    is cached.

    :raises ValueError: Do not know how to generate a preview image for the
        given element's content type.

    :param elem: Data element to generate a preview image for.
    :type elem: smqtk.representation.DataElement

    :return: Path to the preview image for the given data element.
    :rtype: str

    """
    uuid = elem.uuid()
    if uuid in self._preview_cache:
        return self._preview_cache[uuid]

    # else, generate preview image based on content type / content class
    content_type = elem.content_type()
    if content_type in self.PREVIEW_GEN_METHOD:
        # Log the content type value; passing the bound method itself would
        # print its repr instead.
        self._log.debug("Generating preview image based on content type: "
                        "%s", content_type)
        generator = self.PREVIEW_GEN_METHOD[content_type]
    else:
        content_class = content_type.split('/', 1)[0]
        if content_class not in self.PREVIEW_GEN_METHOD:
            raise ValueError("No preview generation method for the data "
                             "element provided, of content type '%s'."
                             % content_type)
        self._log.debug("Generating preview image based on content "
                        "class: %s", content_class)
        generator = self.PREVIEW_GEN_METHOD[content_class]

    file_utils.safe_create_dir(self._cache_dir)
    # Generation methods are stored unbound, hence the explicit ``self``.
    fp = generator(self, elem, self._cache_dir)
    self._preview_cache[uuid] = fp
    return fp
def _write_new_temp(self, d):
    """
    Write this element's bytes out to a freshly created temporary file.

    A new file is created on every call. The file suffix is guessed from
    the element's content type, when one can be guessed.

    :param d: directory to write temp file in or None to use system
        default.
    :returns: path to file written

    """
    if d:
        file_utils.safe_create_dir(d)

    suffix = MIMETYPES.guess_extension(self.content_type() or '')
    # mimetypes may hand back dated JPEG extensions; use '.jpg' for those.
    if suffix in {'.jpe', '.jfif'}:
        suffix = '.jpg'
    if not suffix:
        suffix = ''

    handle, temp_path = tempfile.mkstemp(suffix=suffix, dir=d)
    os.close(handle)
    with open(temp_path, 'wb') as out_file:
        out_file.write(self.get_bytes())
    return temp_path
def _write_new_temp(self, d):
    """
    Actually write our bytes to a new temp file

    Always creates new file. The file suffix is guessed from the element's
    content type; no suffix is used when there is no content type or no
    extension can be guessed for it.

    :param d: directory to write temp file in or None to use system
        default.
    :returns: path to file written

    """
    if d:
        file_utils.safe_create_dir(d)
    # Do not hand None to ``guess_extension`` when the element has no
    # content type; an empty string simply yields no extension.
    ext = MIMETYPES.guess_extension(self.content_type() or '')
    # Exceptions because mimetypes is apparently REALLY OLD
    if ext in {'.jpe', '.jfif'}:
        ext = '.jpg'
    fd, fp = tempfile.mkstemp(
        suffix=ext or '',
        dir=d
    )
    # Write through the descriptor ``mkstemp`` already opened so it is
    # closed even when writing fails.
    with os.fdopen(fd, 'wb') as f:
        f.write(self.get_bytes())
    return fp
def set_classification(self, m=None, **kwds):
    """
    Set the whole classification map for this element. This will strictly
    overwrite the entire label-confidence mapping (vs. updating it)

    Label/confidence values may either be provided via keyword arguments or
    by providing a dictionary mapping labels to confidence values.

    The sum of all confidence values, must be ``1.0`` (e.g. input cannot be
    empty). Due to possible floating point error, we round to the 9-th
    decimal digit.

    The mapping returned by the parent class implementation is pickled to
    ``self.filepath``, creating the containing directory first.

    :param m: New labels-to-confidence mapping to set.
    :type m: dict[collections.Hashable, float]

    :raises ValueError: The given label-confidence map was empty or values
        did no sum to ``1.0``.

    """
    m = super(FileClassificationElement, self)\
        .set_classification(m, **kwds)
    file_utils.safe_create_dir(osp.dirname(self.filepath))
    # Pickle output is a byte stream; write it through a binary-mode file so
    # it is not subject to text-mode translation.
    with open(self.filepath, 'wb') as f:
        cPickle.dump(m, f)
def build_index(self, descriptors):
    """
    Build the index over the descriptors data elements.

    Subsequent calls to this method should rebuild the index, not add to
    it.

    Implementation Notes:
        - We keep a cache file serialization around for our index in case
          sub-processing occurs so as to be able to recover from the
          underlying C data not being there. This could cause issues if a
          main or child process rebuild's the index, as we clear the old
          cache away.

    :raises ValueError: No data available in the given iterable.

    :param descriptors: Iterable of descriptors elements to build index
        over.
    :type descriptors:
        collections.Iterable[smqtk.representation.DescriptorElement]

    """
    # Not caring about restoring the index because we're just making a new
    # one
    self._log.info("Building new FLANN index")

    self._log.debug("Storing descriptors")
    self._descr_cache = list(descriptors)
    if not self._descr_cache:
        raise ValueError("No data provided in given iterable.")
    # Cache descriptors if we have a path
    if self._descr_cache_filepath:
        self._log.debug("Caching descriptors: %s",
                        self._descr_cache_filepath)
        safe_create_dir(osp.dirname(self._descr_cache_filepath))
        with open(self._descr_cache_filepath, "wb") as f:
            cPickle.dump(self._descr_cache, f)

    params = {
        "target_precision": self._build_target_precision,
        "sample_fraction": self._build_sample_frac,
        "log_level": ("info"
                      if self._log.getEffectiveLevel() <= logging.DEBUG
                      else "warning"),
    }
    if self._build_autotune:
        params["algorithm"] = "autotuned"
    if self._rand_seed is not None:
        params["random_seed"] = self._rand_seed
    pyflann.set_distance_type(self._distance_method)

    self._log.debug("Accumulating descriptor vectors into matrix for FLANN")
    pts_array = [d.vector() for d in self._descr_cache]
    pts_array = numpy.array(pts_array, dtype=pts_array[0].dtype)
    self._flann = pyflann.FLANN()
    self._flann_build_params = self._flann.build_index(pts_array, **params)
    # The matrix is not used past this point; drop the local reference.
    del pts_array

    self._log.debug("Caching index and state: %s, %s",
                    self._index_filepath, self._index_param_filepath)
    if self._index_filepath:
        self._log.debug("Caching index: %s", self._index_filepath)
        safe_create_dir(osp.dirname(self._index_filepath))
        self._flann.save_index(self._index_filepath)
    if self._index_param_filepath:
        self._log.debug("Caching index params: %s",
                        self._index_param_filepath)
        state = {
            "b_autotune": self._build_autotune,
            "b_target_precision": self._build_target_precision,
            "b_sample_frac": self._build_sample_frac,
            "distance_method": self._distance_method,
            "flann_build_params": self._flann_build_params,
        }
        safe_create_dir(osp.dirname(self._index_param_filepath))
        # Pickle output is a byte stream: open in binary mode, same as the
        # descriptor cache file above.
        with open(self._index_param_filepath, "wb") as f:
            cPickle.dump(state, f)

    # Remember which process built this index.
    self._pid = multiprocessing.current_process().pid
def codebook_filepath(self):
    """
    Path of the codebook ``.npy`` file inside the model directory, named
    after this descriptor's type. The model directory is created first.

    :rtype: str

    """
    file_utils.safe_create_dir(self._model_dir)
    filename = "%s.codebook.npy" % (self.descriptor_type(),)
    return osp.join(self._model_dir, filename)
def test_noExists(self, mock_os_makedirs):
    """
    ``safe_create_dir`` should call ``os.makedirs`` and return the path it
    was given.
    """
    target_dir = "/some/directory/somewhere"

    returned = file_utils.safe_create_dir(target_dir)

    ntools.assert_true(mock_os_makedirs.called)
    ntools.assert_equals(returned, target_dir)
def temp_dir(self):
    """
    Create the ``temp_files`` directory under the work directory and give
    back whatever ``safe_create_dir`` returns for it.
    """
    temp_path = osp.join(self._work_dir, 'temp_files')
    return file_utils.safe_create_dir(temp_path)
def build_index(self, descriptors):
    """
    Build the index over the descriptors data elements.

    Subsequent calls to this method should rebuild the index, not add to
    it.

    Implementation Notes:
        - We keep a cache file serialization around for our index in case
          sub-processing occurs so as to be able to recover from the
          underlying C data not being there. This could cause issues if a
          main or child process rebuild's the index, as we clear the old
          cache away.

    :raises ValueError: No data available in the given iterable.

    :param descriptors: Iterable of descriptors elements to build index
        over.
    :type descriptors:
        collections.Iterable[smqtk.representation.DescriptorElement]

    """
    # Not caring about restoring the index because we're just making a new
    # one
    self._log.info("Building new FLANN index")

    self._log.debug("Storing descriptors")
    self._descr_cache = list(descriptors)
    if not self._descr_cache:
        raise ValueError("No data provided in given iterable.")
    # Cache descriptors if we have a path
    if self._descr_cache_filepath:
        self._log.debug("Caching descriptors: %s",
                        self._descr_cache_filepath)
        safe_create_dir(osp.dirname(self._descr_cache_filepath))
        with open(self._descr_cache_filepath, 'wb') as f:
            cPickle.dump(self._descr_cache, f, -1)

    params = {
        "target_precision": self._build_target_precision,
        "sample_fraction": self._build_sample_frac,
        "log_level": ("info"
                      if self._log.getEffectiveLevel() <= logging.DEBUG
                      else "warning")
    }
    if self._build_autotune:
        params['algorithm'] = "autotuned"
    if self._rand_seed is not None:
        params['random_seed'] = self._rand_seed
    pyflann.set_distance_type(self._distance_method)

    self._log.debug(
        "Accumulating descriptor vectors into matrix for FLANN")
    pts_array = elements_to_matrix(self._descr_cache, report_interval=1.0)
    self._log.debug('Building FLANN index')
    self._flann = pyflann.FLANN()
    self._flann_build_params = self._flann.build_index(pts_array, **params)
    # The matrix is not used past this point; drop the local reference.
    del pts_array

    self._log.debug("Caching index and state: %s, %s",
                    self._index_filepath, self._index_param_filepath)
    if self._index_filepath:
        self._log.debug("Caching index: %s", self._index_filepath)
        safe_create_dir(osp.dirname(self._index_filepath))
        self._flann.save_index(self._index_filepath)
    if self._index_param_filepath:
        self._log.debug("Caching index params: %s",
                        self._index_param_filepath)
        state = {
            'b_autotune': self._build_autotune,
            'b_target_precision': self._build_target_precision,
            'b_sample_frac': self._build_sample_frac,
            'distance_method': self._distance_method,
            'flann_build_params': self._flann_build_params,
        }
        safe_create_dir(osp.dirname(self._index_param_filepath))
        # Protocol -1 selects a binary pickle protocol, so the file has to
        # be opened in binary mode, same as the descriptor cache above.
        with open(self._index_param_filepath, 'wb') as f:
            cPickle.dump(state, f, -1)

    # Remember which process built this index.
    self._pid = multiprocessing.current_process().pid
def save_plt(output_dir, file_name, show):
    """
    Save the current pyplot figure to a file and optionally show it.

    :param output_dir: Directory to save into; created first.
    :type output_dir: str

    :param file_name: Name of the file to save within ``output_dir``.
    :type file_name: str

    :param show: When true, ``plt.show()`` is called after saving.
    :type show: bool

    """
    file_utils.safe_create_dir(output_dir)
    plt.savefig(os.path.join(output_dir, file_name))
    if not show:
        return
    plt.show()
def build_index(self, descriptors):
    """
    Build the index over the descriptor data elements.

    The first part of this method is equivalent to the compressITQ function
    from UNC-CH's implementation.

    Steps, in order: collect the given descriptors, stack them into a
    matrix via ``elements_to_matrix``, center the matrix on its mean vector,
    project onto the eigenvectors with the top ``bit_len`` eigenvalues of
    the covariance matrix, find the ITQ rotation, then add the resulting
    codes with their descriptors to the code index. The mean vector and
    rotation matrix are saved with ``numpy.save`` when their cache file
    paths are set.

    :raises RuntimeError: A current data model is loaded, or the current
        CodeIndex is not empty.
    :raises ValueError: No data available in the given iterable.

    :param descriptors: Iterable of descriptor elements to build index
        over.
    :type descriptors:
        collections.Iterable[smqtk.representation.DescriptorElement]

    """
    # Halt if we are going to overwrite a loaded mean/rotation cache.
    if not (self._mean_vector is None and self._r is None):
        raise RuntimeError("Current ITQ model is not empty (cached mean / "
                           "rotation). For the sake of protecting data, we "
                           "are not proceeding.")
    # Halt if the code index currently isn't empty
    if self.count():
        raise RuntimeError("Current CodeIndex instance is not empty. For "
                           "the sake of protecting data, we are not "
                           "proceeding.")

    self._log.debug("Using %d length bit-vectors", self._bit_len)

    # TODO: Sub-sample down descriptors to use for PCA + ITQ
    #       - Harry was also working on an iterative training approach so
    #         that we only have to have a limited number of vectors in
    #         memory at a time.

    # NOTE(review): truthiness check -- a seed value of 0 does not seed the
    #   generator here. Confirm whether ``is not None`` was intended.
    if self._rand_seed:
        numpy.random.seed(self._rand_seed)

    with SimpleTimer("Creating descriptor cache", self._log.info):
        # Materialize the iterable: elements are indexed again at the end.
        #: :type: list[smqtk.representation.DescriptorElement]
        descr_cache = []
        for d in descriptors:
            descr_cache.append(d)
    if not descr_cache:
        raise ValueError("No descriptors given!")

    with SimpleTimer("Creating matrix of descriptors for training",
                     self._log.info):
        # Get non-memory vectors on separate processes and aggregate into
        # matrix.
        self._log.debug("Input elements: %d", len(descr_cache))
        self._log.debug("Input elem size: %s", descr_cache[0].vector().size)
        # Only request progress reporting when debug logging is active.
        dbg_report_interval = None
        if self.logger().getEffectiveLevel() <= logging.DEBUG:
            dbg_report_interval = 1.0  # seconds
        x = elements_to_matrix(descr_cache,
                               report_interval=dbg_report_interval)
        self._log.debug("descriptor matrix shape: %s", x.shape)

    with SimpleTimer("Centering data", self._log.info):
        # center the data, VERY IMPORTANT for ITQ to work
        self._mean_vector = numpy.mean(x, axis=0)
        x -= self._mean_vector
    if self._mean_vec_cache_filepath:
        with SimpleTimer("Saving mean vector", self._log.info):
            file_utils.safe_create_dir(osp.dirname(self._mean_vec_cache_filepath))
            numpy.save(self._mean_vec_cache_filepath, self._mean_vector)

    # PCA
    with SimpleTimer("Computing PCA transformation", self._log.info):
        # numpy and matlab observation format is flipped, thus added
        # transpose
        self._log.debug("-- computing covariance")
        c = numpy.cov(x.transpose())

        # Direct translation
        # - eigen vectors are the columns of ``pc``
        self._log.debug('-- computing linalg.eig')
        l, pc = numpy.linalg.eig(c)
        # Sorted descending on the eigenvalue, keeping top ``bit_len``
        # pairs.
        self._log.debug('-- computing top pairs')
        top_pairs = sorted(zip(l, pc.transpose()),
                           key=lambda p: p[0],
                           reverse=1
                           )[:self._bit_len]

        # # Harry translation -- Uses singular values / vectors, not eigen
        # # - singular vectors are the rows of pc
        # pc, l, _ = numpy.linalg.svd(c)
        # top_pairs = sorted(zip(l, pc),
        #                    key=lambda p: p[0],
        #                    reverse=1
        #                    )[:self._bit_len]

        # Eigen-vectors of top ``bit_len`` magnitude eigenvalues
        self._log.debug("-- top vector extraction")
        pc_top = numpy.array([p[1] for p in top_pairs]).transpose()
        self._log.debug("-- transform centered data by PC matrix")
        xx = numpy.dot(x, pc_top)

    # ITQ to find optimal rotation.
    #   `c` is the output codes for matrix `x`
    #   `r` is the rotation found by ITQ
    # Note that ``c`` is re-bound here; the covariance matrix computed above
    # is no longer needed.
    with SimpleTimer("Performing ITQ to find optimal rotation",
                     self._log.info):
        c, self._r = self._find_itq_rotation(xx, self._itq_iter_num)
        # De-adjust rotation with PC vector
        self._r = numpy.dot(pc_top, self._r)
    if self._rotation_cache_filepath:
        with SimpleTimer("Saving rotation matrix", self._log.info):
            file_utils.safe_create_dir(osp.dirname(self._rotation_cache_filepath))
            numpy.save(self._rotation_cache_filepath, self._r)

    # Populating small-code index
    #   - Converting bit-vectors proved faster than creating new codes over
    #     again (~0.01s vs ~0.04s for 80 vectors).
    with SimpleTimer("Clearing code index", self._log.info):
        self._code_index.clear()
    with SimpleTimer("Converting bit-vectors into small codes, inserting "
                     "into code index", self._log.info):
        # Row ``i`` of ``c`` is paired with the ``i``-th input descriptor.
        # NOTE(review): ``xrange`` only exists on Python 2.
        self._code_index.add_many_descriptors(
            (bit_utils.bit_vector_to_int(c[i]), descr_cache[i])
            for i in xrange(c.shape[0])
        )
def temp_dir(self):
    """
    Create the ``temp_files`` directory under the work directory and give
    back whatever ``safe_create_dir`` returns for it.
    """
    temp_path = osp.join(self._work_dir, 'temp_files')
    return file_utils.safe_create_dir(temp_path)
def work_dir(self):
    """
    Work directory path of this instance. The directory is created on
    access.
    """
    directory = self._work_dir
    file_utils.safe_create_dir(directory)
    return self._work_dir
def main():
    """
    Classify the descriptors listed in a UUIDs file and write the results
    out as CSV.

    UUIDs are read from ``args.uuids_list``, their descriptors are fetched
    from the configured descriptor index and classified with the configured
    classifier, both through ``parallel.parallel_map``. Two files are
    written: a header file (``args.csv_header``) holding ``uuid`` followed
    by the classifier's labels, and a data file (``args.csv_data``) with one
    row per classification.

    :raises ValueError: The UUIDs list file path was not given or is not a
        file, or one of the CSV output paths was not given.

    """
    args = cli_parser().parse_args()
    config = bin_utils.utility_main_helper(default_config, args)
    log = logging.getLogger(__name__)

    # - parallel_map UUIDs to load from the configured index
    # - classify iterated descriptors

    uuids_list_filepath = args.uuids_list
    output_csv_filepath = args.csv_data
    output_csv_header_filepath = args.csv_header
    classify_overwrite = config['utility']['classify_overwrite']

    p_use_multiprocessing = \
        config['utility']['parallel']['use_multiprocessing']
    p_index_extraction_cores = \
        config['utility']['parallel']['index_extraction_cores']
    p_classification_cores = \
        config['utility']['parallel']['classification_cores']

    # Validate inputs before initializing any plugin.
    if not uuids_list_filepath:
        raise ValueError("No uuids_list_filepath specified.")
    elif not os.path.isfile(uuids_list_filepath):
        raise ValueError("Given uuids_list_filepath did not point to a file.")
    if output_csv_header_filepath is None:
        raise ValueError("Need a path to save CSV header labels")
    if output_csv_filepath is None:
        raise ValueError("Need a path to save CSV data.")

    #
    # Initialize configured plugins
    #

    log.info("Initializing descriptor index")
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(
        config['plugins']['descriptor_index'],
        get_descriptor_index_impls()
    )

    log.info("Initializing classification factory")
    c_factory = ClassificationElementFactory.from_config(
        config['plugins']['classification_factory']
    )

    log.info("Initializing classifier")
    #: :type: smqtk.algorithms.Classifier
    classifier = plugin.from_plugin_config(
        config['plugins']['classifier'],
        get_classifier_impls()
    )

    #
    # Setup/Process
    #
    def iter_uuids():
        # Yield one whitespace-stripped UUID string per line of the list
        # file.
        with open(uuids_list_filepath) as f:
            for l in f:
                yield l.strip()

    def descr_for_uuid(uuid):
        """
        Fetch the descriptor for a UUID from the configured index.

        :type uuid: collections.Hashable
        :rtype: smqtk.representation.DescriptorElement
        """
        return descriptor_index.get_descriptor(uuid)

    def classify_descr(d):
        """
        Classify a descriptor with the configured classifier and factory.

        :type d: smqtk.representation.DescriptorElement
        :rtype: smqtk.representation.ClassificationElement
        """
        return classifier.classify(d, c_factory, classify_overwrite)

    log.info("Initializing uuid-to-descriptor parallel map")
    #: :type: collections.Iterable[smqtk.representation.DescriptorElement]
    element_iter = parallel.parallel_map(
        descr_for_uuid, iter_uuids(),
        use_multiprocessing=p_use_multiprocessing,
        cores=p_index_extraction_cores,
        name="descr_for_uuid",
    )

    # The output of the first map is fed straight into the second one.
    log.info("Initializing descriptor-to-classification parallel map")
    #: :type: collections.Iterable[smqtk.representation.ClassificationElement]
    classification_iter = parallel.parallel_map(
        classify_descr, element_iter,
        use_multiprocessing=p_use_multiprocessing,
        cores=p_classification_cores,
        name='classify_descr',
    )

    #
    # Write/Output files
    #

    c_labels = classifier.get_labels()

    def make_row(c):
        """
        Build a CSV row: the element's ``uuid`` followed by its confidence
        for each classifier label, in ``c_labels`` order.

        :type c: smqtk.representation.ClassificationElement
        """
        c_m = c.get_classification()
        return [c.uuid] + [c_m[l] for l in c_labels]

    # column labels file
    # NOTE(review): 'wb' is the Python 2 convention for ``csv`` output
    #   files; Python 3's ``csv.writer`` expects a text-mode file.
    log.info("Writing CSV column header file: %s",
             output_csv_header_filepath)
    file_utils.safe_create_dir(os.path.dirname(output_csv_header_filepath))
    with open(output_csv_header_filepath, 'wb') as f_csv:
        w = csv.writer(f_csv)
        w.writerow(['uuid'] + c_labels)

    # CSV file
    log.info("Writing CSV data file: %s", output_csv_filepath)
    file_utils.safe_create_dir(os.path.dirname(output_csv_filepath))
    r_state = [0] * 7
    with open(output_csv_filepath, 'wb') as f_csv:
        w = csv.writer(f_csv)
        for c in classification_iter:
            w.writerow(make_row(c))
            bin_utils.report_progress(log.info, r_state, 1.0)

    # Final report
    r_state[1] -= 1
    bin_utils.report_progress(log.info, r_state, 0)

    log.info("Done")
def codebook_filepath(self):
    """
    Path of the codebook ``.npy`` file inside the model directory, named
    after this descriptor's type. The model directory is created first.

    :rtype: str

    """
    file_utils.safe_create_dir(self._model_dir)
    filename = "%s.codebook.npy" % (self.descriptor_type(),)
    return osp.join(self._model_dir, filename)
def _compute_descriptor(self, data):
    """
    Given some kind of data, process and return a feature vector as a Numpy
    array.

    Descriptors generated for the data are quantized against the codebook
    through a loaded FLANN index, either directly or, when ``self._use_sp``
    is set, through spatial pyramid histograms. The resulting histogram is
    also saved to the element's checkpoint feature file.

    :raises RuntimeError: Feature extraction failure of some kind.

    :param data: Some kind of input data for the feature descriptor. This is
        descriptor dependent.
    :type data: smqtk.representation.DataElement

    :return: Feature vector. This is a histogram of N bins where N is the
        number of centroids in the codebook. Bin values is percent
        composition, not absolute counts.
    :rtype: numpy.ndarray

    """
    super(ColorDescriptor_Base, self)._compute_descriptor(data)

    checkpoint_filepath = self._get_checkpoint_feature_file(data)
    # Loading an existing checkpoint is currently disabled:
    # if osp.isfile(checkpoint_filepath):
    #     return numpy.load(checkpoint_filepath)

    if not self.has_model:
        raise RuntimeError("No model currently loaded! Check the existence "
                           "or, or generate, model files!\n"
                           "Codebook path: %s\n"
                           "FLANN Index path: %s"
                           % (self.codebook_filepath,
                              self.flann_index_filepath))

    self._log.debug("Computing descriptors for data UID[%s]...",
                    data.uuid())
    info, descriptors = self._generate_descriptor_matrices({data})

    # Load FLANN components
    pyflann.set_distance_type(self._flann_distance_metric)
    flann = pyflann.FLANN()
    flann.load_index(self.flann_index_filepath, self._codebook)

    if not self._use_sp:
        ###
        # Codebook Quantization
        #
        # - loaded the model at class initialization if we had one
        self._log.debug("Quantizing descriptors")

        try:
            # If the distance method is HIK, we need to treat it special
            # since that method produces a similarity score, not a distance
            # score.
            #
            if self._flann_distance_metric == 'hik':
                # This searches for all NN instead of minimum between n and
                # the number of descriptors and keeps the last one because
                # hik is a similarity score and not a distance, which is
                # also why the values in dists is flipped below.
                #: :type: (numpy.ndarray, numpy.ndarray)
                idxs = flann.nn_index(descriptors,
                                      self._codebook.shape[0])[0]
                # Only keep the last index for each descriptor return
                idxs = numpy.array([i_array[-1] for i_array in idxs])
            else:
                #: :type: (numpy.ndarray, numpy.ndarray)
                idxs = flann.nn_index(descriptors, 1)[0]
        except AssertionError:
            # Log the shapes involved to help diagnose, then propagate.
            self._log.error("Codebook shape : %s", self._codebook.shape)
            self._log.error("Descriptor shape: %s", descriptors.shape)
            raise

        # Create histogram
        # - Using explicit bin slots to prevent numpy from automatically
        #   creating tightly constrained bins. This would otherwise cause
        #   histograms between two inputs to be non-comparable (unaligned
        #   bins).
        # - See numpy note about ``bins`` to understand why the +1 is
        #   necessary
        # - Learned from spatial implementation that we could feed multiple
        #   neighbors per descriptor into here, leading to a more populated
        #   histogram.
        #   - Could also possibly weight things based on dist from
        #     descriptor?
        #: :type: numpy.core.multiarray.ndarray
        h = numpy.histogram(idxs,  # indices are all integers
                            bins=numpy.arange(self._codebook.shape[0]+1))[0]
        # self._log.debug("Quantization histogram: %s", h)
        # Normalize histogram into relative frequencies
        # - Not using /= on purpose. h is originally int32 coming out of
        #   histogram. /= would keep int32 type when we want it to be
        #   transformed into a float type by the division.
        if h.sum():
            # noinspection PyAugmentAssignment
            h = h / float(h.sum())
        else:
            # Nothing was counted: return zeros rather than dividing by 0.
            h = numpy.zeros(h.shape, h.dtype)
        # self._log.debug("Normalized histogram: %s", h)

    else:
        ###
        # Spatial Pyramid Quantization
        #
        self._log.debug("Quantizing descriptors using spatial pyramid")
        ##
        # Quantization factor - number of nearest codes to be saved
        q_factor = 10
        ##
        # Concatenating spatial information to descriptor vectors to format:
        #   [ x y <descriptor> ]
        self._log.debug("Creating combined descriptor matrix")
        m = numpy.concatenate((info[:, :2],
                               descriptors), axis=1)
        ##
        # Creating quantized vectors, consisting vector:
        #   [ x y c_1 ... c_qf dist_1 ... dist_qf ]
        # which has a total size of 2+(qf*2)
        #
        # Sangmin's code included the distances in the quantized vector, but
        # then also passed this vector into numpy's histogram function with
        # integral bins, causing the [0,1] to be heavily populated, which
        # doesn't make sense to do.
        #   idxs, dists = flann.nn_index(m[:, 2:], q_factor)
        #   q = numpy.concatenate([m[:, :2], idxs, dists], axis=1)
        self._log.debug("Computing nearest neighbors")
        if self._flann_distance_metric == 'hik':
            # Query full ordering of code indices
            idxs = flann.nn_index(m[:, 2:], self._codebook.shape[0])[0]
            # Extract the right-side block for use in building histogram
            # Order doesn't actually matter in the current implementation
            #   because index relative position is not being weighted.
            idxs = idxs[:, -q_factor:]
        else:
            idxs = flann.nn_index(m[:, 2:], q_factor)[0]
        self._log.debug("Creating quantization matrix")
        # This matrix consists of descriptor (x,y) position + near code
        #   indices.
        q = numpy.concatenate([m[:, :2], idxs], axis=1)
        ##
        # Build spatial pyramid from quantized matrix
        self._log.debug("Building spatial pyramid histograms")
        hist_sp = self._build_sp_hist(q, self._codebook.shape[0])
        ##
        # Combine each quadrants into single vector
        self._log.debug("Combining global+thirds into final histogram.")
        f = sys.float_info.min  # so as we don't div by 0 accidentally

        def rf_norm(hist):
            # Relative frequencies of one histogram.
            return hist / (float(hist.sum()) + f)
        # Only entries 0, 5, 6 and 7 of the pyramid are used.
        # NOTE(review): ``axis=1`` assumes each entry is at least 2-D --
        #   confirm against ``_build_sp_hist``.
        h = numpy.concatenate([rf_norm(hist_sp[0]),
                               rf_norm(hist_sp[5]),
                               rf_norm(hist_sp[6]),
                               rf_norm(hist_sp[7])],
                              axis=1)
        # noinspection PyAugmentAssignment
        h /= h.sum()

    self._log.debug("Saving checkpoint feature file")
    if not osp.isdir(osp.dirname(checkpoint_filepath)):
        file_utils.safe_create_dir(osp.dirname(checkpoint_filepath))
    numpy.save(checkpoint_filepath, h)

    return h
def main():
    """
    Compute LSH hash codes for indexed descriptors and pickle the resulting
    hash-to-UUIDs mapping.

    The descriptor index and LSH functor are built from the ``plugins``
    section of the configuration. UUIDs to process come from the
    ``uuids_list`` file when one is given (one UUID per line, stripped),
    otherwise from every key of the descriptor index. When the
    ``input_hash2uuids`` file exists it is unpickled and used as the starting
    mapping, otherwise an empty dictionary is used. The mapping is handed to
    ``compute_hash_codes`` (presumably updated in place -- the return value is
    not used here) and then pickled to a ``.WRITING`` sibling of the output
    path which is renamed onto the output path once fully written.

    :raises ValueError: No output file path was provided.
    """
    description = """
    Compute LSH hash codes based on the provided functor on specific
    descriptors from the configured index given a file-list of UUIDs.

    When using an input file-list of UUIDs, we require that the UUIDs of
    indexed descriptors be strings, or equality comparable to the UUIDs' string
    representation.

    This script can be used to live update the ``hash2uuid_cache_filepath``
    model file for the ``LSHNearestNeighborIndex`` algorithm as output
    dictionary format is the same as used by that implementation.
    """
    args, config = bin_utils.utility_main_helper(default_config, description,
                                                 extend_parser)
    log = logging.getLogger(__name__)

    #
    # Load configuration contents
    #
    uuid_list_filepath = args.uuids_list
    hash2uuids_input_filepath = args.input_hash2uuids
    hash2uuids_output_filepath = args.output_hash2uuids
    report_interval = config['utility']['report_interval']
    use_multiprocessing = config['utility']['use_multiprocessing']
    pickle_protocol = config['utility']['pickle_protocol']

    #
    # Checking parameters
    #
    if not hash2uuids_output_filepath:
        raise ValueError("No hash2uuids map output file provided!")

    #
    # Loading stuff
    #
    log.info("Loading descriptor index")
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(
        config['plugins']['descriptor_index'],
        get_descriptor_index_impls())
    log.info("Loading LSH functor")
    #: :type: smqtk.algorithms.LshFunctor
    lsh_functor = plugin.from_plugin_config(config['plugins']['lsh_functor'],
                                            get_lsh_functor_impls())

    def iter_uuids():
        """ Yield the UUIDs to consider, lazily. """
        if uuid_list_filepath:
            log.info("Using UUIDs list file")
            with open(uuid_list_filepath) as f:
                for l in f:
                    yield l.strip()
        else:
            log.info("Using all UUIDs present in descriptor index")
            for k in descriptor_index.iterkeys():
                yield k

    # load map if it exists, else start with empty dictionary
    if hash2uuids_input_filepath and os.path.isfile(hash2uuids_input_filepath):
        log.info("Loading hash2uuids mapping")
        # Binary mode: the map is written below with 'wb' and a configurable
        # (possibly binary) pickle protocol, so it must be read back as bytes.
        # NOTE(security): unpickling executes arbitrary code; only point this
        # at trusted files.
        with open(hash2uuids_input_filepath, 'rb') as f:
            hash2uuids = cPickle.load(f)
    else:
        log.info("Creating new hash2uuids mapping for output")
        hash2uuids = {}

    #
    # Compute codes
    #
    log.info("Starting hash code computation")
    compute_hash_codes(
        uuids_for_processing(iter_uuids(), hash2uuids),
        descriptor_index, lsh_functor,
        hash2uuids,
        report_interval=report_interval,
        use_mp=use_multiprocessing,
    )

    #
    # Output results
    #
    # Write to a temporary sibling first so a partially written pickle never
    # sits at the final output path.
    tmp_output_filepath = hash2uuids_output_filepath + '.WRITING'
    log.info("Writing hash-to-uuids map to disk: %s", tmp_output_filepath)
    file_utils.safe_create_dir(os.path.dirname(hash2uuids_output_filepath))
    with open(tmp_output_filepath, 'wb') as f:
        cPickle.dump(hash2uuids, f, pickle_protocol)
    log.info("Moving on top of input: %s", hash2uuids_output_filepath)
    os.rename(tmp_output_filepath, hash2uuids_output_filepath)
    log.info("Done")
def flann_index_filepath(self):
    """
    Build the path of the FLANN index file inside the model directory.

    The model directory is passed to ``file_utils.safe_create_dir`` before
    the path is composed.

    :return: Model directory joined with
        ``[descriptor_type].flann_index.dat``.
    :rtype: str
    """
    file_utils.safe_create_dir(self._model_dir)
    index_filename = "%s.flann_index.dat" % (self.descriptor_type(),)
    return osp.join(self._model_dir, index_filename)
def ffmpeg_extract_frame_map(working_dir, video_filepath, second_offset=0,
                             second_interval=0, max_duration=0, frames=(),
                             output_image_ext="png", parallel=None,
                             ffmpeg_exe='ffmpeg'):
    """
    Return a mapping of video frame index to image file in the given output
    format.

    If frames requested have not yet been extracted (based on what's contained
    in the specified output directory), they are done now. This means that this
    method could take a little time to complete if there are many frames in the
    video file and this is the first time this is being called.

    This may return an empty mapping if there are no frames in the video for
    the specified, or default, constraints.

    Extracted frames are cached in a directory structure under the provided
    ``working_dir`` directory path: ``<working_dir>/VideoFrameExtraction``.
    Frames are extracted into separate directories based on the SHA1 checksum
    of the video file.

    :raises RuntimeError: No frames were extracted.

    :param working_dir: Working directory for frame extraction to occur in.
    :type working_dir: str

    :param video_filepath: Path to the video to extract frames from.
    :type video_filepath: str

    :param second_offset: Seconds into the video to start extracting
    :type second_offset: float

    :param second_interval: Number of seconds between extracted frames
    :type second_interval: float

    :param max_duration: Maximum number of seconds worth of extracted frames
        (starting from the specified offset). If <=0, we extract until the end
        of the video.
    :type max_duration: float

    :param frames: Specific exact frame numbers within the video to extract.
        Providing explicit frames causes offset, interval and duration
        parameters to be ignored and only the frames specified here to be
        extracted and returned.
    :type frames: collections.Iterable[int]

    :param output_image_ext: Extension to use for output images.
    :type output_image_ext: str

    :param parallel: Number of processes to use for frame extraction. This is
        None by default, meaning that all available cores/threads are used.
    :type parallel: int or None

    :param ffmpeg_exe: ffmpeg executable to use for frame extraction. By
        default, we attempt to use what is available of the PATH.
    :type ffmpeg_exe: str or unicode

    :return: Map of frame-to-filepath for requested video frames
    :rtype: dict[int, str]

    """
    log = logging.getLogger('smqtk.utils.video_utils.extract_frame_map')

    video_md = get_metadata_info(video_filepath)

    # Checksum the video in fixed-size chunks so that the file handle is
    # closed deterministically and large videos are never held in memory
    # whole.
    sha1 = hashlib.sha1()
    with open(video_filepath, 'rb') as video_file:
        for chunk in iter(lambda: video_file.read(1 << 20), b''):
            sha1.update(chunk)
    video_sha1sum = sha1.hexdigest()

    frame_output_dir = os.path.join(
        working_dir,
        "VideoFrameExtraction",
        # 40 hex chars split into chunks of 4
        *string_utils.partition_string(video_sha1sum, 10)
    )
    file_utils.safe_create_dir(frame_output_dir)

    def filename_for_frame(frame, ext):
        """
        method standard filename for a given frame file
        """
        return "%08d.%s" % (frame, ext.lstrip('.'))

    def iter_frames_for_interval():
        """
        Generator yielding frame numbers from the input video that match the
        given query parameters. Indices yielded are 0-based (i.e. first frame
        is 0, not 1).

        Frame positions are tracked as floats and truncated to integers when
        yielded.

        :rtype: collections.Iterable[int]

        """
        _log = logging.getLogger('smqtk.utils.video_utils.extract_frame_map'
                                 '.iter_frames_for_interval')
        num_frames = int(video_md.fps * video_md.duration)
        first_frame = second_offset * video_md.fps
        _log.debug("First frame: %f", first_frame)
        if max_duration > 0:
            cutoff_frame = min(float(num_frames),
                               (max_duration + second_offset) * video_md.fps)
        else:
            cutoff_frame = float(num_frames)
        _log.debug("Cutoff frame: %f", cutoff_frame)
        if second_interval:
            incr = second_interval * video_md.fps
        else:
            incr = 1.0
        _log.debug("Frame increment: %f", incr)

        # Interpolate
        # Truncate the first index like every following one so that all keys
        # of the returned map are integers.
        yield int(first_frame)
        next_frm = first_frame + incr
        while next_frm < cutoff_frame:
            _log.debug("-- adding frame: %f", next_frm)
            yield int(next_frm)
            next_frm += incr

    # noinspection PyShadowingNames
    def extract_frames(frames_to_process):
        """
        Extract specific frames from the input video file using ffmpeg. If not
        all frames could be extracted, we return what we were able to extract.

        :param frames_to_process: Mapping of frame-number:filepath pairs to
            extract from the input video.
        :type frames_to_process: dict[int,str or unicode]

        :return: List of frames that were successfully extracted.
        :rtype: list[int]

        """
        _log = logging.getLogger('smqtk.utils.video_utils.extract_frame_map'
                                 '.extract_frames')

        # Setup temp extraction directory
        tmp_extraction_dir = os.path.join(frame_output_dir, ".TMP")
        if os.path.isdir(tmp_extraction_dir):
            _log.debug("Existing temp director found, removing and starting "
                       "over")
            shutil.rmtree(tmp_extraction_dir, ignore_errors=True)
        os.makedirs(tmp_extraction_dir)

        p = multiprocessing.Pool(parallel)
        # Mapping of frame to (result, output_filepath)
        #: :type: dict[int, (AsyncResult, str)]
        rmap = {}
        for f, ofp in six.iteritems(frames_to_process):
            tfp = os.path.join(tmp_extraction_dir,
                               filename_for_frame(f, output_image_ext))
            # Time offset (seconds) of this frame within the video.
            t = f / video_md.fps
            rmap[f] = (
                p.apply_async(ffmpeg_extract_frame,
                              args=(t, video_filepath, tfp, ffmpeg_exe)),
                tfp
            )
        p.close()
        # Check for failures
        extracted_frames = []
        for f, ofp in six.iteritems(frames_to_process):
            r, tfp = rmap[f]
            r.get()  # wait for finish
            if not os.path.isfile(tfp):
                _log.warning("Failed to generated file for frame %d", f)
            else:
                extracted_frames.append(f)
                # Move the finished frame out of the temp directory into its
                # final cached location.
                os.rename(tfp, ofp)
        p.join()
        del p

        os.removedirs(tmp_extraction_dir)
        _log.debug("Frame extraction complete")

        return extracted_frames

    # Determine frames to extract from video
    extract_indices = set()
    if frames:
        log.debug("Only extracting specified frames: %s", frames)
        extract_indices.update(frames)
    else:
        log.debug(
            "Determining frames needed for specification: "
            "offset: %f, interval: %f, max_duration: %f",
            second_offset, second_interval, max_duration)
        extract_indices.update(iter_frames_for_interval())

    if not extract_indices:
        return {}

    # frame/filename map that will be returned based on requested frames
    frame_map = dict(
        (i, os.path.join(frame_output_dir,
                         filename_for_frame(i, output_image_ext)))
        for i in extract_indices
    )

    ###
    # Acquire a file-base lock in output directory so that we don't conflict
    # with another process extracting frames to the same directory.
    #
    # NOTE: This method is prone to starvation if many processes are trying
    #       to extract to the same video frames, but not yet probably due to
    #       existing use cases.
    #
    lock_file = os.path.join(frame_output_dir, '.lock')
    log.debug("Acquiring file lock in '%s'...", frame_output_dir)
    while not file_utils.exclusive_touch(lock_file):
        # This is sufficiently small to be fine grained, but not so small to
        # burn the CPU.
        time.sleep(0.01)
    log.debug("Acquiring file lock -> Acquired!")

    try:
        ###
        # Determine frames to actually extract base on existing files (if any)
        #
        #: :type: dict[int, str]
        frames_to_process = {}
        existing_frames = []
        for i, img_file in sorted(frame_map.items()):
            if not os.path.isfile(img_file):
                log.debug('frame %d needs processing', i)
                frames_to_process[i] = img_file
            else:
                existing_frames.append(i)

        ###
        # Extract needed frames via hook function that provides
        # implementation.
        #
        frames_extracted = []
        if frames_to_process:
            frames_extracted = extract_frames(frames_to_process)

        if not existing_frames and not frames_extracted:
            raise RuntimeError("Failed to extract any frames for video")

        return frame_map
    finally:
        # Always release the lock, including when extraction raised.
        os.remove(lock_file)
def flann_params_filepath(self):
    """
    Build the path of the FLANN parameters JSON file inside the model
    directory.

    The model directory is passed to ``file_utils.safe_create_dir`` before
    the path is composed.

    :return: Model directory joined with
        ``[descriptor_type].flann_params.json``.
    :rtype: str
    """
    file_utils.safe_create_dir(self._model_dir)
    params_filename = "%s.flann_params.json" % (self.descriptor_type(),)
    return osp.join(self._model_dir, params_filename)
except requests.HTTPError, ex: log.warn("Skipping '%s': %s (code=%s)", url, r.reason, r.status_code) return None, None, None content = StringIO.StringIO() for c in r.iter_content(1024): content.write(c) cont_type = tika_detector.from_buffer(content.getvalue()) ext = mimetypes.guess_extension(cont_type) if not ext: log.warn("Skipping '%s': Bad content type '%s'", url, cont_type) return None, None, None segs = url.split('/') dirpath = os.path.join(output_dir, *segs[2:-1]) safe_create_dir(dirpath) basename = os.path.splitext(segs[-1])[0] save_pth = os.path.join(dirpath, basename + ext) if not os.path.isfile(save_pth): sha1_checksum = hashlib.sha1(content.getvalue()).hexdigest() tmp_pth = '.'.join([save_pth, uuid.uuid4().hex]) with open(tmp_pth, 'wb') as f: f.write(content.getvalue()) os.rename(tmp_pth, save_pth) log.info("Downloaded '%s' -> '%s'", url, save_pth) else: log.info("Already downloaded: '%s' -> '%s'", url, save_pth) with open(save_pth) as f: sha1_checksum = hashlib.sha1(f.read()).hexdigest()
def _compute_descriptor(self, data):
    """
    Given some kind of data, process and return a feature vector as a Numpy
    array.

    The computed vector is also saved with ``numpy.save`` to the checkpoint
    file path reported by ``_get_checkpoint_feature_file`` before returning.

    :raises RuntimeError: Feature extraction failure of some kind.

    :param data: Some kind of input data for the feature descriptor. This is
        descriptor dependent.
    :type data: smqtk.representation.DataElement

    :return: Feature vector. This is a histogram of N bins where N is the
        number of centroids in the codebook. Bin values is percent
        composition, not absolute counts.
    :rtype: numpy.ndarray

    """
    super(ColorDescriptor_Base, self)._compute_descriptor(data)

    checkpoint_filepath = self._get_checkpoint_feature_file(data)
    # NOTE: re-loading an existing checkpoint is currently disabled:
    # if osp.isfile(checkpoint_filepath):
    #     return numpy.load(checkpoint_filepath)

    if not self.has_model:
        raise RuntimeError(
            "No model currently loaded! Check the existence "
            "or, or generate, model files!\n"
            "Codebook path: %s\n"
            "FLANN Index path: %s"
            % (self.codebook_filepath, self.flann_index_filepath))

    self._log.debug("Computing descriptors for data UID[%s]...",
                    data.uuid())
    info, descriptors = self._generate_descriptor_matrices({data})

    # Load FLANN components
    pyflann.set_distance_type(self._flann_distance_metric)
    flann = pyflann.FLANN()
    flann.load_index(self.flann_index_filepath, self._codebook)

    if not self._use_sp:
        ###
        # Codebook Quantization
        #
        # - loaded the model at class initialization if we had one
        self._log.debug("Quantizing descriptors")

        try:
            # If the distance method is HIK, we need to treat it special
            # since that method produces a similarity score, not a distance
            # score.
            #
            if self._flann_distance_metric == 'hik':
                # This searches for all NN instead of minimum between n and
                # the number of descriptors and keeps the last one because
                # hik is a similarity score and not a distance.
                #: :type: numpy.core.multiarray.ndarray
                idxs = flann.nn_index(descriptors,
                                      self._codebook.shape[0])[0]
                # Only keep the last index for each descriptor return
                idxs = numpy.array([i_array[-1] for i_array in idxs])
            else:
                #: :type: numpy.core.multiarray.ndarray
                idxs = flann.nn_index(descriptors, 1)[0]
        except AssertionError:
            # Log the shapes involved to help diagnose the failure, then
            # propagate it unchanged.
            self._log.error("Codebook shape : %s", self._codebook.shape)
            self._log.error("Descriptor shape: %s", descriptors.shape)
            raise

        # Create histogram
        # - Using explicit bin slots to prevent numpy from automatically
        #   creating tightly constrained bins. This would otherwise cause
        #   histograms between two inputs to be non-comparable (unaligned
        #   bins).
        # - See numpy note about ``bins`` to understand why the +1 is
        #   necessary
        # - Learned from spatial implementation that we could feed multiple
        #   neighbors per descriptor into here, leading to a more populated
        #   histogram.
        #   - Could also possibly weight things based on dist from
        #     descriptor?
        #: :type: numpy.core.multiarray.ndarray
        h = numpy.histogram(
            idxs,  # indices are all integers
            bins=numpy.arange(self._codebook.shape[0] + 1))[0]
        # self._log.debug("Quantization histogram: %s", h)
        # Normalize histogram into relative frequencies
        # - Not using /= on purpose. h is originally an integer type coming
        #   out of histogram. /= would keep that type when we want it to be
        #   transformed into a float type by the division.
        if h.sum():
            # noinspection PyAugmentAssignment
            h = h / float(h.sum())
        else:
            # Keep the result floating point, matching the normalized case
            # above, instead of returning integer zeros.
            h = numpy.zeros(h.shape, float)
        # self._log.debug("Normalized histogram: %s", h)

    else:
        ###
        # Spatial Pyramid Quantization
        #
        self._log.debug("Quantizing descriptors using spatial pyramid")
        ##
        # Quantization factor - number of nearest codes to be saved
        q_factor = 10
        ##
        # Concatenating spatial information to descriptor vectors to format:
        #   [ x y <descriptor> ]
        self._log.debug("Creating combined descriptor matrix")
        m = numpy.concatenate((info[:, :2], descriptors), axis=1)
        ##
        # Creating quantized vectors, consisting vector:
        #   [ x y c_1 ... c_qf dist_1 ... dist_qf ]
        # which has a total size of 2+(qf*2)
        #
        # Sangmin's code included the distances in the quantized vector, but
        # then also passed this vector into numpy's histogram function with
        # integral bins, causing the [0,1] to be heavily populated, which
        # doesn't make sense to do.
        #   idxs, dists = flann.nn_index(m[:, 2:], q_factor)
        #   q = numpy.concatenate([m[:, :2], idxs, dists], axis=1)
        self._log.debug("Computing nearest neighbors")
        if self._flann_distance_metric == 'hik':
            # Query full ordering of code indices
            idxs = flann.nn_index(m[:, 2:], self._codebook.shape[0])[0]
            # Extract the right-side block for use in building histogram
            # Order doesn't actually matter in the current implementation
            #   because index relative position is not being weighted.
            idxs = idxs[:, -q_factor:]
        else:
            idxs = flann.nn_index(m[:, 2:], q_factor)[0]
        self._log.debug("Creating quantization matrix")
        # This matrix consists of descriptor (x,y) position + near code
        #   indices.
        q = numpy.concatenate([m[:, :2], idxs], axis=1)
        ##
        # Build spatial pyramid from quantized matrix
        self._log.debug("Building spatial pyramid histograms")
        hist_sp = self._build_sp_hist(q, self._codebook.shape[0])
        ##
        # Combine each quadrants into single vector
        self._log.debug("Combining global+thirds into final histogram.")
        f = sys.float_info.min  # so as we don't div by 0 accidentally

        def rf_norm(hist):
            """ Normalize a histogram into relative frequencies. """
            return hist / (float(hist.sum()) + f)

        h = numpy.concatenate([
            rf_norm(hist_sp[0]),
            rf_norm(hist_sp[5]),
            rf_norm(hist_sp[6]),
            rf_norm(hist_sp[7])
        ], axis=1)
        # Only re-normalize when there is mass to normalize; dividing an
        # all-zero vector by its zero sum would fill it with NaN.
        h_total = h.sum()
        if h_total:
            # noinspection PyAugmentAssignment
            h = h / h_total

    self._log.debug("Saving checkpoint feature file")
    if not osp.isdir(osp.dirname(checkpoint_filepath)):
        file_utils.safe_create_dir(osp.dirname(checkpoint_filepath))
    numpy.save(checkpoint_filepath, h)

    return h
def main():
    """
    Classify descriptors named in a UUID list file and write the results as
    CSV.

    Descriptors are fetched from the configured descriptor index and
    classified by the configured classifier through two chained
    ``parallel.parallel_map`` stages. Two files are written: a header file
    holding ``uuid`` followed by the classifier labels, and a data file with
    one row per classification (element UUID followed by the value for each
    label, in header order).

    :raises ValueError: The UUID list path is missing or not a file, or one
        of the CSV output paths was not given.
    """
    args = cli_parser().parse_args()
    config = bin_utils.utility_main_helper(default_config, args)
    log = logging.getLogger(__name__)

    # Command line inputs
    uuids_path = args.uuids_list
    csv_data_path = args.csv_data
    csv_header_path = args.csv_header

    # Utility configuration
    utility_cfg = config['utility']
    overwrite = utility_cfg['classify_overwrite']
    parallel_cfg = utility_cfg['parallel']
    use_mp = parallel_cfg['use_multiprocessing']
    extraction_cores = parallel_cfg['index_extraction_cores']
    classification_cores = parallel_cfg['classification_cores']

    # Input validation
    if not uuids_path:
        raise ValueError("No uuids_list_filepath specified.")
    if not os.path.isfile(uuids_path):
        raise ValueError("Given uuids_list_filepath did not point to a file.")
    if csv_header_path is None:
        raise ValueError("Need a path to save CSV header labels")
    if csv_data_path is None:
        raise ValueError("Need a path to save CSV data.")

    #
    # Initialize configured plugins
    #
    plugins_cfg = config['plugins']

    log.info("Initializing descriptor index")
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(
        plugins_cfg['descriptor_index'],
        get_descriptor_index_impls()
    )

    log.info("Initializing classification factory")
    c_factory = ClassificationElementFactory.from_config(
        plugins_cfg['classification_factory']
    )

    log.info("Initializing classifier")
    #: :type: smqtk.algorithms.Classifier
    classifier = plugin.from_plugin_config(
        plugins_cfg['classifier'],
        get_classifier_impls()
    )

    #
    # Setup/Process
    #
    def iter_uuids():
        """ Yield each stripped line of the UUID list file. """
        with open(uuids_path) as uuid_file:
            for line in uuid_file:
                yield line.strip()

    def descr_for_uuid(uuid):
        """
        :type uuid: collections.Hashable
        :rtype: smqtk.representation.DescriptorElement
        """
        return descriptor_index.get_descriptor(uuid)

    def classify_descr(d):
        """
        :type d: smqtk.representation.DescriptorElement
        :rtype: smqtk.representation.ClassificationElement
        """
        return classifier.classify(d, c_factory, overwrite)

    log.info("Initializing uuid-to-descriptor parallel map")
    #: :type: collections.Iterable[smqtk.representation.DescriptorElement]
    element_iter = parallel.parallel_map(
        descr_for_uuid, iter_uuids(),
        use_multiprocessing=use_mp,
        cores=extraction_cores,
        name="descr_for_uuid",
    )

    log.info("Initializing descriptor-to-classification parallel map")
    #: :type: collections.Iterable[smqtk.representation.ClassificationElement]
    classification_iter = parallel.parallel_map(
        classify_descr, element_iter,
        use_multiprocessing=use_mp,
        cores=classification_cores,
        name='classify_descr',
    )

    #
    # Write/Output files
    #
    c_labels = classifier.get_labels()

    def make_row(e):
        """
        Build the CSV row for one classification: UUID then label values.

        :type e: smqtk.representation.ClassificationElement
        """
        c_m = e.get_classification()
        row = [e.uuid]
        row.extend(c_m[l] for l in c_labels)
        return row

    # column labels file
    log.info("Writing CSV column header file: %s", csv_header_path)
    file_utils.safe_create_dir(os.path.dirname(csv_header_path))
    header_row = ['uuid']
    header_row.extend(str(cl) for cl in c_labels)
    with open(csv_header_path, 'wb') as f_csv:
        csv.writer(f_csv).writerow(header_row)

    # CSV file
    log.info("Writing CSV data file: %s", csv_data_path)
    file_utils.safe_create_dir(os.path.dirname(csv_data_path))
    # Progress-report state list consumed by ``bin_utils.report_progress``.
    r_state = [0] * 7
    with open(csv_data_path, 'wb') as f_csv:
        w = csv.writer(f_csv)
        for c in classification_iter:
            w.writerow(make_row(c))
            bin_utils.report_progress(log.info, r_state, 1.0)

    # Final report
    r_state[1] -= 1
    bin_utils.report_progress(log.info, r_state, 0)

    log.info("Done")
def make_curves(log, skl_curve_func, title_hook, x_label, y_label, fold_data,
                output_dir, plot_prefix, show):
    """
    Generic method for PR/ROC curve generation

    One plot is saved per fold (one line per class label), followed by a
    summary plot with one line per class label aggregated over every fold
    that contains that label.

    fold data format:
        {
            0: {
                '<label>': {
                    "truth": [...],  # Parallel truth and classification
                    "proba": [...],  # Parallel probability values
                },
                ...
            },
            ...
        }

    :param log: Logger instance to use.
    :param skl_curve_func: scikit-learn curve generation function. This should
        be wrapped to return (x, y) value arrays.
    :param title_hook: Descriptive label of the classifier.
    :param x_label: X-axis label
    :param y_label: Y-axis label
    :param fold_data: Truth and classification probability results for test
        data per fold. See above for format.
    :param output_dir: Directory to output plot images to.
    :param plot_prefix: String prefix for output files.
    :param show: Show the output plots interactively or not.

    """
    file_utils.safe_create_dir(output_dir)

    log.info("Generating %s curves for per-folds and overall", title_hook)
    # All class labels encountered
    class_labels = set()
    # Make curves for classes per fold. One line per class
    for i in fold_data:
        log.info("-- Fold %i", i)
        plt.clf()
        for label in fold_data[i]:
            log.info(" -- label '%s'", label)
            class_labels.add(label)
            l_truth = fold_data[i][label]['truth']
            l_proba = fold_data[i][label]['proba']
            x, y = skl_curve_func(l_truth, l_proba)
            auc = sklearn.metrics.auc(x, y)
            plt.plot(x, y, label="class '%s' (auc=%f)" % (label, auc))

        format_plt("Classifier %s - Fold %d" % (title_hook, i),
                   x_label, y_label)
        filename = plot_prefix + 'fold_%d.png' % i
        save_plt(output_dir, filename, show)

    # Plot aggregate performance curve per class
    log.info("-- All folds")
    plt.clf()
    for label in sorted(class_labels):
        # ``class_labels`` is the union over all folds, so only draw from the
        # folds that actually contain this label instead of raising KeyError.
        folds_with_label = [fold_data[i][label] for i in fold_data
                            if label in fold_data[i]]
        l_truth = [t for fold in folds_with_label for t in fold['truth']]
        l_proba = [p for fold in folds_with_label for p in fold['proba']]
        x, y = skl_curve_func(l_truth, l_proba)
        auc = sklearn.metrics.auc(x, y)
        plt.plot(x, y, label="agg '%s' (auc=%f)" % (label, auc))

    format_plt("Classifier %s - Fold Summary" % title_hook,
               x_label, y_label)
    filename = plot_prefix + "validation.png"
    save_plt(output_dir, filename, show)
def ffmpeg_extract_frame_map(
    working_dir,
    video_filepath,
    second_offset=0,
    second_interval=0,
    max_duration=0,
    frames=(),
    output_image_ext="png",
    parallel=None,
    ffmpeg_exe="ffmpeg",
):
    """
    Return a mapping of video frame index to image file in the given output
    format.

    If frames requested have not yet been extracted (based on what's contained
    in the specified output directory), they are done now. This means that this
    method could take a little time to complete if there are many frames in the
    video file and this is the first time this is being called.

    This may return an empty mapping if there are no frames in the video for
    the specified, or default, constraints.

    Extracted frames are cached in a directory structure under the provided
    ``working_dir`` directory path: ``<working_dir>/VideoFrameExtraction``.
    Frames are extracted into separate directories based on the SHA1 checksum
    of the video file.

    :raises RuntimeError: No frames were extracted.

    :param working_dir: Working directory for frame extraction to occur in.
    :type working_dir: str

    :param video_filepath: Path to the video to extract frames from.
    :type video_filepath: str

    :param second_offset: Seconds into the video to start extracting
    :type second_offset: float

    :param second_interval: Number of seconds between extracted frames
    :type second_interval: float

    :param max_duration: Maximum number of seconds worth of extracted frames
        (starting from the specified offset). If <=0, we extract until the end
        of the video.
    :type max_duration: float

    :param frames: Specific exact frame numbers within the video to extract.
        Providing explicit frames causes offset, interval and duration
        parameters to be ignored and only the frames specified here to be
        extracted and returned.
    :type frames: collections.Iterable[int]

    :param output_image_ext: Extension to use for output images.
    :type output_image_ext: str

    :param parallel: Number of processes to use for frame extraction. This is
        None by default, meaning that all available cores/threads are used.
    :type parallel: int or None

    :param ffmpeg_exe: ffmpeg executable to use for frame extraction. By
        default, we attempt to use what is available of the PATH.
    :type ffmpeg_exe: str or unicode

    :return: Map of frame-to-filepath for requested video frames
    :rtype: dict of (int, str)

    """
    log = logging.getLogger("smqtk.utils.video_utils.extract_frame_map")

    video_md = get_metadata_info(video_filepath)

    # Checksum the video in fixed-size chunks so that the file handle is
    # closed deterministically and large videos are never held in memory
    # whole.
    sha1 = hashlib.sha1()
    with open(video_filepath, "rb") as video_file:
        for chunk in iter(lambda: video_file.read(1 << 20), b""):
            sha1.update(chunk)
    video_sha1sum = sha1.hexdigest()

    frame_output_dir = os.path.join(
        working_dir,
        "VideoFrameExtraction",
        # 40 hex chars split into chunks of 4
        *string_utils.partition_string(video_sha1sum, 10)
    )
    file_utils.safe_create_dir(frame_output_dir)

    def filename_for_frame(frame, ext):
        """
        method standard filename for a given frame file
        """
        return "%08d.%s" % (frame, ext.lstrip("."))

    def iter_frames_for_interval():
        """
        Generator yielding frame numbers from the input video that match the
        given query parameters. Indices yielded are 0-based (i.e. first frame
        is 0, not 1).

        Frame positions are tracked as floats and truncated to integers when
        yielded.

        :rtype: collections.Iterable[int]

        """
        _log = logging.getLogger("smqtk.utils.video_utils.extract_frame_map"
                                 ".iter_frames_for_interval")
        num_frames = int(video_md.fps * video_md.duration)
        first_frame = second_offset * video_md.fps
        _log.debug("First frame: %f", first_frame)
        if max_duration > 0:
            cutoff_frame = min(float(num_frames),
                               (max_duration + second_offset) * video_md.fps)
        else:
            cutoff_frame = float(num_frames)
        _log.debug("Cutoff frame: %f", cutoff_frame)
        if second_interval:
            incr = second_interval * video_md.fps
        else:
            incr = 1.0
        _log.debug("Frame increment: %f", incr)

        # Interpolate
        # Truncate the first index like every following one so that all keys
        # of the returned map are integers.
        yield int(first_frame)
        next_frm = first_frame + incr
        while next_frm < cutoff_frame:
            _log.debug("-- adding frame: %f", next_frm)
            yield int(next_frm)
            next_frm += incr

    def extract_frames(frames_to_process):
        """
        Extract specific frames from the input video file using ffmpeg. If not
        all frames could be extracted, we return what we were able to extract.

        :param frames_to_process: Mapping of frame-number:filepath pairs to
            extract from the input video.
        :type frames_to_process: dict[int,str or unicode]

        :return: List of frames that were successfully extracted.
        :rtype: list[int]

        """
        _log = logging.getLogger("smqtk.utils.video_utils.extract_frame_map"
                                 ".extract_frames")

        # Setup temp extraction directory
        tmp_extraction_dir = os.path.join(frame_output_dir, ".TMP")
        if os.path.isdir(tmp_extraction_dir):
            _log.debug("Existing temp director found, removing and starting "
                       "over")
            shutil.rmtree(tmp_extraction_dir, ignore_errors=True)
        os.makedirs(tmp_extraction_dir)

        p = multiprocessing.Pool(parallel)
        # Mapping of frame to (result, output_filepath)
        #: :type: dict of (int, (AsyncResult, str))
        rmap = {}
        # ``items()`` rather than the Python-2-only ``iteritems()`` so the
        # loop runs on either major Python version.
        for f, ofp in frames_to_process.items():
            tfp = os.path.join(tmp_extraction_dir,
                               filename_for_frame(f, output_image_ext))
            # Time offset (seconds) of this frame within the video.
            t = f / video_md.fps
            rmap[f] = (
                p.apply_async(ffmpeg_extract_frame,
                              args=(t, video_filepath, tfp, ffmpeg_exe)),
                tfp
            )
        p.close()
        # Check for failures
        extracted_frames = []
        for f, ofp in frames_to_process.items():
            r, tfp = rmap[f]
            r.get()  # wait for finish
            if not os.path.isfile(tfp):
                _log.warning("Failed to generated file for frame %d", f)
            else:
                extracted_frames.append(f)
                # Move the finished frame out of the temp directory into its
                # final cached location.
                os.rename(tfp, ofp)
        p.join()
        del p

        os.removedirs(tmp_extraction_dir)
        _log.debug("Frame extraction complete")

        return extracted_frames

    # Determine frames to extract from video
    extract_indices = set()
    if frames:
        log.debug("Only extracting specified frames: %s", frames)
        extract_indices.update(frames)
    else:
        log.debug(
            "Determining frames needed for specification: "
            "offset: %f, interval: %f, max_duration: %f",
            second_offset,
            second_interval,
            max_duration,
        )
        extract_indices.update(iter_frames_for_interval())

    if not extract_indices:
        return {}

    # frame/filename map that will be returned based on requested frames
    frame_map = dict(
        (i, os.path.join(frame_output_dir,
                         filename_for_frame(i, output_image_ext)))
        for i in extract_indices
    )

    ###
    # Acquire a file-base lock in output directory so that we don't conflict
    # with another process extracting frames to the same directory.
    #
    # NOTE: This method is prone to starvation if many processes are trying
    #       to extract to the same video frames, but not yet probably due to
    #       existing use cases.
    #
    lock_file = os.path.join(frame_output_dir, ".lock")
    log.debug("Acquiring file lock in '%s'...", frame_output_dir)
    while not file_utils.exclusive_touch(lock_file):
        # This is sufficiently small to be fine grained, but not so small to
        # burn the CPU.
        time.sleep(0.01)
    log.debug("Acquiring file lock -> Acquired!")

    try:
        ###
        # Determine frames to actually extract base on existing files (if any)
        #
        #: :type: dict[int, str]
        frames_to_process = {}
        existing_frames = []
        for i, img_file in sorted(frame_map.items()):
            if not os.path.isfile(img_file):
                log.debug("frame %d needs processing", i)
                frames_to_process[i] = img_file
            else:
                existing_frames.append(i)

        ###
        # Extract needed frames via hook function that provides
        # implementation.
        #
        frames_extracted = []
        if frames_to_process:
            frames_extracted = extract_frames(frames_to_process)

        if not existing_frames and not frames_extracted:
            raise RuntimeError("Failed to extract any frames for video")

        return frame_map
    finally:
        # Always release the lock, including when extraction raised.
        os.remove(lock_file)
def make_curves(log, skl_curve_func, title_hook, x_label, y_label, fold_data,
                output_dir, plot_prefix, show):
    """
    Generic method for PR/ROC curve generation

    One plot is saved per fold, holding a line per class label plus a line
    for the fold's pooled truth/probability values. A final validation plot
    holds each fold's pooled line plus a line pooled over all folds.

    :param log: Logger instance to use.
    :param skl_curve_func: scikit-learn curve generation function. This should
        be wrapped to return (x, y) value arrays.
    :param title_hook: Descriptive label used in log messages and plot titles.
    :param x_label: X-axis label
    :param y_label: Y-axis label
    :param fold_data: Mapping of fold index to a mapping of label to a
        dictionary with parallel ``truth`` and ``proba`` sequences.
    :param output_dir: Directory to output plot images to.
    :param plot_prefix: String prefix for output files.
    :param show: Passed through to ``save_plt``.

    """
    file_utils.safe_create_dir(output_dir)

    log.info("Generating %s curves for per-folds and overall", title_hook)
    # in-order list of fold (x, y) value lists
    fold_xy = []
    fold_auc = []
    # all truth and proba pairs
    g_truth = []
    g_proba = []
    for i in fold_data:
        log.info("-- Fold %i", i)
        f_truth = []
        f_proba = []

        plt.clf()
        for label in fold_data[i]:
            log.info(" -- label '%s'", label)
            l_truth = fold_data[i][label]['truth']
            l_proba = fold_data[i][label]['proba']
            x, y = skl_curve_func(l_truth, l_proba)
            auc = sklearn.metrics.auc(x, y)
            plt.plot(x, y, label="class '%s' (auc=%f)" % (label, auc))

            f_truth.extend(l_truth)
            f_proba.extend(l_proba)

        # Plot for fold
        x, y = skl_curve_func(f_truth, f_proba)
        auc = sklearn.metrics.auc(x, y)
        plt.plot(x, y, label="Fold (auc=%f)" % auc)

        format_plt("Classifier %s - Fold %d" % (title_hook, i),
                   x_label, y_label)
        filename = plot_prefix + 'fold_%d.png' % i
        save_plt(output_dir, filename, show)

        fold_xy.append([x, y])
        fold_auc.append(auc)
        g_truth.extend(f_truth)
        g_proba.extend(f_proba)

    # Plot global curve
    log.info("-- All folds")
    plt.clf()
    # ``fold_xy`` / ``fold_auc`` were filled in ``fold_data`` iteration order,
    # so pair them positionally with the fold keys. Indexing the lists by key
    # only works when keys happen to be exactly 0..n-1 in iteration order.
    for i, (fold_x, fold_y), fold_area in zip(fold_data, fold_xy, fold_auc):
        plt.plot(fold_x, fold_y,
                 label="Fold %d (auc=%f)" % (i, fold_area))

    x, y = skl_curve_func(g_truth, g_proba)
    auc = sklearn.metrics.auc(x, y)
    plt.plot(x, y, label="All (auc=%f)" % auc)

    format_plt("Classifier %s - Validation" % title_hook, x_label, y_label)
    filename = plot_prefix + "validation.png"
    save_plt(output_dir, filename, show)
def flann_index_filepath(self):
    """
    Path of the FLANN index data file for this descriptor type.

    ``file_utils.safe_create_dir`` is called on the model directory first.

    :return: ``[descriptor_type].flann_index.dat`` joined under the model
        directory.
    :rtype: str
    """
    file_utils.safe_create_dir(self._model_dir)
    basename = "%s.flann_index.dat" % (self.descriptor_type(),)
    index_path = osp.join(self._model_dir, basename)
    return index_path
def flann_params_filepath(self):
    """
    Path of the FLANN parameters JSON file for this descriptor type.

    ``file_utils.safe_create_dir`` is called on the model directory first.

    :return: ``[descriptor_type].flann_params.json`` joined under the model
        directory.
    :rtype: str
    """
    file_utils.safe_create_dir(self._model_dir)
    basename = "%s.flann_params.json" % (self.descriptor_type(),)
    params_path = osp.join(self._model_dir, basename)
    return params_path