def compute_hash_codes(task, folderId, **kwargs):
    """
    Celery task for computing hash codes on a given folder (descriptor
    index).

    :param task: Celery provided task object.
    :param folderId: The folder to compute hash codes for; note this is
        only used to infer the descriptor index and the ``.smqtk`` data
        folder.
    """
    task.job_manager.updateProgress(message='Computing Hash Codes',
                                    forceFlush=True)

    index = descriptorIndexFromFolderId(task.girder_client, folderId)
    smqtkFolder = getCreateFolder(task.girder_client, folderId, '.smqtk')

    meanVecFileId = smqtkFileIdFromName(task.girder_client, smqtkFolder,
                                        'mean_vec.npy')
    rotationFileId = smqtkFileIdFromName(task.girder_client, smqtkFolder,
                                         'rotation.npy')

    hash2uuidsFile = initializeItemWithFile(
        task.girder_client,
        createOverwriteItem(task.girder_client, smqtkFolder['_id'],
                            'hash2uuids.pickle'))

    # Hoist the repeated credential lookup; both cache elements use the
    # same API root and token.
    girderToken = task.request.jobInfoSpec['headers']['Girder-Token']
    functor = ItqFunctor(
        mean_vec_cache=GirderDataElement(
            meanVecFileId, api_root=task.request.apiUrl, token=girderToken),
        rotation_cache=GirderDataElement(
            rotationFileId, api_root=task.request.apiUrl, token=girderToken))

    hash2uuids = compute_functions.compute_hash_codes(
        index.iterkeys(), index, functor, use_mp=False)

    # NOTE(review): building a flat {hash: uuid} dict keeps only the last
    # UUID seen per hash code if multiple descriptors share a hash --
    # confirm this is the intended serialization for 'hash2uuids.pickle'.
    data = pickle.dumps({hash_int: uuid for (uuid, hash_int) in hash2uuids})
    task.girder_client.uploadFileContents(hash2uuidsFile['_id'],
                                          six.BytesIO(data), len(data))
def compute_hash_codes(task, folderId, **kwargs):
    """
    Celery task computing hash codes over the descriptor index inferred
    from the given folder.

    :param task: Celery provided task object.
    :param folderId: Folder whose descriptor index (and '.smqtk' data
        folder) will be used.
    """
    gc = task.girder_client
    task.job_manager.updateProgress(message='Computing Hash Codes',
                                    forceFlush=True)

    index = descriptorIndexFromFolderId(gc, folderId)
    dataFolder = getCreateFolder(gc, folderId, '.smqtk')

    meanVecId = smqtkFileIdFromName(gc, dataFolder, 'mean_vec.npy')
    rotationId = smqtkFileIdFromName(gc, dataFolder, 'rotation.npy')

    outItem = createOverwriteItem(gc, dataFolder['_id'], 'hash2uuids.pickle')
    outFile = initializeItemWithFile(gc, outItem)

    def _cacheElement(fileId):
        # Girder-backed cache element sharing the job's API root and token.
        return GirderDataElement(
            fileId, api_root=task.request.apiUrl,
            token=task.request.jobInfoSpec['headers']['Girder-Token'])

    functor = ItqFunctor(mean_vec_cache=_cacheElement(meanVecId),
                         rotation_cache=_cacheElement(rotationId))

    pairs = compute_functions.compute_hash_codes(index.iterkeys(), index,
                                                 functor, use_mp=False)
    payload = pickle.dumps(dict((h, u) for (u, h) in pairs))
    gc.uploadFileContents(outFile['_id'], six.BytesIO(payload), len(payload))
def main():
    """
    Compute LSH hash codes for descriptors in the configured index --
    either all of them or only those listed in an optional UUIDs file --
    and merge the results into the configured hash-to-UUIDs key/value
    store.
    """
    args = cli_parser().parse_args()
    config = cli.utility_main_helper(default_config, args)
    log = logging.getLogger(__name__)

    #
    # Load configuration contents
    #
    uuid_list_filepath = args.uuids_list
    report_interval = config['utility']['report_interval']
    use_multiprocessing = config['utility']['use_multiprocessing']

    #
    # Checking input parameters
    #
    if (uuid_list_filepath is not None) and \
            not os.path.isfile(uuid_list_filepath):
        raise ValueError("UUIDs list file does not exist!")

    #
    # Loading stuff
    #
    log.info("Loading descriptor index")
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = from_config_dict(config['plugins']['descriptor_index'],
                                        DescriptorIndex.get_impls())
    log.info("Loading LSH functor")
    #: :type: smqtk.algorithms.LshFunctor
    lsh_functor = from_config_dict(config['plugins']['lsh_functor'],
                                   LshFunctor.get_impls())
    log.info("Loading Key/Value store")
    #: :type: smqtk.representation.KeyValueStore
    hash2uuids_kvstore = from_config_dict(
        config['plugins']['hash2uuid_kvstore'],
        KeyValueStore.get_impls())

    # Iterate either over what's in the file given, or everything in the
    # configured index.
    def iter_uuids():
        if uuid_list_filepath:
            log.info("Using UUIDs list file")
            with open(uuid_list_filepath) as f:
                # Renamed ambiguous loop variable 'l' (E741).
                for line in f:
                    yield line.strip()
        else:
            # Fixed log message typo: "resent" -> "present".
            log.info("Using all UUIDs present in descriptor index")
            for k in descriptor_index.keys():
                yield k

    #
    # Compute codes
    #
    log.info("Starting hash code computation")
    kv_update = {}
    for uuid, hash_int in \
            compute_hash_codes(uuids_for_processing(iter_uuids(),
                                                    hash2uuids_kvstore),
                               descriptor_index, lsh_functor,
                               report_interval, use_multiprocessing, True):
        # Seed from the KV-store's current value the first time we touch a
        # hash so existing UUID sets are extended, not replaced.
        if hash_int not in kv_update:
            kv_update[hash_int] = hash2uuids_kvstore.get(hash_int, set())
        kv_update[hash_int] |= {uuid}

    if kv_update:
        # Lazy %-args instead of eager string interpolation.
        log.info("Updating KV store... (%d keys)", len(kv_update))
        hash2uuids_kvstore.add_many(kv_update)

    log.info("Done")
def main():
    """
    Compute LSH hash codes for descriptors in the configured index --
    either all of them or only those listed in an optional UUIDs file --
    and merge the results into the configured hash-to-UUIDs key/value
    store.
    """
    args = cli_parser().parse_args()
    config = bin_utils.utility_main_helper(default_config, args)
    log = logging.getLogger(__name__)

    #
    # Load configuration contents
    #
    uuid_list_filepath = args.uuids_list
    report_interval = config['utility']['report_interval']
    use_multiprocessing = config['utility']['use_multiprocessing']

    #
    # Checking input parameters
    #
    if (uuid_list_filepath is not None) and \
            not os.path.isfile(uuid_list_filepath):
        raise ValueError("UUIDs list file does not exist!")

    #
    # Loading stuff
    #
    log.info("Loading descriptor index")
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(
        config['plugins']['descriptor_index'],
        get_descriptor_index_impls()
    )
    log.info("Loading LSH functor")
    #: :type: smqtk.algorithms.LshFunctor
    lsh_functor = plugin.from_plugin_config(
        config['plugins']['lsh_functor'],
        get_lsh_functor_impls()
    )
    log.info("Loading Key/Value store")
    #: :type: smqtk.representation.KeyValueStore
    hash2uuids_kvstore = plugin.from_plugin_config(
        config['plugins']['hash2uuid_kvstore'],
        get_key_value_store_impls()
    )

    # Iterate either over what's in the file given, or everything in the
    # configured index.
    def iter_uuids():
        if uuid_list_filepath:
            log.info("Using UUIDs list file")
            with open(uuid_list_filepath) as f:
                # Renamed ambiguous loop variable 'l' (E741).
                for line in f:
                    yield line.strip()
        else:
            # Fixed log message typo: "resent" -> "present".
            log.info("Using all UUIDs present in descriptor index")
            for k in descriptor_index.keys():
                yield k

    #
    # Compute codes
    #
    log.info("Starting hash code computation")
    kv_update = {}
    for uuid, hash_int in \
            compute_hash_codes(uuids_for_processing(iter_uuids(),
                                                    hash2uuids_kvstore),
                               descriptor_index, lsh_functor,
                               report_interval, use_multiprocessing, True):
        # Seed from the KV-store's current value the first time we touch a
        # hash so existing UUID sets are extended, not replaced.
        if hash_int not in kv_update:
            kv_update[hash_int] = hash2uuids_kvstore.get(hash_int, set())
        kv_update[hash_int] |= {uuid}

    if kv_update:
        # Lazy %-args instead of eager string interpolation.
        log.info("Updating KV store... (%d keys)", len(kv_update))
        hash2uuids_kvstore.add_many(kv_update)

    log.info("Done")
def main():
    description = """
    Compute LSH hash codes based on the provided functor on specific
    descriptors from the configured index given a file-list of UUIDs.

    When using an input file-list of UUIDs, we require that the UUIDs of
    indexed descriptors be strings, or equality comparable to the UUIDs'
    string representation.

    This script can be used to live update the ``hash2uuid_cache_filepath``
    model file for the ``LSHNearestNeighborIndex`` algorithm as output
    dictionary format is the same as used by that implementation.
    """
    args, config = bin_utils.utility_main_helper(default_config, description,
                                                 extend_parser)
    log = logging.getLogger(__name__)

    #
    # Load configuration contents
    #
    uuid_list_filepath = args.uuids_list
    hash2uuids_input_filepath = args.input_hash2uuids
    hash2uuids_output_filepath = args.output_hash2uuids
    report_interval = config['utility']['report_interval']
    use_multiprocessing = config['utility']['use_multiprocessing']
    pickle_protocol = config['utility']['pickle_protocol']

    #
    # Checking parameters
    #
    if not hash2uuids_output_filepath:
        raise ValueError("No hash2uuids map output file provided!")

    #
    # Loading stuff
    #
    log.info("Loading descriptor index")
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(
        config['plugins']['descriptor_index'],
        get_descriptor_index_impls()
    )
    log.info("Loading LSH functor")
    #: :type: smqtk.algorithms.LshFunctor
    lsh_functor = plugin.from_plugin_config(
        config['plugins']['lsh_functor'],
        get_lsh_functor_impls()
    )

    # Iterate either over what's in the file given, or everything in the
    # configured index.
    def iter_uuids():
        if uuid_list_filepath:
            log.info("Using UUIDs list file")
            with open(uuid_list_filepath) as f:
                # Renamed ambiguous loop variable 'l' (E741).
                for line in f:
                    yield line.strip()
        else:
            # Fixed log message typo: "resent" -> "present".
            log.info("Using all UUIDs present in descriptor index")
            for k in descriptor_index.iterkeys():
                yield k

    # Load map if it exists, else start with an empty dictionary.
    if hash2uuids_input_filepath and os.path.isfile(hash2uuids_input_filepath):
        log.info("Loading hash2uuids mapping")
        # BUG FIX: open in binary mode to match the binary-mode dump below;
        # a text-mode read corrupts pickles written with a binary protocol.
        with open(hash2uuids_input_filepath, 'rb') as f:
            hash2uuids = cPickle.load(f)
    else:
        log.info("Creating new hash2uuids mapping for output")
        hash2uuids = {}

    #
    # Compute codes
    #
    log.info("Starting hash code computation")
    compute_hash_codes(
        uuids_for_processing(iter_uuids(), hash2uuids),
        descriptor_index, lsh_functor,
        hash2uuids,
        report_interval=report_interval,
        use_mp=use_multiprocessing,
    )

    #
    # Output results
    #
    # Write to a temp path first, then atomically rename over the target so
    # a crash mid-write cannot leave a truncated model file.
    tmp_output_filepath = hash2uuids_output_filepath + '.WRITING'
    log.info("Writing hash-to-uuids map to disk: %s", tmp_output_filepath)
    file_utils.safe_create_dir(os.path.dirname(hash2uuids_output_filepath))
    with open(tmp_output_filepath, 'wb') as f:
        cPickle.dump(hash2uuids, f, pickle_protocol)
    log.info("Moving on top of input: %s", hash2uuids_output_filepath)
    os.rename(tmp_output_filepath, hash2uuids_output_filepath)
    log.info("Done")
def main():
    description = """
    Compute LSH hash codes based on the provided functor on specific
    descriptors from the configured index given a file-list of UUIDs.

    When using an input file-list of UUIDs, we require that the UUIDs of
    indexed descriptors be strings, or equality comparable to the UUIDs'
    string representation.

    This script can be used to live update the ``hash2uuid_cache_filepath``
    model file for the ``LSHNearestNeighborIndex`` algorithm as output
    dictionary format is the same as used by that implementation.
    """
    args, config = bin_utils.utility_main_helper(default_config, description,
                                                 extend_parser)
    log = logging.getLogger(__name__)

    #
    # Load configuration contents
    #
    uuid_list_filepath = args.uuids_list
    hash2uuids_input_filepath = args.input_hash2uuids
    hash2uuids_output_filepath = args.output_hash2uuids
    report_interval = config['utility']['report_interval']
    use_multiprocessing = config['utility']['use_multiprocessing']
    pickle_protocol = config['utility']['pickle_protocol']

    #
    # Checking parameters
    #
    if not hash2uuids_output_filepath:
        raise ValueError("No hash2uuids map output file provided!")

    #
    # Loading stuff
    #
    log.info("Loading descriptor index")
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(
        config['plugins']['descriptor_index'],
        get_descriptor_index_impls())
    log.info("Loading LSH functor")
    #: :type: smqtk.algorithms.LshFunctor
    lsh_functor = plugin.from_plugin_config(config['plugins']['lsh_functor'],
                                            get_lsh_functor_impls())

    # Iterate either over what's in the file given, or everything in the
    # configured index.
    def iter_uuids():
        if uuid_list_filepath:
            log.info("Using UUIDs list file")
            with open(uuid_list_filepath) as f:
                # Renamed ambiguous loop variable 'l' (E741).
                for line in f:
                    yield line.strip()
        else:
            # Fixed log message typo: "resent" -> "present".
            log.info("Using all UUIDs present in descriptor index")
            for k in descriptor_index.iterkeys():
                yield k

    # Load map if it exists, else start with an empty dictionary.
    if hash2uuids_input_filepath and os.path.isfile(hash2uuids_input_filepath):
        log.info("Loading hash2uuids mapping")
        # BUG FIX: open in binary mode to match the binary-mode dump below;
        # a text-mode read corrupts pickles written with a binary protocol.
        with open(hash2uuids_input_filepath, 'rb') as f:
            hash2uuids = cPickle.load(f)
    else:
        log.info("Creating new hash2uuids mapping for output")
        hash2uuids = {}

    #
    # Compute codes
    #
    log.info("Starting hash code computation")
    compute_hash_codes(
        uuids_for_processing(iter_uuids(), hash2uuids),
        descriptor_index, lsh_functor,
        hash2uuids,
        report_interval=report_interval,
        use_mp=use_multiprocessing,
    )

    #
    # Output results
    #
    # Write to a temp path first, then atomically rename over the target so
    # a crash mid-write cannot leave a truncated model file.
    tmp_output_filepath = hash2uuids_output_filepath + '.WRITING'
    log.info("Writing hash-to-uuids map to disk: %s", tmp_output_filepath)
    file_utils.safe_create_dir(os.path.dirname(hash2uuids_output_filepath))
    with open(tmp_output_filepath, 'wb') as f:
        cPickle.dump(hash2uuids, f, pickle_protocol)
    log.info("Moving on top of input: %s", hash2uuids_output_filepath)
    os.rename(tmp_output_filepath, hash2uuids_output_filepath)
    log.info("Done")