Exemple #1
0
def compute_hash_codes(task, folderId, **kwargs):
    """
    Celery task for computing hash codes on a given folder (descriptor index).

    Every key of the folder's descriptor index is hashed with an ``ItqFunctor``
    whose mean vector and rotation caches point at the ``mean_vec.npy`` and
    ``rotation.npy`` files of the folder's ``.smqtk`` sub-folder. The result is
    pickled and uploaded to the ``hash2uuids.pickle`` item of that sub-folder.

    The uploaded object is a ``dict`` mapping each hash code to the ``set`` of
    all UUIDs that produced it. Several descriptors can share one hash code, so
    a plain hash -> UUID mapping would silently drop all but one of them.

    :param task: Celery provided task object.
    :param folderId: The folder whose descriptor index is hashed; it is also
        used to locate the ``.smqtk`` sub-folder.
    :param kwargs: Extra keyword arguments; accepted but not used here.
    """
    task.job_manager.updateProgress(message='Computing Hash Codes', forceFlush=True)

    gc = task.girder_client
    index = descriptorIndexFromFolderId(gc, folderId)

    smqtkFolder = getCreateFolder(gc, folderId, '.smqtk')

    meanVecFileId = smqtkFileIdFromName(gc, smqtkFolder, 'mean_vec.npy')
    rotationFileId = smqtkFileIdFromName(gc, smqtkFolder, 'rotation.npy')
    hash2uuidsFile = initializeItemWithFile(
        gc, createOverwriteItem(gc, smqtkFolder['_id'], 'hash2uuids.pickle'))

    # Both data elements talk to the same Girder instance with the same token.
    apiUrl = task.request.apiUrl
    token = task.request.jobInfoSpec['headers']['Girder-Token']
    functor = ItqFunctor(
        mean_vec_cache=GirderDataElement(meanVecFileId, api_root=apiUrl, token=token),
        rotation_cache=GirderDataElement(rotationFileId, api_root=apiUrl, token=token))

    # compute_hash_codes yields (uuid, hash code) pairs; group the UUIDs by
    # hash code so colliding descriptors are all retained.
    hash2uuids = {}
    for uuid, hashCode in compute_functions.compute_hash_codes(
            index.iterkeys(), index, functor, use_mp=False):
        hash2uuids.setdefault(hashCode, set()).add(uuid)

    data = pickle.dumps(hash2uuids)
    gc.uploadFileContents(hash2uuidsFile['_id'], six.BytesIO(data), len(data))
Exemple #2
0
def compute_hash_codes(task, folderId, **kwargs):
    """
    Celery task for computing hash codes on a given folder (descriptor index).

    The keys of the folder's descriptor index are hashed with an
    ``ItqFunctor`` backed by the ``mean_vec.npy`` and ``rotation.npy`` files
    of the folder's ``.smqtk`` sub-folder, and the pickled result is uploaded
    to the ``hash2uuids.pickle`` item in that sub-folder.

    The pickled object maps every hash code to the ``set`` of UUIDs that
    hashed to it. Distinct descriptors may share a hash code, so storing a
    single UUID per code would lose the colliding entries.

    :param task: Celery provided task object.
    :param folderId: The folder whose descriptor index is hashed; it is also
        used to locate the ``.smqtk`` sub-folder.
    :param kwargs: Extra keyword arguments; accepted but not used here.
    """
    task.job_manager.updateProgress(message='Computing Hash Codes',
                                    forceFlush=True)

    client = task.girder_client
    index = descriptorIndexFromFolderId(client, folderId)

    smqtkFolder = getCreateFolder(client, folderId, '.smqtk')

    meanVecFileId = smqtkFileIdFromName(client, smqtkFolder, 'mean_vec.npy')
    rotationFileId = smqtkFileIdFromName(client, smqtkFolder, 'rotation.npy')
    hash2uuidsFile = initializeItemWithFile(
        client,
        createOverwriteItem(client, smqtkFolder['_id'], 'hash2uuids.pickle'))

    # Connection details shared by both cached model elements.
    elementKwargs = {
        'api_root': task.request.apiUrl,
        'token': task.request.jobInfoSpec['headers']['Girder-Token'],
    }
    functor = ItqFunctor(
        mean_vec_cache=GirderDataElement(meanVecFileId, **elementKwargs),
        rotation_cache=GirderDataElement(rotationFileId, **elementKwargs))

    uuidHashPairs = compute_functions.compute_hash_codes(index.iterkeys(),
                                                         index,
                                                         functor,
                                                         use_mp=False)

    # Accumulate every UUID under its hash code instead of letting later
    # pairs overwrite earlier ones that share the same code.
    hash2uuids = {}
    for uuid, hashCode in uuidHashPairs:
        hash2uuids.setdefault(hashCode, set()).add(uuid)

    data = pickle.dumps(hash2uuids)
    client.uploadFileContents(hash2uuidsFile['_id'], six.BytesIO(data),
                              len(data))
Exemple #3
0
def main():
    """
    Compute LSH hash codes for descriptors and merge them into the configured
    hash-to-UUIDs key/value store.

    UUIDs to process come from the optional UUIDs list file (one UUID per
    line, blank lines ignored) or, when no file is given, from every key of
    the configured descriptor index. Newly computed codes are merged with any
    UUID sets already in the key/value store and written back in one batch.

    :raises ValueError: A UUIDs list file path was given but is not an
        existing file.
    """
    args = cli_parser().parse_args()
    config = cli.utility_main_helper(default_config, args)
    log = logging.getLogger(__name__)

    #
    # Load configuration contents
    #
    uuid_list_filepath = args.uuids_list
    report_interval = config['utility']['report_interval']
    use_multiprocessing = config['utility']['use_multiprocessing']

    #
    # Checking input parameters
    #
    if (uuid_list_filepath is not None) and \
            not os.path.isfile(uuid_list_filepath):
        raise ValueError("UUIDs list file does not exist!")

    #
    # Loading stuff
    #
    log.info("Loading descriptor index")
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = from_config_dict(config['plugins']['descriptor_index'],
                                        DescriptorIndex.get_impls())
    log.info("Loading LSH functor")
    #: :type: smqtk.algorithms.LshFunctor
    lsh_functor = from_config_dict(config['plugins']['lsh_functor'],
                                   LshFunctor.get_impls())
    log.info("Loading Key/Value store")
    #: :type: smqtk.representation.KeyValueStore
    hash2uuids_kvstore = from_config_dict(
        config['plugins']['hash2uuid_kvstore'], KeyValueStore.get_impls())

    # Iterate either over what's in the file given, or everything in the
    # configured index.
    def iter_uuids():
        if uuid_list_filepath:
            log.info("Using UUIDs list file")
            with open(uuid_list_filepath) as f:
                for line in f:
                    uuid_str = line.strip()
                    # A blank line carries no UUID; yielding '' would only
                    # trigger a lookup of a non-existent descriptor later.
                    if uuid_str:
                        yield uuid_str
        else:
            log.info("Using all UUIDs present in descriptor index")
            for k in descriptor_index.keys():
                yield k

    #
    # Compute codes
    #
    log.info("Starting hash code computation")
    kv_update = {}
    for uuid, hash_int in \
            compute_hash_codes(uuids_for_processing(iter_uuids(),
                                                    hash2uuids_kvstore),
                               descriptor_index, lsh_functor,
                               report_interval,
                               use_multiprocessing, True):
        # Get original value in KV-store if not in update dict.
        if hash_int not in kv_update:
            kv_update[hash_int] = hash2uuids_kvstore.get(hash_int, set())
        kv_update[hash_int] |= {uuid}

    if kv_update:
        # Pass the count as a logging argument so formatting is deferred to
        # the logging framework.
        log.info("Updating KV store... (%d keys)", len(kv_update))
        hash2uuids_kvstore.add_many(kv_update)

    log.info("Done")
Exemple #4
0
def main():
    """
    Compute LSH hash codes for descriptors and merge them into the configured
    hash-to-UUIDs key/value store.

    The UUIDs to hash are read from the optional UUIDs list file (one UUID per
    line, blank lines ignored); without such a file every key of the
    configured descriptor index is used. Computed codes are combined with the
    UUID sets already present in the key/value store and then stored with a
    single ``add_many`` call.

    :raises ValueError: A UUIDs list file path was given but is not an
        existing file.
    """
    args = cli_parser().parse_args()
    config = bin_utils.utility_main_helper(default_config, args)
    log = logging.getLogger(__name__)

    #
    # Load configuration contents
    #
    uuid_list_filepath = args.uuids_list
    report_interval = config['utility']['report_interval']
    use_multiprocessing = config['utility']['use_multiprocessing']

    #
    # Checking input parameters
    #
    if (uuid_list_filepath is not None) and \
            not os.path.isfile(uuid_list_filepath):
        raise ValueError("UUIDs list file does not exist!")

    #
    # Loading stuff
    #
    log.info("Loading descriptor index")
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(
        config['plugins']['descriptor_index'],
        get_descriptor_index_impls()
    )
    log.info("Loading LSH functor")
    #: :type: smqtk.algorithms.LshFunctor
    lsh_functor = plugin.from_plugin_config(
        config['plugins']['lsh_functor'],
        get_lsh_functor_impls()
    )
    log.info("Loading Key/Value store")
    #: :type: smqtk.representation.KeyValueStore
    hash2uuids_kvstore = plugin.from_plugin_config(
        config['plugins']['hash2uuid_kvstore'],
        get_key_value_store_impls()
    )

    # Iterate either over what's in the file given, or everything in the
    # configured index.
    def iter_uuids():
        if uuid_list_filepath:
            log.info("Using UUIDs list file")
            with open(uuid_list_filepath) as f:
                for line in f:
                    uuid_str = line.strip()
                    # Skip blank lines: an empty string is not a UUID and
                    # would otherwise be passed on for hashing.
                    if uuid_str:
                        yield uuid_str
        else:
            log.info("Using all UUIDs present in descriptor index")
            for k in descriptor_index.keys():
                yield k

    #
    # Compute codes
    #
    log.info("Starting hash code computation")
    kv_update = {}
    for uuid, hash_int in \
            compute_hash_codes(uuids_for_processing(iter_uuids(),
                                                    hash2uuids_kvstore),
                               descriptor_index, lsh_functor,
                               report_interval,
                               use_multiprocessing, True):
        # Get original value in KV-store if not in update dict.
        if hash_int not in kv_update:
            kv_update[hash_int] = hash2uuids_kvstore.get(hash_int, set())
        kv_update[hash_int] |= {uuid}

    if kv_update:
        # Let the logging framework do the formatting (skipped entirely when
        # INFO is disabled).
        log.info("Updating KV store... (%d keys)", len(kv_update))
        hash2uuids_kvstore.add_many(kv_update)

    log.info("Done")
Exemple #5
0
def main():
    """
    Compute LSH hash codes and write the hash-to-UUIDs map to a pickle file.

    An existing map is loaded from the input file when one is given and
    exists; otherwise an empty map is used. Codes are computed for the UUIDs
    from the optional UUIDs list file (one per line, blank lines ignored) or
    for every key of the configured descriptor index. The updated map is
    pickled to ``<output>.WRITING`` and then renamed onto the output path.

    :raises ValueError: No output file path was provided.
    """
    description = """
    Compute LSH hash codes based on the provided functor on specific
    descriptors from the configured index given a file-list of UUIDs.

    When using an input file-list of UUIDs, we require that the UUIDs of
    indexed descriptors be strings, or equality comparable to the UUIDs' string
    representation.

    This script can be used to live update the ``hash2uuid_cache_filepath``
    model file for the ``LSHNearestNeighborIndex`` algorithm as output
    dictionary format is the same as used by that implementation.
    """
    args, config = bin_utils.utility_main_helper(default_config, description,
                                                 extend_parser)
    log = logging.getLogger(__name__)

    #
    # Load configuration contents
    #
    uuid_list_filepath = args.uuids_list
    hash2uuids_input_filepath = args.input_hash2uuids
    hash2uuids_output_filepath = args.output_hash2uuids
    report_interval = config['utility']['report_interval']
    use_multiprocessing = config['utility']['use_multiprocessing']
    pickle_protocol = config['utility']['pickle_protocol']

    #
    # Checking parameters
    #
    if not hash2uuids_output_filepath:
        raise ValueError("No hash2uuids map output file provided!")

    #
    # Loading stuff
    #
    log.info("Loading descriptor index")
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(
        config['plugins']['descriptor_index'],
        get_descriptor_index_impls()
    )
    log.info("Loading LSH functor")
    #: :type: smqtk.algorithms.LshFunctor
    lsh_functor = plugin.from_plugin_config(
        config['plugins']['lsh_functor'],
        get_lsh_functor_impls()
    )

    # Iterate either over the UUIDs in the given list file, or over every key
    # of the configured index.
    def iter_uuids():
        if uuid_list_filepath:
            log.info("Using UUIDs list file")
            with open(uuid_list_filepath) as f:
                for line in f:
                    uuid_str = line.strip()
                    # Blank lines hold no UUID; do not pass '' downstream.
                    if uuid_str:
                        yield uuid_str
        else:
            log.info("Using all UUIDs present in descriptor index")
            for k in descriptor_index.iterkeys():
                yield k

    # load map if it exists, else start with empty dictionary
    if hash2uuids_input_filepath and os.path.isfile(hash2uuids_input_filepath):
        log.info("Loading hash2uuids mapping")
        # Binary mode is required: the map is written below with 'wb' and a
        # configurable (possibly binary) pickle protocol.
        # NOTE(review): unpickling can execute arbitrary code; the input file
        # must come from a trusted source.
        with open(hash2uuids_input_filepath, 'rb') as f:
            hash2uuids = cPickle.load(f)
    else:
        log.info("Creating new hash2uuids mapping for output")
        hash2uuids = {}

    #
    # Compute codes
    #
    log.info("Starting hash code computation")
    compute_hash_codes(
        uuids_for_processing(iter_uuids(), hash2uuids),
        descriptor_index,
        lsh_functor,
        hash2uuids,
        report_interval=report_interval,
        use_mp=use_multiprocessing,
    )

    #
    # Output results
    #
    # Write to a temporary sibling file first and rename afterwards, so the
    # output path is only replaced once the dump has finished.
    tmp_output_filepath = hash2uuids_output_filepath + '.WRITING'
    log.info("Writing hash-to-uuids map to disk: %s", tmp_output_filepath)
    file_utils.safe_create_dir(os.path.dirname(hash2uuids_output_filepath))
    with open(tmp_output_filepath, 'wb') as f:
        cPickle.dump(hash2uuids, f, pickle_protocol)
    log.info("Moving on top of input: %s", hash2uuids_output_filepath)
    os.rename(tmp_output_filepath, hash2uuids_output_filepath)
    log.info("Done")
Exemple #6
0
def main():
    """
    Compute LSH hash codes and write the hash-to-UUIDs map to a pickle file.

    The map starts from the input pickle file when one is given and exists,
    and from an empty dictionary otherwise. UUIDs to hash are taken from the
    optional UUIDs list file (one per line, blank lines ignored) or from all
    keys of the configured descriptor index. The resulting map is dumped to
    ``<output>.WRITING`` and then renamed onto the output path.

    :raises ValueError: No output file path was provided.
    """
    description = """
    Compute LSH hash codes based on the provided functor on specific
    descriptors from the configured index given a file-list of UUIDs.

    When using an input file-list of UUIDs, we require that the UUIDs of
    indexed descriptors be strings, or equality comparable to the UUIDs' string
    representation.

    This script can be used to live update the ``hash2uuid_cache_filepath``
    model file for the ``LSHNearestNeighborIndex`` algorithm as output
    dictionary format is the same as used by that implementation.
    """
    args, config = bin_utils.utility_main_helper(default_config, description,
                                                 extend_parser)
    log = logging.getLogger(__name__)

    #
    # Load configuration contents
    #
    uuid_list_filepath = args.uuids_list
    hash2uuids_input_filepath = args.input_hash2uuids
    hash2uuids_output_filepath = args.output_hash2uuids
    report_interval = config['utility']['report_interval']
    use_multiprocessing = config['utility']['use_multiprocessing']
    pickle_protocol = config['utility']['pickle_protocol']

    #
    # Checking parameters
    #
    if not hash2uuids_output_filepath:
        raise ValueError("No hash2uuids map output file provided!")

    #
    # Loading stuff
    #
    log.info("Loading descriptor index")
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(
        config['plugins']['descriptor_index'], get_descriptor_index_impls())
    log.info("Loading LSH functor")
    #: :type: smqtk.algorithms.LshFunctor
    lsh_functor = plugin.from_plugin_config(config['plugins']['lsh_functor'],
                                            get_lsh_functor_impls())

    # Source of UUIDs: the list file when given, else all index keys.
    def iter_uuids():
        if uuid_list_filepath:
            log.info("Using UUIDs list file")
            with open(uuid_list_filepath) as f:
                for line in f:
                    uuid_str = line.strip()
                    # Ignore blank lines rather than yielding '' as a UUID.
                    if uuid_str:
                        yield uuid_str
        else:
            log.info("Using all UUIDs present in descriptor index")
            for k in descriptor_index.iterkeys():
                yield k

    # load map if it exists, else start with empty dictionary
    if hash2uuids_input_filepath and os.path.isfile(hash2uuids_input_filepath):
        log.info("Loading hash2uuids mapping")
        # Read in binary mode to mirror the 'wb' write below; text mode can
        # corrupt or reject pickles written with a binary protocol.
        # NOTE(review): unpickling can execute arbitrary code; only load map
        # files from a trusted source.
        with open(hash2uuids_input_filepath, 'rb') as f:
            hash2uuids = cPickle.load(f)
    else:
        log.info("Creating new hash2uuids mapping for output")
        hash2uuids = {}

    #
    # Compute codes
    #
    log.info("Starting hash code computation")
    compute_hash_codes(
        uuids_for_processing(iter_uuids(), hash2uuids),
        descriptor_index,
        lsh_functor,
        hash2uuids,
        report_interval=report_interval,
        use_mp=use_multiprocessing,
    )

    #
    # Output results
    #
    # Dump to a temporary sibling file and rename it into place afterwards,
    # so the output path is replaced only after the dump has completed.
    tmp_output_filepath = hash2uuids_output_filepath + '.WRITING'
    log.info("Writing hash-to-uuids map to disk: %s", tmp_output_filepath)
    file_utils.safe_create_dir(os.path.dirname(hash2uuids_output_filepath))
    with open(tmp_output_filepath, 'wb') as f:
        cPickle.dump(hash2uuids, f, pickle_protocol)
    log.info("Moving on top of input: %s", hash2uuids_output_filepath)
    os.rename(tmp_output_filepath, hash2uuids_output_filepath)
    log.info("Done")