Example #1
def main():
    args = cli_parser().parse_args()
    config = cli.utility_main_helper(default_config, args)
    log = logging.getLogger(__name__)

    paths_file = args.paths_file
    after_time = args.after_time
    before_time = args.before_time

    #
    # Check dir/file locations
    #
    if paths_file is None:
        raise ValueError("Need a file path to to output transferred file "
                         "paths!")

    safe_create_dir(os.path.dirname(paths_file))

    #
    # Start collection
    #
    remote_paths = solr_image_paths(config['solr_address'], after_time or '*',
                                    before_time or '*',
                                    config['solr_username'],
                                    config['solr_password'],
                                    config['batch_size'])

    log.info("Writing file paths")
    with open(paths_file, 'w') as of:
        pr = cli.ProgressReporter(log.info, 1.0).start()
        for rp in remote_paths:
            of.write(rp + '\n')
            pr.increment_report()
        pr.report()
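Example #1 reads its Solr connection settings from the configuration dictionary returned by cli.utility_main_helper. A minimal sketch of the keys the script accesses, with values for illustration only (the real defaults come from this script's default_config, which is not shown here):

# Illustrative only: key names are taken from the code above; the values are
# placeholder assumptions, not the script's actual defaults.
example_config = {
    'solr_address': 'http://localhost:8983/solr/images',  # assumed URL
    'solr_username': None,
    'solr_password': None,
    'batch_size': 500,                                     # assumed value
}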
Example #2
def main():
    args = cli_parser().parse_args()
    config = cli.utility_main_helper(default_config, args)
    log = logging.getLogger(__name__)

    api_root = config['tool']['girder_api_root']
    api_key = config['tool']['api_key']
    api_query_batch = config['tool']['api_query_batch']
    insert_batch_size = config['tool']['dataset_insert_batch_size']

    # Collect folder/item/file ID references from the command line and from
    # any referenced list files.
    #: :type: list[str]
    ids_folder = args.folder
    #: :type: list[str]
    ids_item = args.item
    #: :type: list[str]
    ids_file = args.file

    if args.folder_list:
        with open(args.folder_list) as f:
            ids_folder.extend([fid.strip() for fid in f])
    if args.item_list:
        with open(args.item_list) as f:
            ids_item.extend([iid.strip() for iid in f])
    if args.file_list:
        with open(args.file_list) as f:
            ids_file.extend([fid.strip() for fid in f])

    #: :type: smqtk.representation.DataSet
    data_set = from_config_dict(config['plugins']['data_set'],
                                DataSet.get_impls())

    batch = collections.deque()
    pr = cli.ProgressReporter(log.info, 1.0).start()
    for e in find_girder_files(api_root, ids_folder, ids_item, ids_file,
                               api_key, api_query_batch):
        batch.append(e)
        if insert_batch_size and len(batch) >= insert_batch_size:
            data_set.add_data(*batch)
            batch.clear()
        pr.increment_report()
    pr.report()

    if batch:
        data_set.add_data(*batch)

    log.info('Done')
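The deque-based buffering in Example #2 is a general batched-insert pattern: accumulate elements, flush them to a bulk add call whenever a threshold is reached, and flush the remainder once at the end. A minimal standalone sketch of that pattern; batched_insert and sink_add are hypothetical names, not part of the code above:

import collections

def batched_insert(items, sink_add, batch_size):
    # Buffer items and pass them to sink_add in groups of batch_size.
    # A falsy batch_size defers everything to the single final flush.
    batch = collections.deque()
    for item in items:
        batch.append(item)
        if batch_size and len(batch) >= batch_size:
            sink_add(*batch)
            batch.clear()
    if batch:
        sink_add(*batch)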
Example #3
def mb_kmeans_build_apply(index, mbkm, initial_fit_size):
    """
    Build MiniBatchKMeans centroids from the descriptors in the given index,
    then predict each descriptor's cluster with the resulting model.

    If the given index is empty, no fitting or clustering occurs and an empty
    dictionary is returned.

    :param index: Index of descriptors
    :type index: smqtk.representation.DescriptorIndex

    :param mbkm: Scikit-Learn MiniBatchKMeans instance to train and then use
        for prediction
    :type mbkm: sklearn.cluster.MiniBatchKMeans

    :param initial_fit_size: Number of descriptors to use for an initial full
        fit. This lets the model choose the best initialization point from
        multiple candidates.
    :type initial_fit_size: int

    :return: Dictionary of the cluster label (integer) to the set of descriptor
        UUIDs belonging to that cluster.
    :rtype: dict[int, set[collections.Hashable]]

    """
    log = logging.getLogger(__name__)

    ifit_completed = False
    k_deque = collections.deque()
    d_fitted = 0

    log.info("Getting index keys (shuffled)")
    index_keys = sorted(six.iterkeys(index))
    numpy.random.seed(mbkm.random_state)
    numpy.random.shuffle(index_keys)

    def parallel_iter_vectors(descriptors):
        """ Get the vectors for the descriptors given.
        Not caring about order returned.
        """
        return parallel.parallel_map(lambda d: d.vector(), descriptors,
                                     use_multiprocessing=False)

    def get_vectors(k_iter):
        """ Get numpy array of descriptor vectors (2D array returned) """
        return numpy.array(list(
            parallel_iter_vectors(index.get_many_descriptors(k_iter))
        ))

    log.info("Collecting iteratively fitting model")
    pr = cli.ProgressReporter(log.debug, 1.0).start()
    for i, k in enumerate(index_keys):
        k_deque.append(k)
        pr.increment_report()

        if initial_fit_size and not ifit_completed:
            if len(k_deque) == initial_fit_size:
                log.info("Initial fit using %d descriptors", len(k_deque))
                log.info("- collecting vectors")
                vectors = get_vectors(k_deque)
                log.info("- fitting model")
                mbkm.fit(vectors)
                log.info("- cleaning")
                d_fitted += len(vectors)
                k_deque.clear()
                ifit_completed = True
        elif len(k_deque) == mbkm.batch_size:
            log.info("Partial fit with batch size %d", len(k_deque))
            log.info("- collecting vectors")
            vectors = get_vectors(k_deque)
            log.info("- fitting model")
            mbkm.partial_fit(vectors)
            log.info("- cleaning")
            d_fitted += len(k_deque)
            k_deque.clear()
    pr.report()

    # Final fit with any remaining descriptors
    if k_deque:
        log.info("Final partial fit of size %d", len(k_deque))
        log.info('- collecting vectors')
        vectors = get_vectors(k_deque)
        log.info('- fitting model')
        mbkm.partial_fit(vectors)
        log.info('- cleaning')
        d_fitted += len(k_deque)
        k_deque.clear()

    log.info("Computing descriptor classes with final KMeans model")
    mbkm.verbose = False
    d_classes = collections.defaultdict(set)
    d_uv_iter = parallel.parallel_map(lambda d: (d.uuid(), d.vector()),
                                      index,
                                      use_multiprocessing=False,
                                      name="uv-collector")
    # TODO: Batch predict call inputs to something larger than one at a time.
    d_uc_iter = parallel.parallel_map(
        lambda u_v: (u_v[0], mbkm.predict(u_v[1][numpy.newaxis, :])[0]),
        d_uv_iter,
        use_multiprocessing=False,
        name="uc-collector")
    pr = cli.ProgressReporter(log.debug, 1.0).start()
    for uuid, c in d_uc_iter:
        d_classes[c].add(uuid)
        pr.increment_report()
    pr.report()

    return d_classes
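A hedged usage sketch for mb_kmeans_build_apply. Only the scikit-learn constructor call is concrete; descriptor_index stands in for an already-populated smqtk descriptor index and is an assumption:

from sklearn.cluster import MiniBatchKMeans

# descriptor_index is assumed to be a populated smqtk descriptor index.
mbkm = MiniBatchKMeans(n_clusters=256, batch_size=1000, random_state=0)
cluster_map = mb_kmeans_build_apply(descriptor_index, mbkm,
                                    initial_fit_size=10000)
for label, uuid_set in cluster_map.items():
    print(label, len(uuid_set))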
Example #4
def compute_hash_codes(uuids, index, functor, report_interval=1.0,
                       use_mp=False, ordered=False):
    """
    Given an iterable of DescriptorElement UUIDs, asynchronously access them
    from the given ``index``, asynchronously compute hash codes via ``functor``
    and convert to an integer, yielding (UUID, hash-int) pairs.

    :param uuids: Sequence of UUIDs to process
    :type uuids: collections.Iterable[collections.Hashable]

    :param index: Descriptor index to pull from.
    :type index: smqtk.representation.descriptor_index.DescriptorIndex

    :param functor: LSH hash code functor instance
    :type functor: smqtk.algorithms.LshFunctor

    :param report_interval: Frequency in seconds at which we report speed and
        completion progress via logging. Reporting is disabled when the logger
        is not at DEBUG level or when this value is not greater than 0.
    :type report_interval: float

    :param use_mp: If multiprocessing should be used for parallel
        computation vs. threading. Reminder: This will copy currently loaded
        objects onto worker processes (e.g. the given index), which could lead
        to dangerously high RAM consumption.
    :type use_mp: bool

    :param ordered: Whether the yielded (UUID, hash) pairs must be in the same
        order as the input UUID values. This function is slightly faster when
        ordering is not required.
    :type ordered: bool

    :return: Generator instance yielding (UUID, hash-int) value pairs.

    """
    # TODO: parallel map fetch elements from index?
    #       -> separately from compute

    def get_hash(u):
        v = index.get_descriptor(u).vector()
        return u, bits.bit_vector_to_int_large(functor.get_hash(v))

    # Setup log and reporting function
    log = logging.getLogger(__name__)

    if log.getEffectiveLevel() > logging.DEBUG or report_interval <= 0:
        def log_func(*_, **__):
            return
        log.debug("Not logging progress")
    else:
        log.debug("Logging progress at %f second intervals", report_interval)
        log_func = log.debug

    log.debug("Starting computation")
    reporter = cli.ProgressReporter(log_func, report_interval)
    reporter.start()
    for uuid, hash_int in parallel.parallel_map(get_hash, uuids,
                                                ordered=ordered,
                                                use_multiprocessing=use_mp):
        yield (uuid, hash_int)
        # Progress reporting
        reporter.increment_report()

    # Final report
    reporter.report()
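A hedged usage sketch for compute_hash_codes; descriptor_index, lsh_functor, and uuid_list are assumed, pre-configured objects rather than anything defined above:

# Assumed: descriptor_index is a populated smqtk descriptor index,
# lsh_functor is a configured smqtk.algorithms.LshFunctor implementation,
# and uuid_list is an iterable of UUIDs present in the index.
hash_map = dict(
    compute_hash_codes(uuid_list, descriptor_index, lsh_functor,
                       report_interval=1.0, use_mp=False)
)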
Example #5
def main():
    args = cli_parser().parse_args()
    config = cli.utility_main_helper(default_config, args)
    log = logging.getLogger(__name__)

    # - parallel_map UUIDs to load from the configured index
    # - classify iterated descriptors

    uuids_list_filepath = args.uuids_list
    output_csv_filepath = args.csv_data
    output_csv_header_filepath = args.csv_header
    classify_overwrite = config['utility']['classify_overwrite']

    p_use_multiprocessing = \
        config['utility']['parallel']['use_multiprocessing']
    p_index_extraction_cores = \
        config['utility']['parallel']['index_extraction_cores']
    p_classification_cores = \
        config['utility']['parallel']['classification_cores']

    if not uuids_list_filepath:
        raise ValueError("No uuids_list_filepath specified.")
    elif not os.path.isfile(uuids_list_filepath):
        raise ValueError("Given uuids_list_filepath did not point to a file.")
    if output_csv_header_filepath is None:
        raise ValueError("Need a path to save CSV header labels")
    if output_csv_filepath is None:
        raise ValueError("Need a path to save CSV data.")

    #
    # Initialize configured plugins
    #

    log.info("Initializing descriptor index")
    #: :type: smqtk.representation.DescriptorSet
    descriptor_set = from_config_dict(config['plugins']['descriptor_set'],
                                      DescriptorSet.get_impls())

    log.info("Initializing classification factory")
    c_factory = ClassificationElementFactory.from_config(
        config['plugins']['classification_factory'])

    log.info("Initializing classifier")
    #: :type: smqtk.algorithms.Classifier
    classifier = from_config_dict(config['plugins']['classifier'],
                                  Classifier.get_impls())

    #
    # Setup/Process
    #
    def iter_uuids():
        with open(uuids_list_filepath) as f:
            for line in f:
                yield line.strip()

    def descr_for_uuid(uuid):
        """
        :type uuid: collections.Hashable
        :rtype: smqtk.representation.DescriptorElement
        """
        return descriptor_set.get_descriptor(uuid)

    def classify_descr(d):
        """
        :type d: smqtk.representation.DescriptorElement
        :rtype: smqtk.representation.ClassificationElement
        """
        return classifier.classify_one_element(d, c_factory,
                                               classify_overwrite)

    log.info("Initializing uuid-to-descriptor parallel map")
    #: :type: collections.Iterable[smqtk.representation.DescriptorElement]
    element_iter = parallel.parallel_map(
        descr_for_uuid,
        iter_uuids(),
        use_multiprocessing=p_use_multiprocessing,
        cores=p_index_extraction_cores,
        name="descr_for_uuid",
    )

    log.info("Initializing descriptor-to-classification parallel map")
    #: :type: collections.Iterable[smqtk.representation.ClassificationElement]
    classification_iter = parallel.parallel_map(
        classify_descr,
        element_iter,
        use_multiprocessing=p_use_multiprocessing,
        cores=p_classification_cores,
        name='classify_descr',
    )

    #
    # Write/Output files
    #

    c_labels = classifier.get_labels()

    def make_row(e):
        """
        :type e: smqtk.representation.ClassificationElement
        """
        c_m = e.get_classification()
        return [e.uuid] + [c_m[l] for l in c_labels]

    # column labels file
    log.info("Writing CSV column header file: %s", output_csv_header_filepath)
    safe_create_dir(os.path.dirname(output_csv_header_filepath))
    with open(output_csv_header_filepath, 'wb') as f_csv:
        w = csv.writer(f_csv)
        w.writerow(['uuid'] + [str(cl) for cl in c_labels])

    # CSV file
    log.info("Writing CSV data file: %s", output_csv_filepath)
    safe_create_dir(os.path.dirname(output_csv_filepath))
    pr = cli.ProgressReporter(log.info, 1.0)
    pr.start()
    with open(output_csv_filepath, 'wb') as f_csv:
        w = csv.writer(f_csv)
        for c in classification_iter:
            w.writerow(make_row(c))
            pr.increment_report()
        pr.report()

    log.info("Done")
Example #6
    def classify_async(self,
                       d_iter,
                       factory=DFLT_CLASSIFIER_FACTORY,
                       overwrite=False,
                       procs=None,
                       use_multiprocessing=False,
                       ri=None):
        """
        Asynchronously classify the DescriptorElements in the given iterable.

        :param d_iter: Iterable of DescriptorElements
        :type d_iter:
            collections.Iterable[smqtk.representation.DescriptorElement]

        :param factory: Classifier element factory to use for element
            generation. The default factory yields MemoryClassificationElement
            instances.
        :type factory: smqtk.representation.ClassificationElementFactory

        :param overwrite: Recompute the classification of each input descriptor
            and overwrite any result already stored in the ClassificationElement
            produced by the factory.
        :type overwrite: bool

        :param procs: Explicit number of cores/thread/processes to use.
        :type procs: None | int

        :param use_multiprocessing: Use multiprocessing instead of threading.
        :type use_multiprocessing: bool

        :param ri: Progress reporting interval in seconds. Set to a value > 0 to
            enable. Disabled by default.
        :type ri: float | None

        :return: Mapping of input DescriptorElement instances to the computed
            ClassificationElement. Each ClassificationElement's UUID matches
            the UUID of its source DescriptorElement.
        :rtype: dict[smqtk.representation.DescriptorElement,
                     smqtk.representation.ClassificationElement]

        """
        self._log.debug("Async classifying descriptors")
        ri = ri and ri > 0 and ri

        def work(d_elem):
            return d_elem, self.classify(d_elem, factory, overwrite)

        classifications = parallel.parallel_map(
            work,
            d_iter,
            cores=procs,
            ordered=False,
            use_multiprocessing=use_multiprocessing,
        )

        pr = None
        if ri:
            pr = cli.ProgressReporter(self._log.debug, ri).start()

        d2c_map = {}
        for d, c in classifications:
            d2c_map[d] = c
            pr and pr.increment_report()
        pr and pr.report()

        return d2c_map
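A hedged usage sketch for classify_async; classifier and descriptor_elems are assumed to be a trained smqtk Classifier implementation and an iterable of DescriptorElement instances, respectively:

# Assumed: classifier is a trained smqtk Classifier implementation and
# descriptor_elems is an iterable of DescriptorElement instances.
d2c = classifier.classify_async(descriptor_elems,
                                use_multiprocessing=False,
                                ri=1.0)  # report progress every second
for descr, classification in d2c.items():
    print(descr.uuid(), classification.get_classification())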