def main():
    args = cli_parser().parse_args()
    config = cli.utility_main_helper(default_config, args)
    log = logging.getLogger(__name__)

    paths_file = args.paths_file
    after_time = args.after_time
    before_time = args.before_time

    #
    # Check dir/file locations
    #
    if paths_file is None:
        raise ValueError("Need a file path to output transferred file "
                         "paths!")

    safe_create_dir(os.path.dirname(paths_file))

    #
    # Start collection
    #
    remote_paths = solr_image_paths(
        config['solr_address'],
        after_time or '*', before_time or '*',
        config['solr_username'], config['solr_password'],
        config['batch_size']
    )

    log.info("Writing file paths")
    with open(paths_file, 'w') as of:
        pr = cli.ProgressReporter(log.info, 1.0).start()
        for rp in remote_paths:
            of.write(rp + '\n')
            pr.increment_report()
        pr.report()
def main():
    args = cli_parser().parse_args()
    config = cli.utility_main_helper(default_config, args)
    log = logging.getLogger(__name__)

    api_root = config['tool']['girder_api_root']
    api_key = config['tool']['api_key']
    api_query_batch = config['tool']['api_query_batch']
    insert_batch_size = config['tool']['dataset_insert_batch_size']

    # Collect folder/item/file ID references from the command line and any
    # referenced list files.
    #: :type: list[str]
    ids_folder = args.folder
    #: :type: list[str]
    ids_item = args.item
    #: :type: list[str]
    ids_file = args.file

    if args.folder_list:
        with open(args.folder_list) as f:
            ids_folder.extend([fid.strip() for fid in f])
    if args.item_list:
        with open(args.item_list) as f:
            ids_item.extend([iid.strip() for iid in f])
    if args.file_list:
        with open(args.file_list) as f:
            ids_file.extend([fid.strip() for fid in f])

    #: :type: smqtk.representation.DataSet
    data_set = from_config_dict(config['plugins']['data_set'],
                                DataSet.get_impls())

    batch = collections.deque()
    pr = cli.ProgressReporter(log.info, 1.0).start()
    for e in find_girder_files(api_root, ids_folder, ids_item, ids_file,
                               api_key, api_query_batch):
        batch.append(e)
        if insert_batch_size and len(batch) >= insert_batch_size:
            data_set.add_data(*batch)
            batch.clear()
        pr.increment_report()
    pr.report()

    if batch:
        data_set.add_data(*batch)

    log.info('Done')
def mb_kmeans_build_apply(index, mbkm, initial_fit_size):
    """
    Build the MiniBatchKMeans centroids based on the descriptors in the given
    index, then predict descriptor clusters with the final result model.

    If the given index is empty, no fitting or clustering occurs and an empty
    dictionary is returned.

    :param index: Index of descriptors
    :type index: smqtk.representation.DescriptorIndex

    :param mbkm: Scikit-Learn MiniBatchKMeans instance to train and then use
        for prediction
    :type mbkm: sklearn.cluster.MiniBatchKMeans

    :param initial_fit_size: Number of descriptors to run an initial fit
        with. This brings the advantage of choosing a best initialization
        point from multiple.
    :type initial_fit_size: int

    :return: Dictionary of the cluster label (integer) to the set of
        descriptor UUIDs belonging to that cluster.
    :rtype: dict[int, set[collections.Hashable]]
    """
    log = logging.getLogger(__name__)

    ifit_completed = False
    k_deque = collections.deque()
    d_fitted = 0

    log.info("Getting index keys (shuffled)")
    index_keys = sorted(six.iterkeys(index))
    numpy.random.seed(mbkm.random_state)
    numpy.random.shuffle(index_keys)

    def parallel_iter_vectors(descriptors):
        """ Get the vectors for the descriptors given. Not caring about order
        returned.
        """
        return parallel.parallel_map(lambda d: d.vector(), descriptors,
                                     use_multiprocessing=False)

    def get_vectors(k_iter):
        """ Get numpy array of descriptor vectors (2D array returned) """
        return numpy.array(list(
            parallel_iter_vectors(index.get_many_descriptors(k_iter))
        ))

    log.info("Collecting iteratively fitting model")
    pr = cli.ProgressReporter(log.debug, 1.0).start()
    for i, k in enumerate(index_keys):
        k_deque.append(k)
        pr.increment_report()

        if initial_fit_size and not ifit_completed:
            if len(k_deque) == initial_fit_size:
                log.info("Initial fit using %d descriptors", len(k_deque))
                log.info("- collecting vectors")
                vectors = get_vectors(k_deque)
                log.info("- fitting model")
                mbkm.fit(vectors)
                log.info("- cleaning")
                d_fitted += len(vectors)
                k_deque.clear()
                ifit_completed = True
        elif len(k_deque) == mbkm.batch_size:
            log.info("Partial fit with batch size %d", len(k_deque))
            log.info("- collecting vectors")
            vectors = get_vectors(k_deque)
            log.info("- fitting model")
            mbkm.partial_fit(vectors)
            log.info("- cleaning")
            d_fitted += len(k_deque)
            k_deque.clear()
    pr.report()

    # Final fit with any remaining descriptors
    if k_deque:
        log.info("Final partial fit of size %d", len(k_deque))
        log.info('- collecting vectors')
        vectors = get_vectors(k_deque)
        log.info('- fitting model')
        mbkm.partial_fit(vectors)
        log.info('- cleaning')
        d_fitted += len(k_deque)
        k_deque.clear()

    log.info("Computing descriptor classes with final KMeans model")
    mbkm.verbose = False
    d_classes = collections.defaultdict(set)
    d_uv_iter = parallel.parallel_map(lambda d: (d.uuid(), d.vector()),
                                      index,
                                      use_multiprocessing=False,
                                      name="uv-collector")
    # TODO: Batch predict call inputs to something larger than one at a time.
    d_uc_iter = parallel.parallel_map(
        lambda u_v: (u_v[0], mbkm.predict(u_v[1][numpy.newaxis, :])[0]),
        d_uv_iter,
        use_multiprocessing=False,
        name="uc-collector")
    pr = cli.ProgressReporter(log.debug, 1.0).start()
    for uuid, c in d_uc_iter:
        d_classes[c].add(uuid)
        pr.increment_report()
    pr.report()

    return d_classes
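
# Minimal usage sketch for ``mb_kmeans_build_apply`` above. The
# ``descriptor_index`` argument and the MiniBatchKMeans parameter values are
# hypothetical placeholders, not values prescribed by this module.
def _example_mb_kmeans_build_apply(descriptor_index):
    from sklearn.cluster import MiniBatchKMeans
    # Cluster into 256 centroids, fitting in batches of 1000 descriptors and
    # seeding the model with an initial fit over the first 10000 shuffled
    # descriptors.
    mbkm = MiniBatchKMeans(n_clusters=256, batch_size=1000, random_state=0)
    cluster_map = mb_kmeans_build_apply(descriptor_index, mbkm,
                                        initial_fit_size=10000)
    # cluster_map: dict mapping integer cluster label -> set of descriptor
    # UUIDs assigned to that cluster.
    return cluster_map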
def compute_hash_codes(uuids, index, functor, report_interval=1.0,
                       use_mp=False, ordered=False):
    """
    Given an iterable of DescriptorElement UUIDs, asynchronously access them
    from the given ``index``, asynchronously compute hash codes via
    ``functor`` and convert to an integer, yielding (UUID, hash-int) pairs.

    :param uuids: Sequence of UUIDs to process
    :type uuids: collections.Iterable[collections.Hashable]

    :param index: Descriptor index to pull from.
    :type index: smqtk.representation.descriptor_index.DescriptorIndex

    :param functor: LSH hash code functor instance
    :type functor: smqtk.algorithms.LshFunctor

    :param report_interval: Frequency in seconds at which we report speed and
        completion progress via logging. Reporting is disabled when logging
        is not at debug level or this value is less than or equal to 0.
    :type report_interval: float

    :param use_mp: If multiprocessing should be used for parallel
        computation vs. threading. Reminder: This will copy currently loaded
        objects onto worker processes (e.g. the given index), which could
        lead to dangerously high RAM consumption.
    :type use_mp: bool

    :param ordered: If the element-hash value pairs yielded are in the same
        order as element UUID values input. This function should be slightly
        faster when ordering is not required.
    :type ordered: bool

    :return: Generator instance yielding (UUID, int) value pairs.
    """
    # TODO: parallel map fetch elements from index?
    #       -> separately from compute

    def get_hash(u):
        v = index.get_descriptor(u).vector()
        return u, bits.bit_vector_to_int_large(functor.get_hash(v))

    # Setup log and reporting function
    log = logging.getLogger(__name__)

    if log.getEffectiveLevel() > logging.DEBUG or report_interval <= 0:
        def log_func(*_, **__):
            return
        log.debug("Not logging progress")
    else:
        log.debug("Logging progress at %f second intervals", report_interval)
        log_func = log.debug

    log.debug("Starting computation")
    reporter = cli.ProgressReporter(log_func, report_interval)
    reporter.start()
    for uuid, hash_int in parallel.parallel_map(get_hash, uuids,
                                                ordered=ordered,
                                                use_multiprocessing=use_mp):
        yield (uuid, hash_int)
        # Progress reporting
        reporter.increment_report()
    # Final report
    reporter.report()
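
# Minimal usage sketch for ``compute_hash_codes`` above. ``uuid_iterable``,
# ``descriptor_index`` and ``lsh_functor`` are hypothetical, pre-configured
# inputs (UUIDs present in the index, a DescriptorIndex instance and an
# LshFunctor instance respectively).
def _example_compute_hash_codes(uuid_iterable, descriptor_index, lsh_functor):
    # Collect a UUID -> integer hash code mapping; use threads and don't
    # require output ordering to match the input UUID order.
    uuid_to_hash = dict(compute_hash_codes(
        uuid_iterable, descriptor_index, lsh_functor,
        report_interval=2.0, use_mp=False, ordered=False,
    ))
    return uuid_to_hash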
def main():
    args = cli_parser().parse_args()
    config = cli.utility_main_helper(default_config, args)
    log = logging.getLogger(__name__)

    # - parallel_map UUIDs to load from the configured index
    # - classify iterated descriptors

    uuids_list_filepath = args.uuids_list
    output_csv_filepath = args.csv_data
    output_csv_header_filepath = args.csv_header
    classify_overwrite = config['utility']['classify_overwrite']

    p_use_multiprocessing = \
        config['utility']['parallel']['use_multiprocessing']
    p_index_extraction_cores = \
        config['utility']['parallel']['index_extraction_cores']
    p_classification_cores = \
        config['utility']['parallel']['classification_cores']

    if not uuids_list_filepath:
        raise ValueError("No uuids_list_filepath specified.")
    elif not os.path.isfile(uuids_list_filepath):
        raise ValueError("Given uuids_list_filepath did not point to a file.")
    if output_csv_header_filepath is None:
        raise ValueError("Need a path to save CSV header labels")
    if output_csv_filepath is None:
        raise ValueError("Need a path to save CSV data.")

    #
    # Initialize configured plugins
    #

    log.info("Initializing descriptor index")
    #: :type: smqtk.representation.DescriptorSet
    descriptor_set = from_config_dict(config['plugins']['descriptor_set'],
                                      DescriptorSet.get_impls())

    log.info("Initializing classification factory")
    c_factory = ClassificationElementFactory.from_config(
        config['plugins']['classification_factory'])

    log.info("Initializing classifier")
    #: :type: smqtk.algorithms.Classifier
    classifier = from_config_dict(config['plugins']['classifier'],
                                  Classifier.get_impls())

    #
    # Setup/Process
    #
    def iter_uuids():
        with open(uuids_list_filepath) as f:
            for l in f:
                yield l.strip()

    def descr_for_uuid(uuid):
        """
        :type uuid: collections.Hashable
        :rtype: smqtk.representation.DescriptorElement
        """
        return descriptor_set.get_descriptor(uuid)

    def classify_descr(d):
        """
        :type d: smqtk.representation.DescriptorElement
        :rtype: smqtk.representation.ClassificationElement
        """
        return classifier.classify_one_element(d, c_factory,
                                               classify_overwrite)

    log.info("Initializing uuid-to-descriptor parallel map")
    #: :type: collections.Iterable[smqtk.representation.DescriptorElement]
    element_iter = parallel.parallel_map(
        descr_for_uuid, iter_uuids(),
        use_multiprocessing=p_use_multiprocessing,
        cores=p_index_extraction_cores,
        name="descr_for_uuid",
    )

    log.info("Initializing descriptor-to-classification parallel map")
    #: :type: collections.Iterable[smqtk.representation.ClassificationElement]
    classification_iter = parallel.parallel_map(
        classify_descr, element_iter,
        use_multiprocessing=p_use_multiprocessing,
        cores=p_classification_cores,
        name='classify_descr',
    )

    #
    # Write/Output files
    #

    c_labels = classifier.get_labels()

    def make_row(e):
        """
        :type e: smqtk.representation.ClassificationElement
        """
        c_m = e.get_classification()
        return [e.uuid] + [c_m[l] for l in c_labels]

    # column labels file
    log.info("Writing CSV column header file: %s",
             output_csv_header_filepath)
    safe_create_dir(os.path.dirname(output_csv_header_filepath))
    with open(output_csv_header_filepath, 'wb') as f_csv:
        w = csv.writer(f_csv)
        w.writerow(['uuid'] + [str(cl) for cl in c_labels])

    # CSV file
    log.info("Writing CSV data file: %s", output_csv_filepath)
    safe_create_dir(os.path.dirname(output_csv_filepath))
    pr = cli.ProgressReporter(log.info, 1.0)
    pr.start()
    with open(output_csv_filepath, 'wb') as f_csv:
        w = csv.writer(f_csv)
        for c in classification_iter:
            w.writerow(make_row(c))
            pr.increment_report()
    pr.report()

    log.info("Done")
def classify_async(self, d_iter, factory=DFLT_CLASSIFIER_FACTORY,
                   overwrite=False, procs=None, use_multiprocessing=False,
                   ri=None):
    """
    Asynchronously classify the DescriptorElements in the given iterable.

    :param d_iter: Iterable of DescriptorElements
    :type d_iter:
        collections.Iterable[smqtk.representation.DescriptorElement]

    :param factory: Classifier element factory to use for element
        generation. The default factory yields MemoryClassificationElement
        instances.
    :type factory: smqtk.representation.ClassificationElementFactory

    :param overwrite: Recompute classification of the input descriptor and
        set the results to the ClassificationElement produced by the
        factory.
    :type overwrite: bool

    :param procs: Explicit number of cores/thread/processes to use.
    :type procs: None | int

    :param use_multiprocessing: Use multiprocessing instead of threading.
    :type use_multiprocessing: bool

    :param ri: Progress reporting interval in seconds. Set to a value > 0 to
        enable. Disabled by default.
    :type ri: float | None

    :return: Mapping of input DescriptorElement instances to the computed
        ClassificationElement. ClassificationElement UUIDs are congruent
        with the UUID of the DescriptorElement.
    :rtype: dict[smqtk.representation.DescriptorElement,
                 smqtk.representation.ClassificationElement]
    """
    self._log.debug("Async classifying descriptors")
    ri = ri and ri > 0 and ri

    def work(d_elem):
        return d_elem, self.classify(d_elem, factory, overwrite)

    classifications = parallel.parallel_map(
        work, d_iter,
        cores=procs,
        ordered=False,
        use_multiprocessing=use_multiprocessing,
    )

    pr = None
    if ri:
        pr = cli.ProgressReporter(self._log.debug, ri).start()

    d2c_map = {}
    for d, c in classifications:
        d2c_map[d] = c
        pr and pr.increment_report()
    pr and pr.report()

    return d2c_map
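
# Minimal usage sketch for ``classify_async`` above, written as a standalone
# function for illustration. ``classifier`` is a hypothetical instance of the
# class defining this method and ``descriptor_elems`` an iterable of
# DescriptorElements; the default in-memory classification factory is used.
def _example_classify_async(classifier, descriptor_elems):
    d2c_map = classifier.classify_async(
        descriptor_elems,
        overwrite=False,            # reuse existing classifications if present
        procs=None,                 # let the parallel map pick a core count
        use_multiprocessing=False,  # threads are enough for light workloads
        ri=1.0,                     # report progress roughly once per second
    )
    # d2c_map: dict mapping each input DescriptorElement to its computed
    # ClassificationElement.
    return d2c_map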