def test_nested_threading(self):
    """
    Chain four ordered, threaded ``parallel_map`` stages and check that the
    last stage yields the ordinal of every character of the test string.
    """
    # Pipeline: char -> char -> ord -> char -> ord
    threaded = dict(ordered=True, use_multiprocessing=False, cores=2)
    stage1 = parallel_map(lambda e: e, self.test_string, name='g1',
                          **threaded)
    stage2 = parallel_map(ord, stage1, name='g2', **threaded)
    stage3 = parallel_map(chr, stage2, name='g3', **threaded)
    stage4 = parallel_map(ord, stage3, name='g4', **threaded)

    expected = list(map(ord, self.test_string))
    self.assertEqual(list(stage4), expected)
def test_nested_multiprocessing(self):
    """
    Chain four ordered, multiprocessing ``parallel_map`` stages and check
    that the last stage yields the ordinal of every character of the test
    string.
    """
    # Pipeline: char -> char -> ord -> char -> ord
    multiproc = dict(ordered=True, use_multiprocessing=True, cores=2)
    stage1 = parallel_map(lambda e: e, self.test_string, **multiproc)
    stage2 = parallel_map(ord, stage1, **multiproc)
    stage3 = parallel_map(chr, stage2, **multiproc)
    stage4 = parallel_map(ord, stage3, **multiproc)

    expected = list(map(ord, self.test_string))
    self.assertEqual(list(stage4), expected)
def test_nested_threading(self):
    """
    Chain four ordered, threaded ``parallel_map`` stages and check that the
    last stage yields the ordinal of every character of the test string.
    """
    # char -> char -> ord -> char -> ord
    g1 = parallel_map(lambda e: e, self.test_string,
                      ordered=True,
                      use_multiprocessing=False,
                      cores=2,
                      name='g1')
    g2 = parallel_map(ord, g1,
                      ordered=True,
                      use_multiprocessing=False,
                      cores=2,
                      name='g2')
    g3 = parallel_map(chr, g2,
                      ordered=True,
                      use_multiprocessing=False,
                      cores=2,
                      name='g3')
    g4 = parallel_map(ord, g3,
                      ordered=True,
                      use_multiprocessing=False,
                      cores=2,
                      name='g4')

    # ``map`` returns an iterator on Python 3, which never compares equal to
    # a list; materialize it so the comparison is list-to-list on both
    # Python 2 and Python 3.
    expected = list(map(ord, self.test_string))
    nose.tools.assert_equal(list(g4), expected)
def iter_valid_elements(dataElementUris, valid_content_types):
    """
    Find the GirderDataElements which are loadable images and valid according
    to valid_content_types.

    :param dataElementUris: A list of Girder Data Element URIs.
    :param valid_content_types: A list of valid content types, generally
        passed by a descriptor generator.
    :returns: A generator over valid GirderDataElements.
    :rtype: generator
    """
    def is_valid(dataElementUri):
        # Returns the element itself when valid, ``False`` otherwise, so the
        # results can be filtered on truthiness below.
        dfe = GirderDataElement.from_uri(dataElementUri)

        if is_valid_element(dfe,
                            valid_content_types=valid_content_types,
                            check_image=True):
            return dfe
        return False

    checked = parallel.parallel_map(is_valid, dataElementUris,
                                    use_multiprocessing=False)
    # ``itertools.ifilter`` only exists on Python 2.  A generator expression
    # is equally lazy, filters on truthiness the same way, and works on both
    # Python 2 and Python 3.
    return (element for element in checked if element)
def test_exception_handing_multiprocess(self):
    """
    Consuming a multiprocessing ``parallel_map`` whose work function raises
    ``RuntimeError`` should itself raise ``RuntimeError``.
    """
    def raise_ex(_):
        raise RuntimeError("Expected exception")

    failing_iter = parallel_map(raise_ex, [1], use_multiprocessing=True)
    # ``list`` drains the iterator, which is where the error is expected.
    nose.tools.assert_raises(RuntimeError, list, failing_iter)
def iter_valid_elements():
    """
    Yield a ``DataFileElement`` for every path in ``file_paths`` that passes
    ``is_valid_element``, optionally adding the yielded elements to
    ``data_set`` in batches.

    Relies on names from the enclosing scope: ``file_paths``, ``generator``,
    ``check_image``, ``data_set``, ``batch_size`` and ``log``.
    """
    def is_valid(file_path):
        # Returns the element when valid, ``False`` otherwise, so results can
        # be filtered on truthiness.
        e = DataFileElement(file_path)

        if is_valid_element(
                e, valid_content_types=generator.valid_content_types(),
                check_image=check_image):
            return e
        return False

    data_elements: Deque[DataFileElement] = collections.deque()
    valid_files_filter = parallel.parallel_map(is_valid,
                                               file_paths,
                                               name="check-file-type",
                                               use_multiprocessing=True)
    for dfe in valid_files_filter:
        if not dfe:
            continue
        yield dfe
        if data_set is not None:
            data_elements.append(dfe)
            # Flush a full batch to the data set as soon as it is collected.
            if batch_size and len(data_elements) == batch_size:
                log.debug(
                    "Adding data element batch to set (size: %d)",
                    len(data_elements))
                data_set.add_data(*data_elements)
                data_elements.clear()
    # elements only collected if we have a data-set configured, so add any
    # still in the deque to the set
    if data_set is not None and data_elements:
        # Fixed: the message previously lacked its closing parenthesis.
        log.debug("Adding data elements to set (size: %d)",
                  len(data_elements))
        data_set.add_data(*data_elements)
def _get_many_vectors(cls, descriptors):
    """
    Internal hook, overridable by subclasses, that retrieves the vectors of
    many descriptors at once.

    :note: Results are *not* guaranteed to come back in request order, and a
        missing vector may be reported as None or left out altogether.  The
        wrapper function `get_many_vectors` is responsible for re-ordering
        and for inserting None for missing values.

    :param descriptors: Iterable of descriptors to query for.
    :type descriptors: collections.Iterable[
        smqtk.representation.descriptor_element.DescriptorElement]

    :return: Iterator of (descriptor uuid, vector) tuples, where the vector
        is None when the descriptor has no associated vector.
    :rtype: collections.Iterable[
        tuple[collections.Hashable, Union[numpy.ndarray, None]]]
    """
    pair_iter = parallel_map(_uuid_and_vector_from_descriptor, descriptors,
                             name='retrieve_vectors')
    for pair in pair_iter:
        yield pair
def test_exception_handing_threaded(self):
    """
    Consuming a threaded ``parallel_map`` whose work function raises
    ``RuntimeError`` should itself raise ``RuntimeError``.
    """
    def raise_ex(_):
        raise RuntimeError("Expected exception")

    failing_iter = parallel_map(raise_ex, [1], use_multiprocessing=False)
    # ``list`` drains the iterator, which is where the error is expected.
    self.assertRaises(RuntimeError, list, failing_iter)
def parallel_iter_vectors(descriptors):
    """
    Return a threaded ``parallel_map`` over the ``vector()`` result of each
    given descriptor.  The order of the results is not a concern here.
    """
    def fetch_vector(descr):
        return descr.vector()

    return parallel.parallel_map(fetch_vector, descriptors,
                                 use_multiprocessing=False)
def test_simple_unordered_threaded(self):
    """
    An unordered, threaded map must produce the expected values; order is
    ignored by comparing as sets.
    """
    mapped = parallel_map(self.test_func, self.test_string,
                          ordered=False, use_multiprocessing=False)
    results = list(mapped)
    self.assertEqual(set(results), set(self.expected))
def test_simple_ordered_multiprocess(self):
    """
    An ordered, multiprocessing map must produce the expected values in
    input order.
    """
    mapped = parallel_map(self.test_func, self.test_string,
                          ordered=True, use_multiprocessing=True)
    results = list(mapped)
    self.assertEqual(results, self.expected)
def test_simple_unordered_multiprocess(self):
    """
    An unordered, multiprocessing map must produce the expected values;
    order is ignored by comparing as sets.
    """
    mapped = parallel_map(self.test_func, self.test_string,
                          ordered=False, use_multiprocessing=True)
    results = list(mapped)
    nose.tools.assert_equal(set(results), set(self.expected))
def iter_valid_elements():
    """
    Yield every non-``None`` result of applying ``is_valid_element`` to the
    enclosing scope's ``file_paths`` through a multiprocessing
    ``parallel_map``.
    """
    checked = parallel.parallel_map(is_valid_element, file_paths,
                                    name="check-file-type",
                                    use_multiprocessing=True)
    for candidate in checked:
        if candidate is None:
            continue
        yield candidate
def test_simple_ordered_threaded(self):
    """
    An ordered, threaded map must produce the expected values in input
    order.
    """
    mapped = parallel_map(self.test_func, self.test_string,
                          ordered=True, use_multiprocessing=False)
    # Results must still be in the order requested.
    results = list(mapped)
    nose.tools.assert_equal(results, self.expected)
def iter_valid_elements():
    """
    Yield every non-``None`` result of applying ``is_valid_element`` to the
    enclosing scope's ``file_paths`` through a multiprocessing
    ``parallel_map``.
    """
    checked = parallel.parallel_map(is_valid_element, file_paths,
                                    name="check-file-type",
                                    use_multiprocessing=True)
    for candidate in checked:
        if candidate is None:
            continue
        yield candidate
def compute_descriptor_async(self, data_iter,
                             descr_factory=DFLT_DESCRIPTOR_FACTORY,
                             overwrite=False, procs=None, **kwds):
    """
    Asynchronously compute feature data for multiple data items.

    Additional keyword arguments understood by this base implementation:

        use_mp [= False]
            Use multi-processing instead of multi-threading.

    :param data_iter: Iterable of data elements to compute features for.
        These must have UIDs assigned for feature association in the return
        value.
    :type data_iter: collections.Iterable[smqtk.representation.DataElement]

    :param descr_factory: Factory instance used to produce the wrapping
        descriptor element instances.  The default factory produces
        ``DescriptorMemoryElement`` instances.
    :type descr_factory: smqtk.representation.DescriptorElementFactory

    :param overwrite: Whether or not to force re-computation of descriptor
        vectors for the given data even when precomputed vectors exist in
        the DescriptorElements generated by the provided factory.  This
        overwrites persistently stored vectors if the factory produces a
        DescriptorElement implementation with such storage.
    :type overwrite: bool

    :param procs: Optional specification of how many processors to use when
        pooling sub-tasks.  If None, we attempt to use all available cores.
    :type procs: int | None

    :raises ValueError: An input DataElement was of a content type that we
        cannot handle.

    :return: Mapping of input DataElement UUIDs to the computed descriptor
        element for that data.  DescriptorElement UUIDs are congruent with
        the UUID of the data element they describe.
    :rtype: dict[collections.Hashable,
                 smqtk.representation.DescriptorElement]

    """
    self._log.info("Async compute features")

    def compute_pair(element):
        # Keep the source element beside its descriptor so the result can be
        # keyed on the element's UUID afterwards.
        return element, self.compute_descriptor(element, descr_factory,
                                                overwrite)

    pair_iter = parallel_map(
        compute_pair, data_iter,
        cores=procs,
        ordered=False,
        use_multiprocessing=kwds.get('use_mp', False),
    )
    return {element.uuid(): descr for element, descr in pair_iter}
def compute_descriptor_async(self, data_iter,
                             descr_factory=DFLT_DESCRIPTOR_FACTORY,
                             overwrite=False, procs=None, **kwds):
    """
    Asynchronously compute feature data for multiple data items.

    Additional keyword arguments understood by this base implementation:

        use_mp [= False]
            Use multi-processing instead of multi-threading.

    :param data_iter: Iterable of data elements to compute features for.
        These must have UIDs assigned for feature association in the return
        value.
    :type data_iter: collections.Iterable[smqtk.representation.DataElement]

    :param descr_factory: Factory instance used to produce the wrapping
        descriptor element instances.  The default factory produces
        ``DescriptorMemoryElement`` instances.
    :type descr_factory: smqtk.representation.DescriptorElementFactory

    :param overwrite: Whether or not to force re-computation of descriptor
        vectors for the given data even when precomputed vectors exist in
        the DescriptorElements generated by the provided factory.  This
        overwrites persistently stored vectors if the factory produces a
        DescriptorElement implementation with such storage.
    :type overwrite: bool

    :param procs: Optional specification of how many processors to use when
        pooling sub-tasks.  If None, we attempt to use all available cores.
    :type procs: int | None

    :raises ValueError: An input DataElement was of a content type that we
        cannot handle.

    :return: Mapping of input DataElement UUIDs to the computed descriptor
        element for that data.  DescriptorElement UUIDs are congruent with
        the UUID of the data element they describe.
    :rtype: dict[collections.Hashable,
                 smqtk.representation.DescriptorElement]

    """
    self._log.info("Async compute features")

    def compute_pair(element):
        # Keep the source element beside its descriptor so the result can be
        # keyed on the element's UUID afterwards.
        return element, self.compute_descriptor(element, descr_factory,
                                                overwrite)

    pair_iter = parallel_map(
        compute_pair, data_iter,
        cores=procs,
        ordered=False,
        use_multiprocessing=kwds.get('use_mp', False),
    )
    return {element.uuid(): descr for element, descr in pair_iter}
def test_exception_handing_threaded(self):
    """
    Consuming a threaded ``parallel_map`` whose work function raises
    ``RuntimeError`` should itself raise ``RuntimeError``.
    """
    def raise_ex(_):
        raise RuntimeError("Expected exception")

    failing_iter = parallel_map(raise_ex, [1], use_multiprocessing=False)
    # ``list`` drains the iterator, which is where the error is expected.
    nose.tools.assert_raises(RuntimeError, list, failing_iter)
def test_exception_handing_multiprocess(self):
    """
    Consuming a multiprocessing ``parallel_map`` whose work function raises
    ``RuntimeError`` should itself raise ``RuntimeError``.
    """
    def raise_ex(_):
        raise RuntimeError("Expected exception")

    failing_iter = parallel_map(raise_ex, [1], use_multiprocessing=True)
    # ``list`` drains the iterator, which is where the error is expected.
    self.assertRaises(RuntimeError, list, failing_iter)
def test_multisequence(self):
    """
    Mapping a three-argument function over three equal-length sequences
    should give the element-wise sums.
    """
    def add3(a, b, c):
        return a + b + c

    ones, twos, threes = [1] * 10, [2] * 10, [3] * 10
    results = list(parallel_map(add3, ones, twos, threes,
                                use_multiprocessing=False))

    nose.tools.assert_equal(results, [6] * 10)
def test_multisequence(self):
    """
    Mapping a three-argument function over three equal-length sequences
    should give the element-wise sums.
    """
    def add3(a, b, c):
        return a + b + c

    ones, twos, threes = [1] * 10, [2] * 10, [3] * 10
    results = list(parallel_map(add3, ones, twos, threes,
                                use_multiprocessing=False))

    self.assertEqual(results, [6] * 10)
def test_multisequence_short_cutoff(self):
    """
    Without ``fill_void``, the mapping should stop at the end of the
    shortest input sequence (length 4 here).
    """
    def add3(a, b, c):
        return a + b + c

    ones, short_twos, threes = [1] * 10, [2] * 4, [3] * 10
    results = list(parallel_map(add3, ones, short_twos, threes,
                                use_multiprocessing=False,
                                ordered=True))

    nose.tools.assert_equal(results, [6] * 4)
def test_multisequence_short_cutoff(self):
    """
    Without ``fill_void``, the mapping should stop at the end of the
    shortest input sequence (length 4 here).
    """
    def add3(a, b, c):
        return a + b + c

    ones, short_twos, threes = [1] * 10, [2] * 4, [3] * 10
    results = list(parallel_map(add3, ones, short_twos, threes,
                                use_multiprocessing=False,
                                ordered=True))

    self.assertEqual(results, [6] * 4)
def build_index(self, descriptors):
    """
    Build the index from the given iterable of descriptor elements.

    Every call resets the cached state first, so calling this again
    rebuilds the index instead of adding to it.

    NOTE(review): earlier documentation listed a ``ValueError`` for an
    empty iterable, but nothing in this method raises one -- confirm the
    intended contract.

    :param descriptors: Iterable of descriptor elements to build index over.
    :type descriptors:
        collections.Iterable[smqtk.representation.DescriptorElement]

    """
    # Ordered cache of the descriptors in our index.
    self._descr_cache = []
    # Reverse mapping of a descriptor's vector (as a tuple) to its position
    # in the cache, and subsequently in the distance kernel.
    self._descr2index = {}
    # Rows for the distance kernel matrix; a plain list until converted to
    # an array after iteration.
    self._descr_matrix = []

    def pair_with_vector(element):
        return element, element.vector()

    # noinspection PyTypeChecker
    pair_iter = parallel_map(pair_with_vector, descriptors,
                             name='vector_iter',
                             use_multiprocessing=self.multiprocess_fetch,
                             cores=self.cores,
                             ordered=True)
    for position, (element, vec) in enumerate(pair_iter):
        self._descr_cache.append(element)
        # ``_descr_matrix`` is a list, currently.
        # noinspection PyUnresolvedReferences
        self._descr_matrix.append(vec)
        self._descr2index[tuple(vec)] = position
    self._descr_matrix = numpy.array(self._descr_matrix)
    # TODO: (?) For when we optimize SVM SV kernel computation
    # self._dist_kernel = \
    #    compute_distance_kernel(self._descr_matrix,
    #                            histogram_intersection_distance2,
    #                            row_wise=True)

    # Persist the descriptor cache when a cache file path is configured.
    if self.descr_cache_fp:
        with open(self.descr_cache_fp, 'wb') as cache_file:
            pickle.dump(self._descr_cache, cache_file, -1)
def test_multisequence_fill_void(self):
    """
    With ``fill_void=10``, the exhausted short sequence should contribute 10
    for the remaining positions (1 + 10 + 3 == 14).
    """
    def add3(a, b, c):
        return a + b + c

    ones, short_twos, threes = [1] * 10, [2] * 4, [3] * 10
    results = list(parallel_map(add3, ones, short_twos, threes,
                                use_multiprocessing=False,
                                fill_void=10,
                                ordered=True))

    nose.tools.assert_equal(results, [6] * 4 + [14] * 6)
def test_multisequence_fill_void(self):
    """
    With ``fill_void=10``, the exhausted short sequence should contribute 10
    for the remaining positions (1 + 10 + 3 == 14).
    """
    def add3(a, b, c):
        return a + b + c

    ones, short_twos, threes = [1] * 10, [2] * 4, [3] * 10
    results = list(parallel_map(add3, ones, short_twos, threes,
                                use_multiprocessing=False,
                                fill_void=10,
                                ordered=True))

    self.assertEqual(results, [6] * 4 + [14] * 6)
def build_index(self, descriptors):
    """
    Build the index from the given iterable of descriptor elements.

    Every call resets the cached state first, so calling this again
    rebuilds the index instead of adding to it.

    NOTE(review): earlier documentation listed a ``ValueError`` for an
    empty iterable, but nothing in this method raises one -- confirm the
    intended contract.

    :param descriptors: Iterable of descriptor elements to build index over.
    :type descriptors:
        collections.Iterable[smqtk.representation.DescriptorElement]

    """
    # Ordered cache of the descriptors in our index.
    self._descr_cache = []
    # Reverse mapping of a descriptor's vector (as a tuple) to its position
    # in the cache, and subsequently in the distance kernel.
    self._descr2index = {}
    # Rows for the distance kernel matrix; a plain list until converted to
    # an array after iteration.
    self._descr_matrix = []

    def pair_with_vector(element):
        return element, element.vector()

    # noinspection PyTypeChecker
    pair_iter = parallel_map(pair_with_vector, descriptors,
                             name='vector_iter',
                             use_multiprocessing=self.multiprocess_fetch,
                             cores=self.cores,
                             ordered=True)
    for position, (element, vec) in enumerate(pair_iter):
        self._descr_cache.append(element)
        # ``_descr_matrix`` is a list, currently.
        # noinspection PyUnresolvedReferences
        self._descr_matrix.append(vec)
        self._descr2index[tuple(vec)] = position
    self._descr_matrix = numpy.array(self._descr_matrix)
    # TODO: (?) For when we optimize SVM SV kernel computation
    # self._dist_kernel = \
    #    compute_distance_kernel(self._descr_matrix,
    #                            histogram_intersection_distance2,
    #                            row_wise=True)

    # Persist the descriptor cache when a cache file path is configured.
    if self.descr_cache_fp:
        with open(self.descr_cache_fp, 'wb') as cache_file:
            pickle.dump(self._descr_cache, cache_file, -1)
def test_nested_multiprocessing(self):
    """
    Chain four ordered, multiprocessing ``parallel_map`` stages and check
    that the last stage yields the ordinal of every character of the test
    string.
    """
    # char -> char -> ord -> char -> ord
    g1 = parallel_map(lambda e: e, self.test_string,
                      ordered=True,
                      use_multiprocessing=True,
                      cores=2)
    g2 = parallel_map(ord, g1,
                      ordered=True,
                      use_multiprocessing=True,
                      cores=2)
    g3 = parallel_map(chr, g2,
                      ordered=True,
                      use_multiprocessing=True,
                      cores=2)
    g4 = parallel_map(ord, g3,
                      ordered=True,
                      use_multiprocessing=True,
                      cores=2)

    # ``map`` returns an iterator on Python 3, which never compares equal to
    # a list; materialize it so the comparison is list-to-list on both
    # Python 2 and Python 3.
    expected = list(map(ord, self.test_string))
    nose.tools.assert_equal(list(g4), expected)
def main():
    """
    Command-line entry point: read image paths from the given file list,
    check each one in parallel, and print ``<filepath>,<uuid>`` for every
    existing file whose validity matches the selection (valid files by
    default, invalid ones when ``--invert`` is set).

    Exits with status 1 after printing help when no arguments are given, and
    with status 103 when the file list path does not exist.
    """
    # Print help and exit if no arguments were passed
    if len(sys.argv) == 1:
        get_cli_parser().print_help()
        sys.exit(1)

    args = get_cli_parser().parse_args()

    llevel = logging.DEBUG if args.verbose else logging.INFO
    initialize_logging(logging.getLogger('smqtk'), llevel)
    initialize_logging(logging.getLogger('__main__'), llevel)

    log = logging.getLogger(__name__)
    log.debug('Showing debug messages.')

    if args.file_list is not None and not os.path.exists(args.file_list):
        log.error('Invalid file list path: %s', args.file_list)
        # ``sys.exit`` instead of the site-provided ``exit`` helper, which is
        # not guaranteed to exist in every interpreter configuration.
        sys.exit(103)

    def check_image(image_path):
        # Returns ``(is_valid, element)``; both are False for a missing path.
        if not os.path.exists(image_path):
            # ``Logger.warn`` is a deprecated alias of ``Logger.warning``.
            log.warning('Invalid image path given (does not exist): %s',
                        image_path)
            return False, False
        d = DataFileElement(image_path)
        return is_valid_element(d, check_image=True), d

    with open(args.file_list) as infile:
        checked_images = parallel.parallel_map(check_image,
                                               map(str.strip, infile),
                                               name='check-image-validity',
                                               use_multiprocessing=True)

        for is_valid, dfe in checked_images:
            # ``dfe`` is False when the listed path did not exist; skip it.
            if not dfe:
                continue
            # Print when validity and the invert flag disagree: valid files
            # normally, invalid files when inverted.
            if bool(is_valid) != bool(args.invert):
                # We know the callback above is creating DataFileElement
                # instances.
                # noinspection PyProtectedMember
                print('%s,%s' % (dfe._filepath, dfe.uuid()))
def iter_valid_elements(dataElementUris, valid_content_types):
    """
    Find the GirderDataElements which are loadable images and valid according
    to valid_content_types.

    :param dataElementUris: A list of Girder Data Element URIs.
    :param valid_content_types: A list of valid content types, generally
        passed by a descriptor generator.
    :returns: A generator over valid GirderDataElements.
    :rtype: generator
    """
    def is_valid(dataElementUri):
        # Returns the element itself when valid, ``False`` otherwise, so the
        # results can be filtered on truthiness below.
        dfe = GirderDataElement.from_uri(dataElementUri)

        if is_valid_element(dfe,
                            valid_content_types=valid_content_types,
                            check_image=True):
            return dfe
        return False

    checked = parallel.parallel_map(is_valid, dataElementUris,
                                    use_multiprocessing=False)
    # ``itertools.ifilter`` only exists on Python 2.  A generator expression
    # is equally lazy, filters on truthiness the same way, and works on both
    # Python 2 and Python 3.
    return (element for element in checked if element)
while i < total: b_start = i b_end = i + batch_size for h in q[b_start:b_end].execute(): # noinspection PyProtectedMember yield h.meta._d_ i += 1 except elasticsearch.ConnectionTimeout, ex: log.warning("ElasticSearch timed out (error = %s)", str(ex)) restart = True log.debug("Restarting query from index %d", i) log.info("Initializing image download/record parallel iterator") img_dl_records = parallel_map( dl_image, iter_scan_meta(), name='image_download', use_multiprocessing=True, cores=cores ) # Write out log.info("Starting iteration/file-write") rp_state = [0] * 7 with open(scan_record, 'w') as record_file: for r in img_dl_records: if r is not None: cdr_id, local_path, uuid = r record_file.write('%s,%s,%s\n' % (cdr_id, local_path, uuid)) report_progress(log.debug, rp_state, 1.0) # Final report rp_state[1] -= 1
def main():
    """
    Command-line entry point: load descriptors for a list of UUIDs from the
    configured descriptor index, classify them with the configured
    classifier, and write the label header and per-UUID confidences to two
    CSV files.
    """
    description = """
    Script for asynchronously computing classifications for DescriptorElements
    in a DescriptorIndex specified via a list of UUIDs. Results are output to a
    CSV file in the format:

        uuid, label1_confidence, label2_confidence, ...

    CSV columns labels are output to the given CSV header file path. Label
    columns will be in the order as reported by the classifier implementations
    ``get_labels`` method.

    Due to using an input file-list of UUIDs, we require that the UUIDs of
    indexed descriptors be strings, or equality comparable to the UUIDs' string
    representation.
    """

    args, config = bin_utils.utility_main_helper(
        default_config,
        description,
        extend_parser,
    )
    log = logging.getLogger(__name__)

    # - parallel_map UUIDs to load from the configured index
    # - classify iterated descriptors

    uuids_list_filepath = args.uuids_list
    output_csv_filepath = args.csv_data
    output_csv_header_filepath = args.csv_header

    classify_overwrite = config['utility']['classify_overwrite']

    parallel_cfg = config['utility']['parallel']
    p_use_multiprocessing = parallel_cfg['use_multiprocessing']
    p_index_extraction_cores = parallel_cfg['index_extraction_cores']
    p_classification_cores = parallel_cfg['classification_cores']

    # Validate required inputs before initializing any plugins.
    if not uuids_list_filepath:
        raise ValueError("No uuids_list_filepath specified.")
    if not os.path.isfile(uuids_list_filepath):
        raise ValueError("Given uuids_list_filepath did not point to a file.")
    if output_csv_header_filepath is None:
        raise ValueError("Need a path to save CSV header labels")
    if output_csv_filepath is None:
        raise ValueError("Need a path to save CSV data.")

    #
    # Initialize configured plugins
    #

    log.info("Initializing descriptor index")
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(
        config['plugins']['descriptor_index'],
        get_descriptor_index_impls()
    )

    log.info("Initializing classification factory")
    c_factory = ClassificationElementFactory.from_config(
        config['plugins']['classification_factory']
    )

    log.info("Initializing classifier")
    #: :type: smqtk.algorithms.Classifier
    classifier = plugin.from_plugin_config(
        config['plugins']['classifier'],
        get_classifier_impls()
    )

    #
    # Setup/Process
    #
    def iter_uuids():
        # One UUID per line of the input list, surrounding whitespace removed.
        with open(uuids_list_filepath) as uuid_file:
            for line in uuid_file:
                yield line.strip()

    def descr_for_uuid(uuid):
        """
        :type uuid: collections.Hashable
        :rtype: smqtk.representation.DescriptorElement
        """
        return descriptor_index.get_descriptor(uuid)

    def classify_descr(d):
        """
        :type d: smqtk.representation.DescriptorElement
        :rtype: smqtk.representation.ClassificationElement
        """
        return classifier.classify(d, c_factory, classify_overwrite)

    log.info("Initializing uuid-to-descriptor parallel map")
    #: :type: collections.Iterable[smqtk.representation.DescriptorElement]
    element_iter = parallel.parallel_map(
        descr_for_uuid, iter_uuids(),
        use_multiprocessing=p_use_multiprocessing,
        cores=p_index_extraction_cores,
        name="descr_for_uuid",
    )

    log.info("Initializing descriptor-to-classification parallel map")
    #: :type: collections.Iterable[smqtk.representation.ClassificationElement]
    classification_iter = parallel.parallel_map(
        classify_descr, element_iter,
        use_multiprocessing=p_use_multiprocessing,
        cores=p_classification_cores,
        name='classify_descr',
    )

    #
    # Write/Output files
    #

    c_labels = classifier.get_labels()

    def make_row(c):
        """
        :type c: smqtk.representation.ClassificationElement
        """
        # UUID first, then one confidence per label in ``c_labels`` order.
        confidences = c.get_classification()
        return [c.uuid] + [confidences[label] for label in c_labels]

    # column labels file
    log.info("Writing CSV column header file: %s", output_csv_header_filepath)
    file_utils.safe_create_dir(os.path.dirname(output_csv_header_filepath))
    with open(output_csv_header_filepath, 'wb') as f_csv:
        header_writer = csv.writer(f_csv)
        header_writer.writerow(['uuid'] + c_labels)

    # CSV file
    log.info("Writing CSV data file: %s", output_csv_filepath)
    file_utils.safe_create_dir(os.path.dirname(output_csv_filepath))
    r_state = [0] * 7
    with open(output_csv_filepath, 'wb') as f_csv:
        data_writer = csv.writer(f_csv)
        for classification in classification_iter:
            data_writer.writerow(make_row(classification))
            bin_utils.report_progress(log.info, r_state, 1.0)

    # Final report
    r_state[1] -= 1
    bin_utils.report_progress(log.info, r_state, 0)

    log.info("Done")
def main():
    """
    Command-line entry point for validating (or training) a classifier.

    Loads truth-labeled descriptors listed in the configured CSV file.  When
    training is enabled, trains the classifier and exits.  Otherwise it
    classifies the descriptors, logs a confusion matrix and writes the
    optional outputs: confusion-matrix plot, UUID confusion matrix JSON, and
    PR / ROC curve plots.

    :raises ValueError: Training was requested but the configured classifier
        is not a ``SupervisedClassifier``.
    """
    args = cli_parser().parse_args()
    config = bin_utils.utility_main_helper(default_config, args)
    log = logging.getLogger(__name__)

    #
    # Initialize stuff from configuration
    #
    #: :type: smqtk.algorithms.Classifier
    classifier = plugin.from_plugin_config(
        config['plugins']['classifier'],
        get_classifier_impls()
    )
    #: :type: ClassificationElementFactory
    classification_factory = ClassificationElementFactory.from_config(
        config['plugins']['classification_factory']
    )
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(
        config['plugins']['descriptor_index'],
        get_descriptor_index_impls()
    )

    uuid2label_filepath = config['utility']['csv_filepath']
    do_train = config['utility']['train']
    output_uuid_cm = config['utility']['output_uuid_confusion_matrix']
    plot_filepath_pr = config['utility']['output_plot_pr']
    plot_filepath_roc = config['utility']['output_plot_roc']
    plot_filepath_cm = config['utility']['output_plot_confusion_matrix']
    plot_ci = config['utility']['curve_confidence_interval']
    plot_ci_alpha = config['utility']['curve_confidence_interval_alpha']

    #
    # Construct mapping of label to the DescriptorElement instances for that
    # described by that label.
    #
    log.info("Loading descriptors by UUID")

    def iter_uuid_label():
        """ Iterate through UUIDs in specified file """
        with open(uuid2label_filepath) as uuid2label_file:
            reader = csv.reader(uuid2label_file)
            for r in reader:
                # TODO: This will need to be updated to handle multiple labels
                #       per descriptor.
                yield r[0], r[1]

    def get_descr(r):
        """ Fetch descriptors from configured index """
        uuid, truth_label = r
        return truth_label, descriptor_index.get_descriptor(uuid)

    tlabel_element_iter = parallel.parallel_map(
        get_descr, iter_uuid_label(),
        name="cmv_get_descriptors",
        use_multiprocessing=True,
        cores=config['parallelism']['descriptor_fetch_cores'],
    )

    # Map of truth labels to descriptors of labeled data
    #: :type: dict[str, list[smqtk.representation.DescriptorElement]]
    tlabel2descriptors = {}
    for tlabel, d in tlabel_element_iter:
        tlabel2descriptors.setdefault(tlabel, []).append(d)

    # Train the classifier when training is enabled; this requires a
    # SupervisedClassifier.  The script exits after a successful training.
    if do_train:
        if isinstance(classifier, SupervisedClassifier):
            log.info("Training classifier model")
            classifier.train(tlabel2descriptors)
            exit(0)
        else:
            # Fixed: the exception used to be constructed without ``raise``,
            # so the misconfiguration was silently ignored.
            raise ValueError("Configured classifier is not a "
                             "SupervisedClassifier type and does not support "
                             "training.")

    #
    # Apply classifier to descriptors for predictions
    #

    # Truth label to predicted classification results
    #: :type: dict[str, set[smqtk.representation.ClassificationElement]]
    tlabel2classifications = {}
    for tlabel, descriptors in six.iteritems(tlabel2descriptors):
        tlabel2classifications[tlabel] = \
            set(classifier.classify_async(
                descriptors, classification_factory,
                use_multiprocessing=True,
                procs=config['parallelism']['classification_cores'],
                ri=1.0,
            ).values())
    log.info("Truth label counts:")
    for label in sorted(tlabel2classifications):
        log.info("  %s :: %d", label, len(tlabel2classifications[label]))

    #
    # Confusion Matrix
    #
    conf_mat, labels = gen_confusion_matrix(tlabel2classifications)
    log.info("Confusion_matrix")
    log_cm(log.info, conf_mat, labels)
    if plot_filepath_cm:
        plot_cm(conf_mat, labels, plot_filepath_cm)

    # Confusion Matrix of descriptor UUIDs to output json
    if output_uuid_cm:
        # Top dictionary keys are true labels, inner dictionary keys are UUID
        # predicted labels.
        log.info("Computing UUID Confusion Matrix")
        #: :type: dict[str, dict[collections.Hashable, set]]
        uuid_cm = {}
        for tlabel in tlabel2classifications:
            uuid_cm[tlabel] = collections.defaultdict(set)
            for c in tlabel2classifications[tlabel]:
                uuid_cm[tlabel][c.max_label()].add(c.uuid)
            # convert sets to lists (sets are not JSON serializable)
            for plabel in uuid_cm[tlabel]:
                uuid_cm[tlabel][plabel] = list(uuid_cm[tlabel][plabel])
        with open(output_uuid_cm, 'w') as f:
            log.info("Saving UUID Confusion Matrix: %s", output_uuid_cm)
            json.dump(uuid_cm, f, indent=2, separators=(',', ': '))

    #
    # Create PR/ROC curves via scikit learn tools
    #
    if plot_filepath_pr:
        log.info("Making PR curve")
        make_pr_curves(tlabel2classifications, plot_filepath_pr,
                       plot_ci, plot_ci_alpha)
    if plot_filepath_roc:
        log.info("Making ROC curve")
        make_roc_curves(tlabel2classifications, plot_filepath_roc,
                        plot_ci, plot_ci_alpha)
def mb_kmeans_build_apply(index, mbkm, initial_fit_size):
    """
    Fit the MiniBatchKMeans centroids on the descriptors of the given index,
    then predict a cluster for every descriptor with the final model.

    If the given index is empty, no fitting or clustering occurs and an empty
    dictionary is returned.

    :param index: Index of descriptors
    :type index: smqtk.representation.DescriptorIndex

    :param mbkm: Scikit-Learn MiniBatchKMeans instance to train and then use
        for prediction
    :type mbkm: sklearn.cluster.MiniBatchKMeans

    :param initial_fit_size: Number of descriptors to run an initial fit with.
        This brings the advantage of choosing a best initialization point from
        multiple.
    :type initial_fit_size: int

    :return: Dictionary of the cluster label (integer) to the set of
        descriptor UUIDs belonging to that cluster.
    :rtype: dict[int, set[collections.Hashable]]

    """
    log = logging.getLogger(__name__)

    ifit_completed = False
    pending_keys = collections.deque()
    d_fitted = 0

    log.info("Getting index keys (shuffled)")
    # Sort first so the seeded shuffle below starts from the same ordering.
    index_keys = sorted(six.iterkeys(index))
    numpy.random.seed(mbkm.random_state)
    numpy.random.shuffle(index_keys)

    def parallel_iter_vectors(descriptors):
        """ Get the vectors for the descriptors given.
        Not caring about order returned.
        """
        def fetch_vector(descr):
            return descr.vector()

        return parallel.parallel_map(fetch_vector, descriptors,
                                     use_multiprocessing=False)

    def get_vectors(k_iter):
        """ Get numpy array of descriptor vectors (2D array returned) """
        descr_iter = index.get_many_descriptors(k_iter)
        return numpy.array(list(parallel_iter_vectors(descr_iter)))

    log.info("Collecting iteratively fitting model")
    pr = cli.ProgressReporter(log.debug, 1.0).start()
    for key in index_keys:
        pending_keys.append(key)
        pr.increment_report()

        if initial_fit_size and not ifit_completed:
            # Still collecting keys for the one-off initial full fit.
            if len(pending_keys) == initial_fit_size:
                log.info("Initial fit using %d descriptors",
                         len(pending_keys))
                log.info("- collecting vectors")
                vectors = get_vectors(pending_keys)
                log.info("- fitting model")
                mbkm.fit(vectors)
                log.info("- cleaning")
                d_fitted += len(vectors)
                pending_keys.clear()
                ifit_completed = True
        elif len(pending_keys) == mbkm.batch_size:
            log.info("Partial fit with batch size %d", len(pending_keys))
            log.info("- collecting vectors")
            vectors = get_vectors(pending_keys)
            log.info("- fitting model")
            mbkm.partial_fit(vectors)
            log.info("- cleaning")
            d_fitted += len(pending_keys)
            pending_keys.clear()
    pr.report()

    # Final fit with any remaining descriptors
    if pending_keys:
        log.info("Final partial fit of size %d", len(pending_keys))
        log.info('- collecting vectors')
        vectors = get_vectors(pending_keys)
        log.info('- fitting model')
        mbkm.partial_fit(vectors)
        log.info('- cleaning')
        d_fitted += len(pending_keys)
        pending_keys.clear()

    log.info("Computing descriptor classes with final KMeans model")
    mbkm.verbose = False
    d_classes = collections.defaultdict(set)

    def uuid_and_vector(descr):
        return descr.uuid(), descr.vector()

    def uuid_and_cluster(u_v):
        # ``predict`` is given a single-row 2D array; take its one result.
        return u_v[0], mbkm.predict(u_v[1][numpy.newaxis, :])[0]

    d_uv_iter = parallel.parallel_map(uuid_and_vector,
                                      index,
                                      use_multiprocessing=False,
                                      name="uv-collector")
    # TODO: Batch predict call inputs to something larger than one at a time.
    d_uc_iter = parallel.parallel_map(uuid_and_cluster,
                                      d_uv_iter,
                                      use_multiprocessing=False,
                                      name="uc-collector")
    pr = cli.ProgressReporter(log.debug, 1.0).start()
    for uuid, cluster in d_uc_iter:
        d_classes[cluster].add(uuid)
        pr.increment_report()
    pr.report()

    return d_classes
def compute_hash_codes(uuids, index, functor, report_interval=1.0,
                       use_mp=False, ordered=False):
    """
    Given an iterable of DescriptorElement UUIDs, asynchronously fetch each
    descriptor from the given ``index``, compute its hash code via
    ``functor`` and convert it to an integer, yielding (UUID, hash-int)
    pairs.

    :param uuids: Sequence of UUIDs to process
    :type uuids: collections.Iterable[collections.Hashable]

    :param index: Descriptor index to pull from.
    :type index: smqtk.representation.descriptor_index.DescriptorIndex

    :param functor: LSH hash code functor instance
    :type functor: smqtk.algorithms.LshFunctor

    :param report_interval: Frequency in seconds at which we report speed and
        completion progress via logging.  Reporting is disabled when the
        logger's effective level is above debug or when this value is not
        greater than 0.
    :type report_interval: float

    :param use_mp: If multiprocessing should be used for parallel
        computation vs. threading.  Reminder: This will copy currently loaded
        objects onto worker processes (e.g. the given index), which could
        lead to dangerously high RAM consumption.
    :type use_mp: bool

    :param ordered: If the pairs yielded should be in the same order as the
        UUID values input.  This function should be slightly faster when
        ordering is not required.
    :type ordered: bool

    :return: Generator instance yielding (UUID, int) value pairs.

    """
    # TODO: parallel map fetch elements from index?
    #       -> separately from compute

    def get_hash(u):
        vec = index.get_descriptor(u).vector()
        return u, bit_utils.bit_vector_to_int_large(functor.get_hash(vec))

    # Setup log and reporting function
    log = logging.getLogger(__name__)

    reporting_disabled = (log.getEffectiveLevel() > logging.DEBUG
                          or report_interval <= 0)
    if reporting_disabled:
        # No-op sink so the reporter can be driven unconditionally below.
        log_func = lambda *_, **__: None
        log.debug("Not logging progress")
    else:
        log.debug("Logging progress at %f second intervals", report_interval)
        log_func = log.debug

    log.debug("Starting computation")
    reporter = bin_utils.ProgressReporter(log_func, report_interval)
    reporter.start()
    hash_iter = parallel.parallel_map(get_hash, uuids,
                                      ordered=ordered,
                                      use_multiprocessing=use_mp)
    for elem_uuid, hash_value in hash_iter:
        yield (elem_uuid, hash_value)
        # Progress reporting
        reporter.increment_report()

    # Final report
    reporter.report()
def compute_hash_codes(uuids, index, functor, hash2uuids=None,
                       report_interval=1.0, use_mp=False):
    """
    Given an iterable of DescriptorElement UUIDs, asynchronously access them
    from the given ``index``, asynchronously compute hash codes via
    ``functor`` and convert to an integer, collecting the UUIDs into a
    mapping keyed by hash integer.

    The dictionary input and returned is of the same format used by the
    ``LSHNearestNeighborIndex`` implementation (mapping pointed to by the
    ``hash2uuid_cache_filepath`` attribute).

    :param uuids: Sequence of UUIDs to process
    :type uuids: collections.Iterable[collections.Hashable]

    :param index: Descriptor index to pull from.
    :type index: smqtk.representation.descriptor_index.DescriptorIndex

    :param functor: LSH hash code functor instance
    :type functor: smqtk.algorithms.LshFunctor

    :param hash2uuids: Hash code to UUID set to update in place, which is
        also returned from this function. If not provided, we will start a
        new mapping, which is returned instead.
    :type hash2uuids: dict[int|long, set[collections.Hashable]]

    :param report_interval: Frequency in seconds at which we report speed
        and completion progress via logging. Reporting is disabled when the
        effective logging level is above debug, or when this value is not
        greater than 0.
    :type report_interval: float

    :param use_mp: If multiprocessing should be used for parallel
        computation vs. threading. Reminder: This will copy currently loaded
        objects onto worker processes (e.g. the given index), which could
        lead to dangerously high RAM consumption.
    :type use_mp: bool

    :return: The ``hash2uuids`` mapping provided or, if None was provided, a
        new mapping.
    :rtype: dict[int|long, set[collections.Hashable]]

    """
    if hash2uuids is None:
        hash2uuids = {}

    # TODO: parallel map fetch elements from index?
    #       -> separately from compute
    def get_hash(u):
        v = index.get_descriptor(u).vector()
        return u, bit_utils.bit_vector_to_int_large(functor.get_hash(v))

    # Setup log and reporting function
    log = logging.getLogger(__name__)
    report_state = [0] * 7

    if log.getEffectiveLevel() > logging.DEBUG or report_interval <= 0:
        # Reporting disabled: swap in a no-op with a compatible signature.
        def report_progress(*_):
            return
        log.debug("Not logging progress")
    else:
        log.debug("Logging progress at %f second intervals", report_interval)
        report_progress = bin_utils.report_progress

    log.debug("Starting computation")
    for uuid, hash_int in parallel.parallel_map(get_hash, uuids,
                                                ordered=False,
                                                use_multiprocessing=use_mp):
        # Create the UUID set on first sight of a hash code, then add.
        hash2uuids.setdefault(hash_int, set()).add(uuid)

        # Progress reporting
        report_progress(log.debug, report_state, report_interval)

    # Final report
    # NOTE(review): the decrement presumably cancels the count increment the
    # forced final call performs -- confirm against bin_utils.report_progress.
    report_state[1] -= 1
    report_progress(log.debug, report_state, 0.0)

    return hash2uuids
def parallel_iter_vectors(descriptors):
    """
    Produce the vector of every given descriptor through a threaded
    (non-multiprocessing) parallel map.

    No particular ordering of the returned vectors is requested.
    """
    def vector_of(descr):
        return descr.vector()

    return parallel.parallel_map(vector_of, descriptors,
                                 use_multiprocessing=False)
def compute_hash_codes(uuids, index, functor, hash2uuids=None,
                       report_interval=1.0, use_mp=False):
    """
    Compute LSH hash codes for the descriptors named by ``uuids`` and group
    the UUIDs by their integer hash code.

    Each UUID is fetched from ``index``, its vector is hashed by ``functor``
    and the bit vector is converted to an integer; work is dispatched through
    ``parallel.parallel_map`` with ``ordered=False``.

    The dictionary input and returned is of the same format used by the
    ``LSHNearestNeighborIndex`` implementation (mapping pointed to by the
    ``hash2uuid_cache_filepath`` attribute).

    :param uuids: Sequence of UUIDs to process
    :type uuids: collections.Iterable[collections.Hashable]

    :param index: Descriptor index to pull from.
    :type index: smqtk.representation.descriptor_index.DescriptorIndex

    :param functor: LSH hash code functor instance
    :type functor: smqtk.algorithms.LshFunctor

    :param hash2uuids: Hash code to UUID set mapping to update in place. It
        is also the return value. When None, a new mapping is created and
        returned instead.
    :type hash2uuids: dict[int|long, set[collections.Hashable]]

    :param report_interval: Frequency in seconds at which speed and
        completion progress is reported via logging. Reporting is disabled
        when the effective logging level is above debug, or when this value
        is not greater than 0.
    :type report_interval: float

    :param use_mp: If multiprocessing should be used for parallel
        computation vs. threading. Reminder: This will copy currently loaded
        objects onto worker processes (e.g. the given index), which could
        lead to dangerously high RAM consumption.
    :type use_mp: bool

    :return: The ``hash2uuids`` mapping provided or, if None was provided, a
        new mapping.
    :rtype: dict[int|long, set[collections.Hashable]]

    """
    if hash2uuids is None:
        hash2uuids = {}

    # TODO: parallel map fetch elements from index?
    #       -> separately from compute
    def get_hash(u):
        v = index.get_descriptor(u).vector()
        return u, bit_utils.bit_vector_to_int_large(functor.get_hash(v))

    # Setup log and reporting function
    log = logging.getLogger(__name__)
    report_state = [0] * 7

    if log.getEffectiveLevel() > logging.DEBUG or report_interval <= 0:
        # No-op stand-in so the loop below can call it unconditionally.
        def report_progress(*_):
            return
        log.debug("Not logging progress")
    else:
        log.debug("Logging progress at %f second intervals", report_interval)
        report_progress = bin_utils.report_progress

    log.debug("Starting computation")
    hash_pairs = parallel.parallel_map(get_hash, uuids, ordered=False,
                                       use_multiprocessing=use_mp)
    for uuid, hash_int in hash_pairs:
        # setdefault creates the bucket the first time a hash code is seen.
        hash2uuids.setdefault(hash_int, set()).add(uuid)

        # Progress reporting
        report_progress(log.debug, report_state, report_interval)

    # Final report
    # NOTE(review): decrement presumably offsets the increment done by the
    # forced final call -- verify against bin_utils.report_progress.
    report_state[1] -= 1
    report_progress(log.debug, report_state, 0.0)

    return hash2uuids
# Read both input CSV files fully into memory.
log.info("Loading resource files")
with open(ad_image_csv) as f:
    # Rows are kept as lists; only column 0 (the image URL) is used below.
    url_ad_label_rows = list(csv.reader(f))
with open(ad_phone_csv) as f:
    # dict() over the reader requires every row to have exactly two columns.
    ad2phone = dict(csv.reader(f))

# Download unique img_urls, get filepaths + SHA1 checksum
url_set = set(r[0] for r in url_ad_label_rows)
print "%d unique URLs" % len(url_set)
# URL to (filepath, sha1sum) tuple
#: :type: dict[str, (str, str)]
url2fs = {}
# Each result is a (url, save path, sha1) triple; a falsy ``url`` is skipped
# (presumably marks a failed download -- confirm against dl_ad_image).
for url, save_pth, sha1 in parallel_map(dl_ad_image, url_set,
                                        itertools.repeat(image_output_dir),
                                        name='image_downloader',
                                        use_multiprocessing=True,
                                        # cores=32,
                                        cores=128,
                                        ):
    if url:
        url2fs[url] = (save_pth, sha1)
log.info("Downloaded %d images", len(url2fs))

log.info("Forming relational mappings")
# save mapping of SHA1 to filepath
# save mapping of CDR-ID to set of child image SHA1s
# save mapping of CDR-ID to label
#: :type: dict[str, str]
sha2path = {}
#: :type: dict[str, set[str]]
ad2shas = collections.defaultdict(set)
def main():
    """
    Validate, or optionally train, the configured classifier using labeled
    descriptors, then output confusion matrix, PR and ROC results.

    Steps, in order:

    1. Build the classifier, classification element factory and descriptor
       index from the loaded configuration.
    2. Read ``uuid,label`` rows from the configured CSV file and fetch the
       descriptor of every UUID through a parallel map.
    3. If training is enabled, train the classifier and exit the process
       with status 0.
    4. Otherwise classify all descriptors per truth label, log the truth
       label counts and confusion matrix, and write whichever plot / JSON
       outputs are configured.

    :raises ValueError: Training was requested but the configured classifier
        is not a ``SupervisedClassifier``.

    """
    args = cli_parser().parse_args()
    config = bin_utils.utility_main_helper(default_config, args)
    log = logging.getLogger(__name__)

    #
    # Initialize stuff from configuration
    #
    #: :type: smqtk.algorithms.Classifier
    classifier = plugin.from_plugin_config(config['plugins']['classifier'],
                                           get_classifier_impls())
    #: :type: ClassificationElementFactory
    classification_factory = ClassificationElementFactory.from_config(
        config['plugins']['classification_factory'])
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(
        config['plugins']['descriptor_index'], get_descriptor_index_impls())

    uuid2label_filepath = config['utility']['csv_filepath']
    do_train = config['utility']['train']
    output_uuid_cm = config['utility']['output_uuid_confusion_matrix']
    plot_filepath_pr = config['utility']['output_plot_pr']
    plot_filepath_roc = config['utility']['output_plot_roc']
    plot_filepath_cm = config['utility']['output_plot_confusion_matrix']
    plot_ci = config['utility']['curve_confidence_interval']
    plot_ci_alpha = config['utility']['curve_confidence_interval_alpha']

    #
    # Construct mapping of label to the DescriptorElement instances for that
    # described by that label.
    #
    log.info("Loading descriptors by UUID")

    def iter_uuid_label():
        """ Iterate through UUIDs in specified file """
        with open(uuid2label_filepath) as uuid2label_file:
            reader = csv.reader(uuid2label_file)
            for r in reader:
                # TODO: This will need to be updated to handle multiple
                #       labels per descriptor.
                yield r[0], r[1]

    def get_descr(r):
        """ Fetch descriptors from configured index """
        uuid, truth_label = r
        return truth_label, descriptor_index.get_descriptor(uuid)

    tlabel_element_iter = parallel.parallel_map(
        get_descr, iter_uuid_label(),
        name="cmv_get_descriptors",
        use_multiprocessing=True,
        cores=config['parallelism']['descriptor_fetch_cores'],
    )

    # Map of truth labels to descriptors of labeled data
    #: :type: dict[str, list[smqtk.representation.DescriptorElement]]
    tlabel2descriptors = {}
    for tlabel, d in tlabel_element_iter:
        tlabel2descriptors.setdefault(tlabel, []).append(d)

    # Train classifier if training was enabled. Only supervised classifiers
    # can be trained; anything else is a configuration error.
    if do_train:
        if not isinstance(classifier, SupervisedClassifier):
            # Previously this exception was built but never raised, letting
            # execution silently continue into the validation steps.
            raise ValueError("Configured classifier is not a "
                             "SupervisedClassifier type and does not "
                             "support training.")
        log.info("Training classifier model")
        classifier.train(tlabel2descriptors)
        exit(0)

    #
    # Apply classifier to descriptors for predictions
    #

    # Truth label to predicted classification results
    #: :type: dict[str, set[smqtk.representation.ClassificationElement]]
    tlabel2classifications = {}
    for tlabel, descriptors in tlabel2descriptors.items():
        tlabel2classifications[tlabel] = \
            set(classifier.classify_async(
                descriptors, classification_factory,
                use_multiprocessing=True,
                procs=config['parallelism']['classification_cores'],
                ri=1.0,
            ).values())
    log.info("Truth label counts:")
    for l in sorted(tlabel2classifications):
        log.info("  %s :: %d", l, len(tlabel2classifications[l]))

    #
    # Confusion Matrix
    #
    conf_mat, labels = gen_confusion_matrix(tlabel2classifications)
    log.info("Confusion_matrix")
    log_cm(log.info, conf_mat, labels)
    if plot_filepath_cm:
        plot_cm(conf_mat, labels, plot_filepath_cm)

    # CM of descriptor UUIDs to output json
    if output_uuid_cm:
        # Top dictionary keys are true labels, inner dictionary keys are UUID
        # predicted labels.
        log.info("Computing UUID Confusion Matrix")
        #: :type: dict[str, dict[str, set | list]]
        uuid_cm = {}
        for tlabel in tlabel2classifications:
            uuid_cm[tlabel] = collections.defaultdict(set)
            for c in tlabel2classifications[tlabel]:
                uuid_cm[tlabel][c.max_label()].add(c.uuid)
            # convert sets to lists (sets are not JSON serializable)
            for plabel in uuid_cm[tlabel]:
                uuid_cm[tlabel][plabel] = list(uuid_cm[tlabel][plabel])
        with open(output_uuid_cm, 'w') as f:
            log.info("Saving UUID Confusion Matrix: %s", output_uuid_cm)
            json.dump(uuid_cm, f, indent=2, separators=(',', ': '))

    #
    # Create PR/ROC curves via scikit learn tools
    #
    if plot_filepath_pr:
        log.info("Making PR curve")
        make_pr_curves(tlabel2classifications, plot_filepath_pr,
                       plot_ci, plot_ci_alpha)
    if plot_filepath_roc:
        log.info("Making ROC curve")
        make_roc_curves(tlabel2classifications, plot_filepath_roc,
                        plot_ci, plot_ci_alpha)
def classify_async(self, d_iter, factory, overwrite=False, procs=None,
                   use_multiprocessing=False, ri=None):
    """
    Classify every DescriptorElement of an iterable through a parallel map
    and collect the results into a dictionary.

    :param d_iter: Iterable of DescriptorElements
    :type d_iter:
        collections.Iterable[smqtk.representation.DescriptorElement]

    :param factory: Classifier element factory to use for element
        generation.
    :type factory: smqtk.representation.ClassificationElementFactory

    :param overwrite: Recompute classification of the input descriptor and
        set the results to the ClassificationElement produced by the
        factory.
    :type overwrite: bool

    :param procs: Explicit number of cores/thread/processes to use.
    :type procs: None | int

    :param use_multiprocessing: Use ``multiprocessing.pool.Pool`` instead of
        ``multiprocessing.pool.ThreadPool``.
    :type use_multiprocessing: bool

    :param ri: Progress reporting interval in seconds. Set to a value > 0 to
        enable. Disabled by default.
    :type ri: float | None

    :return: Mapping of input DescriptorElement instances to the computed
        ClassificationElement. ClassificationElement UUID's are congruent
        with the UUID of the DescriptorElement
    :rtype: dict[smqtk.representation.DescriptorElement,
                 smqtk.representation.ClassificationElement]

    """
    self._log.debug("Async classifying descriptors")

    def classify_pair(descr):
        return descr, self.classify(descr, factory, overwrite)

    def no_report(*_):
        return

    # Reporting is only active for a truthy, strictly positive interval.
    reporting = bool(ri and ri > 0)
    report = bin_utils.report_progress if reporting else no_report

    pair_iter = parallel.parallel_map(
        classify_pair, d_iter,
        cores=procs,
        ordered=False,
        use_multiprocessing=use_multiprocessing,
    )

    state = [0] * 7
    d2c_map = {}
    for descr, c_elem in pair_iter:
        d2c_map[descr] = c_elem
        report(self._log.debug, state, ri)

    return d2c_map
def main():
    """
    Classify the descriptors named in a UUID list file and write the
    results to CSV.

    UUIDs (one per line) are mapped to descriptors from the configured
    index, the descriptors are classified by the configured classifier, and
    two files are written: a single-row CSV header file (``uuid`` followed
    by the classifier labels) and a CSV data file with one row per
    classification (UUID followed by the value for each label, in header
    order).

    :raises ValueError: The UUID list path is missing or is not a file, or
        either CSV output path was not provided.

    """
    args = cli_parser().parse_args()
    config = bin_utils.utility_main_helper(default_config, args)
    log = logging.getLogger(__name__)

    # - parallel_map UUIDs to load from the configured index
    # - classify iterated descriptors

    uuids_list_filepath = args.uuids_list
    output_csv_filepath = args.csv_data
    output_csv_header_filepath = args.csv_header
    classify_overwrite = config['utility']['classify_overwrite']

    p_use_multiprocessing = \
        config['utility']['parallel']['use_multiprocessing']
    p_index_extraction_cores = \
        config['utility']['parallel']['index_extraction_cores']
    p_classification_cores = \
        config['utility']['parallel']['classification_cores']

    if not uuids_list_filepath:
        raise ValueError("No uuids_list_filepath specified.")
    elif not os.path.isfile(uuids_list_filepath):
        raise ValueError("Given uuids_list_filepath did not point to a file.")
    if output_csv_header_filepath is None:
        raise ValueError("Need a path to save CSV header labels")
    if output_csv_filepath is None:
        raise ValueError("Need a path to save CSV data.")

    #
    # Initialize configured plugins
    #

    log.info("Initializing descriptor index")
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(
        config['plugins']['descriptor_index'],
        get_descriptor_index_impls())

    log.info("Initializing classification factory")
    c_factory = ClassificationElementFactory.from_config(
        config['plugins']['classification_factory'])

    log.info("Initializing classifier")
    #: :type: smqtk.algorithms.Classifier
    classifier = plugin.from_plugin_config(config['plugins']['classifier'],
                                           get_classifier_impls())

    #
    # Setup/Process
    #
    def iter_uuids():
        with open(uuids_list_filepath) as f:
            for l in f:
                yield l.strip()

    def descr_for_uuid(uuid):
        """
        :type uuid: collections.Hashable
        :rtype: smqtk.representation.DescriptorElement
        """
        return descriptor_index.get_descriptor(uuid)

    def classify_descr(d):
        """
        :type d: smqtk.representation.DescriptorElement
        :rtype: smqtk.representation.ClassificationElement
        """
        return classifier.classify(d, c_factory, classify_overwrite)

    log.info("Initializing uuid-to-descriptor parallel map")
    #: :type: collections.Iterable[smqtk.representation.DescriptorElement]
    element_iter = parallel.parallel_map(
        descr_for_uuid, iter_uuids(),
        use_multiprocessing=p_use_multiprocessing,
        cores=p_index_extraction_cores,
        name="descr_for_uuid",
    )

    log.info("Initializing descriptor-to-classification parallel map")
    #: :type: collections.Iterable[smqtk.representation.ClassificationElement]
    classification_iter = parallel.parallel_map(
        classify_descr, element_iter,
        use_multiprocessing=p_use_multiprocessing,
        cores=p_classification_cores,
        name='classify_descr',
    )

    #
    # Write/Output files
    #

    c_labels = classifier.get_labels()

    def make_row(c):
        """
        :type c: smqtk.representation.ClassificationElement
        """
        c_m = c.get_classification()
        return [c.uuid] + [c_m[l] for l in c_labels]

    # column labels file
    log.info("Writing CSV column header file: %s", output_csv_header_filepath)
    file_utils.safe_create_dir(os.path.dirname(output_csv_header_filepath))
    with open(output_csv_header_filepath, 'wb') as f_csv:
        w = csv.writer(f_csv)
        # Build the header from an explicit list of stringified labels:
        # ``['uuid'] + c_labels`` raises TypeError when ``get_labels()``
        # returns anything other than a list.
        w.writerow(['uuid'] + [str(cl) for cl in c_labels])

    # CSV file
    log.info("Writing CSV data file: %s", output_csv_filepath)
    file_utils.safe_create_dir(os.path.dirname(output_csv_filepath))
    r_state = [0] * 7
    with open(output_csv_filepath, 'wb') as f_csv:
        w = csv.writer(f_csv)
        for c in classification_iter:
            w.writerow(make_row(c))
            bin_utils.report_progress(log.info, r_state, 1.0)

    # Final report
    # NOTE(review): decrement presumably offsets the increment done by the
    # forced final call -- verify against bin_utils.report_progress.
    r_state[1] -= 1
    bin_utils.report_progress(log.info, r_state, 0)

    log.info("Done")
def main():
    """
    Classify descriptors listed by UUID and dump the classifications to CSV.

    The UUID list file (one UUID per line) feeds a parallel map that loads
    descriptors from the configured index; a second parallel map classifies
    them. A header CSV (``uuid`` plus stringified classifier labels) and a
    data CSV (one row per classification, values in label order) are then
    written.

    :raises ValueError: The UUID list path is empty or not a file, or one of
        the CSV output paths is None.

    """
    args = cli_parser().parse_args()
    config = bin_utils.utility_main_helper(default_config, args)
    log = logging.getLogger(__name__)

    # Command line inputs
    uuids_list_filepath = args.uuids_list
    output_csv_filepath = args.csv_data
    output_csv_header_filepath = args.csv_header

    # Configuration inputs
    utility_cfg = config['utility']
    classify_overwrite = utility_cfg['classify_overwrite']
    parallel_cfg = utility_cfg['parallel']
    p_use_multiprocessing = parallel_cfg['use_multiprocessing']
    p_index_extraction_cores = parallel_cfg['index_extraction_cores']
    p_classification_cores = parallel_cfg['classification_cores']

    # Input validation
    if not uuids_list_filepath:
        raise ValueError("No uuids_list_filepath specified.")
    if not os.path.isfile(uuids_list_filepath):
        raise ValueError("Given uuids_list_filepath did not point to a file.")
    if output_csv_header_filepath is None:
        raise ValueError("Need a path to save CSV header labels")
    if output_csv_filepath is None:
        raise ValueError("Need a path to save CSV data.")

    #
    # Initialize configured plugins
    #
    plugins_cfg = config['plugins']

    log.info("Initializing descriptor index")
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(
        plugins_cfg['descriptor_index'], get_descriptor_index_impls()
    )

    log.info("Initializing classification factory")
    c_factory = ClassificationElementFactory.from_config(
        plugins_cfg['classification_factory']
    )

    log.info("Initializing classifier")
    #: :type: smqtk.algorithms.Classifier
    classifier = plugin.from_plugin_config(
        plugins_cfg['classifier'], get_classifier_impls()
    )

    #
    # Setup/Process
    #
    def iter_uuids():
        with open(uuids_list_filepath) as uuid_file:
            for line in uuid_file:
                yield line.strip()

    def descr_for_uuid(uuid):
        """
        :type uuid: collections.Hashable
        :rtype: smqtk.representation.DescriptorElement
        """
        return descriptor_index.get_descriptor(uuid)

    def classify_descr(d):
        """
        :type d: smqtk.representation.DescriptorElement
        :rtype: smqtk.representation.ClassificationElement
        """
        return classifier.classify(d, c_factory, classify_overwrite)

    log.info("Initializing uuid-to-descriptor parallel map")
    #: :type: collections.Iterable[smqtk.representation.DescriptorElement]
    element_iter = parallel.parallel_map(
        descr_for_uuid, iter_uuids(),
        use_multiprocessing=p_use_multiprocessing,
        cores=p_index_extraction_cores,
        name="descr_for_uuid",
    )

    log.info("Initializing descriptor-to-classification parallel map")
    #: :type: collections.Iterable[smqtk.representation.ClassificationElement]
    classification_iter = parallel.parallel_map(
        classify_descr, element_iter,
        use_multiprocessing=p_use_multiprocessing,
        cores=p_classification_cores,
        name='classify_descr',
    )

    #
    # Write/Output files
    #
    c_labels = classifier.get_labels()

    def make_row(e):
        """
        :type e: smqtk.representation.ClassificationElement
        """
        label_values = e.get_classification()
        row = [e.uuid]
        for label in c_labels:
            row.append(label_values[label])
        return row

    # column labels file
    log.info("Writing CSV column header file: %s", output_csv_header_filepath)
    file_utils.safe_create_dir(os.path.dirname(output_csv_header_filepath))
    header_row = ['uuid'] + [str(cl) for cl in c_labels]
    with open(output_csv_header_filepath, 'wb') as header_file:
        csv.writer(header_file).writerow(header_row)

    # CSV file
    log.info("Writing CSV data file: %s", output_csv_filepath)
    file_utils.safe_create_dir(os.path.dirname(output_csv_filepath))
    r_state = [0] * 7
    with open(output_csv_filepath, 'wb') as data_file:
        data_writer = csv.writer(data_file)
        for c_elem in classification_iter:
            data_writer.writerow(make_row(c_elem))
            bin_utils.report_progress(log.info, r_state, 1.0)

    # Final report
    r_state[1] -= 1
    bin_utils.report_progress(log.info, r_state, 0)

    log.info("Done")
while i < total: b_start = i b_end = i + batch_size for h in q[b_start:b_end].execute(): # noinspection PyProtectedMember yield h.meta._d_ i += 1 except elasticsearch.ConnectionTimeout, ex: log.warning("ElasticSearch timed out (error = %s)", str(ex)) restart = True log.debug("Restarting query from index %d", i) log.info("Initializing image download/record parallel iterator") img_dl_records = parallel_map(dl_image, iter_scan_meta(), name='image_download', use_multiprocessing=True, cores=cores) # Write out log.info("Starting iteration/file-write") rp_state = [0] * 7 with open(scan_record, 'w') as record_file: for r in img_dl_records: if r is not None: cdr_id, local_path, uuid = r record_file.write('%s,%s,%s\n' % (cdr_id, local_path, uuid)) report_progress(log.debug, rp_state, 1.0) # Final report rp_state[1] -= 1 report_progress(log.debug, rp_state, 0)
def compute_distance_kernel(m, dist_func, row_wise=False, parallel=True):
    """
    Method for computing the distance kernel of an array of vectors given a
    distance function that works on two supplied 1D arrays. For a valid
    distance function interface, see
    ``smqtk.utils.distance_functions.histogram_intersection_distance2``.

    The diagonal of the result is set to 0 without calling ``dist_func``.
    A 1D input is treated as a single row vector.

    :param m: An array of vectors to compute the pairwise distance kernel
        for.
    :type m: numpy.core.multiarray.ndarray

    :param dist_func: Distance function
    :type dist_func: (ndarray, ndarray) -> ndarray[float] | float

    :param row_wise: If the given distance function can take a vector and a
        matrix, and computes pair-wise distances, returning a vector of
        distances between the given vector and each row of the matrix.
    :type row_wise: bool

    :param parallel: If distances should be calculated in parallel. This is
        true by default.
    :type parallel: bool

    :return: Computed symmetric distance kernel
    :rtype: numpy.core.multiarray.ndarray

    """
    # Derive a readable name of the distance function for the logger name.
    if hasattr(dist_func, 'im_func'):
        # noinspection PyUnresolvedReferences
        distance_name = '.'.join([
            dist_func.__module__,
            dist_func.im_class.__name__,
            dist_func.im_func.func_name
        ])
    elif hasattr(dist_func, 'func_name'):
        # noinspection PyUnresolvedReferences
        distance_name = '.'.join([dist_func.__module__, dist_func.func_name])
    elif hasattr(dist_func, 'py_func') \
            and hasattr(dist_func.py_func, 'func_name'):
        distance_name = '.'.join(
            [dist_func.__module__, dist_func.py_func.func_name])
    else:
        distance_name = "<unknown>"
    log = logging.getLogger('compute_distance_kernel[%s]' % distance_name)

    if m.ndim == 1:
        m = m[np.newaxis]

    log.info("Computing distance kernel")

    side = m.shape[0]
    # Uninitialized allocation is fine: every cell is written below.
    mat = np.ndarray((side, side), dtype=float)

    # Select the per-row worker once; it is shared by the parallel and the
    # serial paths. Each worker writes into ``mat`` in place.
    if row_wise:
        log.debug("Computing row-wise distances")

        def work_func(i):
            mat[i, i] = 0.
            # The last row has nothing to its right; everything for it was
            # already filled in by the earlier rows.
            if i < (side - 1):
                mat[i + 1:, i] = mat[i, i + 1:] = \
                    dist_func(m[i, :], m[i + 1:, :])
    else:
        log.debug("Computing element-wise distances")

        def work_func(i):
            mat[i, i] = 0
            # cols to the left of diagonal index for this row
            for j in range(i):
                mat[i, j] = mat[j, i] = dist_func(m[i], m[j])

    if parallel:
        # Using threading for in-place modification
        s = [0] * 7
        for _ in parallel_map(work_func, range(side),
                              use_multiprocessing=False):
            report_progress(log.debug, s, 1.)
    else:
        for i in range(side):
            work_func(i)

    return mat
def compute_distance_kernel(m, dist_func, row_wise=False, parallel=True):
    """
    Build the symmetric pairwise distance kernel of an array of vectors
    using the supplied distance function. For a valid distance function
    interface, see
    ``smqtk.utils.distance_functions.histogram_intersection_distance2``.

    Diagonal entries are computed as ``dist_func(m[i], m[i])``. A 1D input
    is treated as a single row vector.

    :param m: An array of vectors to compute the pairwise distance kernel
        for.
    :type m: numpy.ndarray

    :param dist_func: Distance function
    :type dist_func: (ndarray, ndarray) -> ndarray[float] | float

    :param row_wise: If the given distance function can take a vector and a
        matrix, and computes pair-wise distances, returning a vector of
        distances between the given vector and each row of the matrix.
    :type row_wise: bool

    :param parallel: If distances should be calculated in parallel. This is
        true by default.
    :type parallel: bool

    :return: Computed symmetric distance kernel
    :rtype: numpy.ndarray

    """
    log = logging.getLogger(__name__)

    if m.ndim == 1:
        m = m[np.newaxis]

    log.info("Computing distance kernel")

    side = m.shape[0]
    mat = np.ndarray((side, side), dtype=float)
    progress_state = [0] * 7

    # Choose how a single row of the kernel gets filled in.
    if row_wise:
        log.debug("Computing row-wise distances")

        # noinspection PyShadowingNames
        def fill_row(row):
            mat[row, row] = dist_func(m[row], m[row])
            # All distances involving the last row are already known by the
            # time it is reached.
            if row < (side - 1):
                below = row + 1
                mat[below:, row] = mat[row, below:] = \
                    dist_func(m[row, :], m[below:, :])
    else:
        log.debug("Computing element-wise distances")

        # noinspection PyShadowingNames
        def fill_row(row):
            mat[row, row] = dist_func(m[row], m[row])
            # cols to the left of diagonal index for this row
            for col in range(row):
                mat[row, col] = mat[col, row] = dist_func(m[row], m[col])

    if parallel:
        # Using threading for in-place modification
        for _ in parallel_map(fill_row, range(side),
                              use_multiprocessing=False):
            report_progress(log.debug, progress_state, 1.)
    else:
        for row in range(side):
            fill_row(row)
            report_progress(log.debug, progress_state, 1.)

    return mat
def classify_async(self, d_iter, factory, overwrite=False, procs=None,
                   use_multiprocessing=False, ri=None):
    """
    Asynchronously classify the given DescriptorElements, returning a
    descriptor-to-classification mapping.

    :param d_iter: Iterable of DescriptorElements
    :type d_iter:
        collections.Iterable[smqtk.representation.DescriptorElement]

    :param factory: Classifier element factory to use for element
        generation.
    :type factory: smqtk.representation.ClassificationElementFactory

    :param overwrite: Recompute classification of the input descriptor and
        set the results to the ClassificationElement produced by the
        factory.
    :type overwrite: bool

    :param procs: Explicit number of cores/thread/processes to use.
    :type procs: None | int

    :param use_multiprocessing: Use ``multiprocessing.pool.Pool`` instead of
        ``multiprocessing.pool.ThreadPool``.
    :type use_multiprocessing: bool

    :param ri: Progress reporting interval in seconds. Set to a value > 0 to
        enable. Disabled by default.
    :type ri: float | None

    :return: Mapping of input DescriptorElement instances to the computed
        ClassificationElement. ClassificationElement UUID's are congruent
        with the UUID of the DescriptorElement
    :rtype: dict[smqtk.representation.DescriptorElement,
                 smqtk.representation.ClassificationElement]

    """
    self._log.debug("Async classifying descriptors")

    # Collapse every "disabled" spelling (None, 0, negative) to None.
    if not (ri and ri > 0):
        ri = None

    def _classify_one(element):
        classification = self.classify(element, factory, overwrite)
        return element, classification

    classified = parallel.parallel_map(
        _classify_one, d_iter,
        cores=procs,
        ordered=False,
        use_multiprocessing=use_multiprocessing,
    )

    progress = [0] * 7
    if ri is None:
        def notify(*_):
            return
    else:
        notify = bin_utils.report_progress

    d2c_map = {}
    for element, classification in classified:
        d2c_map[element] = classification
        notify(self._log.debug, progress, ri)

    return d2c_map
def test_simple_unordered_threaded(self):
    """
    A threaded ``parallel_map`` with ``ordered=False`` produces the expected
    collection of results.
    """
    result_iter = parallel_map(self.test_func, self.test_string,
                               ordered=False, use_multiprocessing=False)
    produced = list(result_iter)
    # Output order was not requested, so only set membership is compared.
    nose.tools.assert_equal(set(produced), set(self.expected))
def main():
    """
    Entry point: validate, or optionally train, the configured classifier
    using labeled descriptors, then output confusion matrix, PR and ROC
    results. The user-facing help text is the ``description`` string below.

    :raises ValueError: Training was requested but the configured classifier
        is not a ``SupervisedClassifier``.

    """
    description = """
    Utility for validating a given classifier implementation's model against
    some labeled testing data, outputting PR and ROC curve plots with
    area-under-curve score values.

    This utility can optionally be used train a supervised classifier model if
    the given classifier model configuration does not exist and a second CSV
    file listing labeled training data is provided. Training will be attempted
    if ``train`` is set to true. If training is performed, we exit after
    training completes. A ``SupervisedClassifier`` sub-classing implementation
    must be configured

    We expect the test and train CSV files in the column format:

        ...
        <UUID>,<label>
        ...

    The UUID is of the descriptor to which the label applies. The label may be
    any arbitrary string value, but all labels must be consistent in
    application.

    Some metrics presented assume the highest confidence class as the single
    predicted class for an element:

        - confusion matrix

    The output UUID confusion matrix is a JSON dictionary where the top-level
    keys are the true labels, and the inner dictionary is the mapping of
    predicted labels to the UUIDs of the classifications/descriptors that
    yielded the prediction. Again, this is based on the maximum probability
    label for a classification result (T=0.5).
    """
    args, config = bin_utils.utility_main_helper(default_config, description)
    log = logging.getLogger(__name__)

    #
    # Initialize stuff from configuration
    #
    #: :type: smqtk.algorithms.Classifier
    classifier = plugin.from_plugin_config(
        config['plugins']['classifier'],
        get_classifier_impls()
    )
    #: :type: ClassificationElementFactory
    classification_factory = ClassificationElementFactory.from_config(
        config['plugins']['classification_factory']
    )
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(
        config['plugins']['descriptor_index'],
        get_descriptor_index_impls()
    )

    uuid2label_filepath = config['utility']['csv_filepath']
    do_train = config['utility']['train']
    output_uuid_cm = config['utility']['output_uuid_confusion_matrix']
    plot_filepath_pr = config['utility']['output_plot_pr']
    plot_filepath_roc = config['utility']['output_plot_roc']
    plot_filepath_cm = config['utility']['output_plot_confusion_matrix']
    plot_ci = config['utility']['curve_confidence_interval']
    plot_ci_alpha = config['utility']['curve_confidence_interval_alpha']

    #
    # Construct mapping of label to the DescriptorElement instances for that
    # described by that label.
    #
    log.info("Loading descriptors by UUID")

    def iter_uuid_label():
        """ Iterate through UUIDs in specified file """
        with open(uuid2label_filepath) as uuid2label_file:
            reader = csv.reader(uuid2label_file)
            for r in reader:
                # TODO: This will need to be updated to handle multiple
                #       labels per descriptor.
                yield r[0], r[1]

    def get_descr(r):
        """ Fetch descriptors from configured index """
        uuid, truth_label = r
        return truth_label, descriptor_index.get_descriptor(uuid)

    tlabel_element_iter = parallel.parallel_map(
        get_descr, iter_uuid_label(),
        name="cmv_get_descriptors",
        use_multiprocessing=True,
        cores=config['parallelism']['descriptor_fetch_cores'],
    )

    # Map of truth labels to descriptors of labeled data
    #: :type: dict[str, list[smqtk.representation.DescriptorElement]]
    tlabel2descriptors = {}
    for tlabel, d in tlabel_element_iter:
        tlabel2descriptors.setdefault(tlabel, []).append(d)

    # Train classifier if training was enabled. Only supervised classifiers
    # can be trained; anything else is a configuration error.
    if do_train:
        if not isinstance(classifier, SupervisedClassifier):
            # Previously this exception was built but never raised, letting
            # execution silently continue into the validation steps.
            raise ValueError("Configured classifier is not a "
                             "SupervisedClassifier type and does not "
                             "support training.")
        log.info("Training classifier model")
        classifier.train(tlabel2descriptors)
        exit(0)

    #
    # Apply classifier to descriptors for predictions
    #

    # Truth label to predicted classification results
    #: :type: dict[str, set[smqtk.representation.ClassificationElement]]
    tlabel2classifications = {}
    # ``items()`` rather than the Python-2-only ``iteritems()``.
    for tlabel, descriptors in tlabel2descriptors.items():
        tlabel2classifications[tlabel] = \
            set(classifier.classify_async(
                descriptors, classification_factory,
                use_multiprocessing=True,
                procs=config['parallelism']['classification_cores'],
                ri=1.0,
            ).values())
    log.info("Truth label counts:")
    for l in sorted(tlabel2classifications):
        log.info("  %s :: %d", l, len(tlabel2classifications[l]))

    #
    # Confusion Matrix
    #
    conf_mat, labels = gen_confusion_matrix(tlabel2classifications)
    log.info("Confusion_matrix")
    log_cm(log.info, conf_mat, labels)
    if plot_filepath_cm:
        plot_cm(conf_mat, labels, plot_filepath_cm)

    # CM of descriptor UUIDs to output json
    if output_uuid_cm:
        # Top dictionary keys are true labels, inner dictionary keys are UUID
        # predicted labels.
        log.info("Computing UUID Confusion Matrix")
        #: :type: dict[str, dict[str, set | list]]
        uuid_cm = {}
        for tlabel in tlabel2classifications:
            uuid_cm[tlabel] = collections.defaultdict(set)
            for c in tlabel2classifications[tlabel]:
                uuid_cm[tlabel][c.max_label()].add(c.uuid)
            # convert sets to lists (sets are not JSON serializable)
            for plabel in uuid_cm[tlabel]:
                uuid_cm[tlabel][plabel] = list(uuid_cm[tlabel][plabel])
        with open(output_uuid_cm, 'w') as f:
            log.info("Saving UUID Confusion Matrix: %s", output_uuid_cm)
            json.dump(uuid_cm, f, indent=2, separators=(',', ': '))

    #
    # Create PR/ROC curves via scikit learn tools
    #
    if plot_filepath_pr:
        log.info("Making PR curve")
        make_pr_curves(tlabel2classifications, plot_filepath_pr,
                       plot_ci, plot_ci_alpha)
    if plot_filepath_roc:
        log.info("Making ROC curve")
        make_roc_curves(tlabel2classifications, plot_filepath_roc,
                        plot_ci, plot_ci_alpha)
def mb_kmeans_build_apply(index, mbkm, initial_fit_size):
    """
    Fit a MiniBatchKMeans model on the descriptors of ``index`` and then
    assign every descriptor in the index to a cluster of the fitted model.

    Index keys are sorted, then shuffled after seeding numpy's global RNG
    with ``mbkm.random_state``. Keys are accumulated into a buffer: when
    ``initial_fit_size`` is truthy, the first ``initial_fit_size`` keys are
    used for one ``mbkm.fit`` call; afterwards (or from the start when
    ``initial_fit_size`` is falsy) every ``mbkm.batch_size`` keys trigger a
    ``mbkm.partial_fit`` call. Keys left in the buffer at the end are fed to
    a last ``partial_fit``.

    If the index yields no keys and no descriptors, no fit or predict call is
    made and the returned mapping is empty.

    Side effects: seeds ``numpy.random`` and sets ``mbkm.verbose = False``.

    :param index: Index of descriptors
    :type index: smqtk.representation.DescriptorIndex

    :param mbkm: Scikit-Learn MiniBatchKMeans instance to train and then use
        for prediction
    :type mbkm: sklearn.cluster.MiniBatchKMeans

    :param initial_fit_size: Number of descriptors to run the initial
        ``fit`` with. A falsy value skips the initial fit.
    :type initial_fit_size: int

    :return: Mapping of cluster label to the set of descriptor UUIDs
        predicted to belong to that cluster (a ``defaultdict(set)``).
    :rtype: dict[int, set[collections.Hashable]]

    """
    log = logging.getLogger(__name__)

    log.info("Getting index keys (shuffled)")
    shuffled_keys = sorted(six.iterkeys(index))
    numpy.random.seed(mbkm.random_state)
    numpy.random.shuffle(shuffled_keys)

    # Keys waiting to be consumed by the next fit call.
    pending_keys = collections.deque()
    initial_fit_done = False

    def fit_on_pending(header_fmt, initial=False):
        """
        Fetch the vectors for all pending keys, feed them to the model
        (``fit`` when ``initial`` else ``partial_fit``) and empty the buffer.
        """
        log.info(header_fmt, len(pending_keys))
        log.info("- collecting vectors")
        vector_iter = parallel.parallel_map(
            lambda descr: descr.vector(),
            index.get_many_descriptors(pending_keys),
            use_multiprocessing=False)
        batch = numpy.array(list(vector_iter))
        log.info("- fitting model")
        if initial:
            mbkm.fit(batch)
        else:
            mbkm.partial_fit(batch)
        log.info("- cleaning")
        pending_keys.clear()

    log.info("Collecting iteratively fitting model")
    progress_state = [0] * 7
    for key in shuffled_keys:
        pending_keys.append(key)
        bin_utils.report_progress(log.debug, progress_state, 1.)

        # Decide which threshold applies to the buffer right now.
        if initial_fit_size and not initial_fit_done:
            threshold, is_initial = initial_fit_size, True
        else:
            threshold, is_initial = mbkm.batch_size, False

        if len(pending_keys) != threshold:
            continue

        if is_initial:
            fit_on_pending("Initial fit using %d descriptors", initial=True)
            initial_fit_done = True
        else:
            fit_on_pending("Partial fit with batch size %d")

    # Whatever did not fill a whole batch still contributes to the model.
    if pending_keys:
        fit_on_pending("Final partial fit of size %d")

    log.info("Computing descriptor classes with final KMeans model")
    mbkm.verbose = False
    clusters = collections.defaultdict(set)
    uuid_vector_iter = parallel.parallel_map(
        lambda descr: (descr.uuid(), descr.vector()),
        index,
        use_multiprocessing=False,
        name="uv-collector")
    # TODO: Batch predict call inputs to something larger than one at a time.
    uuid_cluster_iter = parallel.parallel_map(
        lambda pair: (pair[0],
                      mbkm.predict(pair[1][numpy.newaxis, :])[0]),
        uuid_vector_iter,
        use_multiprocessing=False,
        name="uc-collector")

    progress_state = [0] * 7
    for descr_uuid, cluster_label in uuid_cluster_iter:
        clusters[cluster_label].add(descr_uuid)
        bin_utils.report_progress(log.debug, progress_state, 1.)
    # NOTE(review): this adjustment followed by a zero-interval call looks
    # like it forces one last progress report -- confirm against
    # ``bin_utils.report_progress``.
    progress_state[1] -= 1
    bin_utils.report_progress(log.debug, progress_state, 0)

    return clusters
def _classify_arrays(self, array_iter):
    """
    Classify each input vector with the loaded libSVM model.

    All vectors from ``array_iter`` are collected into one numpy array and
    passed through ``self._norm_vector`` before prediction. Every result is
    a dictionary keyed by the values of ``self.svm_label_map``:

    - probability model: each label the model reports is mapped to its
      estimated probability, all other labels to ``0.``;
    - otherwise: the predicted label is mapped to ``1.`` and all other
      labels to ``0.``.

    Prediction runs serially (as a lazy generator) when the effective job
    count is 1, and through ``parallel_map`` with multiprocessing otherwise.
    The job count is ``self.n_jobs`` capped at the number of input vectors;
    ``None`` is forwarded to ``parallel_map`` unchanged.

    :param array_iter: Iterable of vectors to classify. Each row of the
        normalized array must provide ``tolist()``.

    :raises RuntimeError: ``self.has_model()`` reports no model.

    :return: Iterable of label-to-value dictionaries, one per input vector.

    """
    if not self.has_model():
        raise RuntimeError("No SVM model present for classification")
    assert self.svm_model is not None, (
        "Should have an SVM model at this point."
    )

    # Stack the inputs into a single matrix for normalization and
    # prediction.
    vec_mat = self._norm_vector(numpy.array(list(array_iter)))

    n_jobs = self.n_jobs
    if n_jobs is not None:
        n_jobs = min(len(vec_mat), n_jobs)
    # A ``None`` job count is handed to parallel_map as-is.

    svm_label_map = self.svm_label_map
    # Template result: every known label starts at zero.
    zero_scores = dict.fromkeys(svm_label_map.values(), 0.)

    # What follows mirrors the body of svmutil.svm_predict so that we avoid
    # its excessive prints.
    svm_type = self.svm_model.get_svm_type()
    nr_class = self.svm_model.get_nr_class()
    # Model-internal labels, parallel to the probability estimates array.
    svm_model_labels = self.svm_model.get_labels()

    # TODO: Normalize input arrays in batch(es). TEST if current norm
    #       function can just take a matrix?
    if self.svm_model.is_probability_model():
        # noinspection PyUnresolvedReferences
        if svm_type in (svm.NU_SVR, svm.EPSILON_SVR):
            nr_class = 0

        def predict_one(vec):
            estimates = (ctypes.c_double * nr_class)()
            nodes, _ = svm.gen_svm_nodearray(vec.tolist())
            svm.libsvm.svm_predict_probability(self.svm_model, nodes,
                                               estimates)
            scores = dict(zero_scores)  # shallow copy of the template
            for model_label, prob in zip(svm_model_labels,
                                         estimates[:nr_class]):
                scores[svm_label_map[model_label]] = prob
            return scores
    else:
        # noinspection PyUnresolvedReferences
        if svm_type in (svm.ONE_CLASS, svm.EPSILON_SVR, svm.NU_SVC):
            nr_classifier = 1
        else:
            nr_classifier = nr_class * (nr_class - 1) // 2

        def predict_one(vec):
            decision_values = (ctypes.c_double * nr_classifier)()
            nodes, _ = svm.gen_svm_nodearray(vec.tolist())
            predicted = svm.libsvm.svm_predict_values(self.svm_model, nodes,
                                                      decision_values)
            scores = dict(zero_scores)  # shallow copy of the template
            scores[svm_label_map[predicted]] = 1.
            return scores

    # A single job does not need worker processes: stay serial and lazy.
    if n_jobs == 1:
        return (predict_one(vec) for vec in vec_mat)
    return parallel_map(predict_one, vec_mat,
                        cores=n_jobs,
                        use_multiprocessing=True)
def test_simple_ordered_threaded(self):
    """
    Ordered, thread-backed ``parallel_map`` over the fixture string must
    produce exactly the fixture's expected sequence.
    """
    # ``ordered=True`` is the point of this test: results have to come back
    # in the same order as the input.
    result_iter = parallel_map(
        self.test_func,
        self.test_string,
        ordered=True,
        use_multiprocessing=False,
    )
    observed = list(result_iter)
    self.assertEqual(observed, self.expected)
def compute_hash_codes(uuids, index, functor, report_interval=1.0,
                       use_mp=False, ordered=False):
    """
    Generate ``(UUID, hash-int)`` pairs for the given descriptor UUIDs.

    For every UUID, the descriptor is fetched from ``index``, its vector is
    hashed with ``functor.get_hash`` and the resulting bit vector is
    converted to an integer. The per-UUID work is dispatched through
    ``parallel.parallel_map``.

    :param uuids: Sequence of UUIDs to process
    :type uuids: collections.Iterable[collections.Hashable]

    :param index: Descriptor index to pull from.
    :type index: smqtk.representation.descriptor_index.DescriptorIndex

    :param functor: LSH hash code functor instance
    :type functor: smqtk.algorithms.LshFunctor

    :param report_interval: Interval in seconds handed to the progress
        reporter. Progress reporting is disabled when the logger's effective
        level is above DEBUG or when this value is not greater than 0.
    :type report_interval: float

    :param use_mp: Forwarded to ``parallel_map`` as ``use_multiprocessing``.
        NOTE (carried over from the previous documentation, not verifiable
        from this function alone): multiprocessing copies currently loaded
        objects, e.g. the given index, onto worker processes, which could
        lead to dangerously high RAM consumption.
    :type use_mp: bool

    :param ordered: Forwarded to ``parallel_map`` as ``ordered``; request
        that pairs are yielded in the order of the input UUIDs.
    :type ordered: bool

    :return: Generator yielding ``(UUID, int)`` pairs.

    """
    # TODO: parallel map fetch elements from index?
    #       -> separately from compute

    def uuid_to_hash(descr_uuid):
        vec = index.get_descriptor(descr_uuid).vector()
        hash_int = bits.bit_vector_to_int_large(functor.get_hash(vec))
        return descr_uuid, hash_int

    # Pick the callable the progress reporter will write through.
    log = logging.getLogger(__name__)
    reporting_off = (log.getEffectiveLevel() > logging.DEBUG
                     or report_interval <= 0)
    if not reporting_off:
        log.debug("Logging progress at %f second intervals", report_interval)
        log_func = log.debug
    else:
        def log_func(*_, **__):
            # Deliberate no-op sink for progress messages.
            return
        log.debug("Not logging progress")

    log.debug("Starting computation")
    reporter = cli.ProgressReporter(log_func, report_interval)
    reporter.start()

    pair_iter = parallel.parallel_map(uuid_to_hash, uuids,
                                      ordered=ordered,
                                      use_multiprocessing=use_mp)
    for descr_uuid, hash_int in pair_iter:
        yield (descr_uuid, hash_int)
        # Count the pair only after the consumer has taken it.
        reporter.increment_report()

    # Final report once the input is exhausted.
    reporter.report()
def test_simple_unordered_multiprocess(self):
    """
    Unordered, process-backed ``parallel_map`` must produce the same set of
    values as the fixture's expected results.
    """
    result_iter = parallel_map(
        self.test_func,
        self.test_string,
        ordered=False,
        use_multiprocessing=True,
    )
    # Order is not guaranteed here, so only set membership is compared.
    observed = set(result_iter)
    self.assertEqual(observed, set(self.expected))
def _generate_arrays(self, data_iter):
    """
    Generate descriptor vectors for the given data elements.

    Elements are pre-processed into image arrays by
    ``_process_load_img_array`` through an ordered ``parallel_map`` using
    ``self.threads`` cores. The arrays are then consumed in groups of up to
    ``self.batch_size``: each group is loaded into ``self.data``, the network
    is run via ``self.network.test`` and every entry of the result's
    ``'probability_list'`` is yielded. Entries with more than one dimension
    are flattened with ``numpy.ravel`` first.

    Pre-conditions (per the original documentation):
      - Data elements input to this method have been validated to be of at
        least one of this class's reported ``valid_content_types``.

    :param collections.Iterable[DataElement] data_iter:
        Iterable of data element instances to be described.

    :raises RuntimeError: Documented by the original as signalling a
        descriptor extraction failure; nothing in this method raises it
        directly, so it would have to come from a collaborator.

    :return: Iterable of numpy arrays in parallel association with the
        input data elements.
    :rtype: collections.Iterable[numpy.ndarray]

    """
    log_debug = self._log.debug

    # Pre-process imagery in parallel before aggregating for network
    # execution. Each work item bundles one data element with the shared
    # loading parameters.
    # TODO: update ``buffer_factor`` param to account for batch size?
    work_items = zip(
        data_iter,
        itertools.repeat(self.transformer),
        itertools.repeat(self.data_layer),
        itertools.repeat(self.load_truncated_images),
        itertools.repeat(self.pixel_rescale),
    )
    img_array_iter = parallel_map(_process_load_img_array, work_items,
                                  ordered=True, cores=self.threads)

    batch_i = 0
    while True:
        # Pull the next group of pre-processed arrays; an empty group means
        # the input is exhausted.
        #: :type: list[numpy.ndarray]
        batch_img_arrays = list(
            itertools.islice(img_array_iter, self.batch_size)
        )
        if not batch_img_arrays:
            break

        log_debug("Batch {} - size {}".format(batch_i,
                                              len(batch_img_arrays)))

        log_debug("Loading image numpy array into KWCNN Data object")
        self.data.set_data_list(batch_img_arrays, quiet=True)

        log_debug("Performing forward inference using KWCNN Network")
        test_results = self.network.test(quiet=True)

        for vec in test_results['probability_list']:
            if vec.ndim > 1:
                # In case kwcnn generates multidimensional array
                # (rows, 1, 1)
                log_debug("- Raveling output array of shape {}".format(
                    vec.shape))
                vec = numpy.ravel(vec)
            yield vec

        batch_i += 1
def test_simple_ordered_multiprocess(self):
    """
    Ordered, process-backed ``parallel_map`` over the fixture string must
    produce exactly the fixture's expected sequence.
    """
    # ``ordered=True``: results must come back in input order even though
    # the work is spread across worker processes.
    r = list(parallel_map(self.test_func, self.test_string,
                          ordered=True, use_multiprocessing=True))
    # Assert through the TestCase itself rather than the unmaintained
    # ``nose.tools`` helper, matching the other tests of this fixture.
    self.assertEqual(r, self.expected)