def build_index(self, descriptors):
    """
    Rebuild this index from scratch over the given descriptor elements.

    The descriptor index and the hash-to-UUIDs key-value store are cleared
    and repopulated. If a hash index is configured, it is rebuilt from the
    hash codes generated here.

    :raises ReadOnlyError: This instance is in read-only mode.
    :raises ValueError: Documented upstream as "no data available in the
        given iterable". NOTE(review): not raised directly in this method;
        presumably raised by one of the underlying indices -- confirm.

    :param descriptors: Iterable of descriptor elements to build index
        over.
    :type descriptors:
        collections.Iterable[smqtk.representation.DescriptorElement]

    """
    if self.read_only:
        raise ReadOnlyError("Cannot modify container attributes due to "
                            "being in read-only mode.")

    self._log.debug("Clearing and adding new descriptor elements")
    self.descriptor_index.clear()
    self.descriptor_index.add_many_descriptors(descriptors)

    self._log.debug("Generating hash codes")
    progress = [0] * 7
    bit_vectors = collections.deque()
    kvstore = self.hash2uuids_kvstore
    kvstore.clear()
    for element in self.descriptor_index:
        code = self.lsh_functor.get_hash(element.vector())
        bit_vectors.append(code)

        # Fetch the UUID set stored for this hash (or a new one), add this
        # element's UUID, and write the set back.
        key = bit_vector_to_int_large(code)
        #: :type: set
        uuids = kvstore.get(key, set())
        uuids.add(element.uuid())
        kvstore.add(key, uuids)

        report_progress(self._log.debug, progress, 1.0)
    # Final progress report.
    progress[1] -= 1
    report_progress(self._log.debug, progress, 0)

    if self.hash_index is None:
        return
    self._log.debug("Clearing and building hash index of type %s",
                    type(self.hash_index))
    # A build is expected to replace any previous state of the hash index.
    self.hash_index.build_index(bit_vectors)
def main():
    """
    Query the Solr index for image paths and write them, one per line, to
    the output file given on the command line.

    :raises ValueError: No output paths file was specified.
    """
    description = """
    Utility for fetching remotely stored image paths from the JPL Solr index.
    Files will be transferred with their entire containing directories. For
    example, if the file was stored in "/data/things/image.png" remotely, it
    will be transferred locally to "<output_dir>/data/things/image.png".

    Assumptions:
        - JPL MEMEX Solr index key structure
            - `id` == "file:<abs-filepath>"
            - `mainType` is the first component of the MIMETYPE
            - `indexedAt` timestamp
    """
    args, config = bin_utils.utility_main_helper(default_config,
                                                 description,
                                                 extend_parser)
    log = logging.getLogger(__name__)

    out_path = args.paths_file
    time_lower = args.after_time
    time_upper = args.before_time

    # Validate the output location and make sure its directory exists.
    if out_path is None:
        raise ValueError("Need a file path to to output transferred file "
                         "paths!")
    file_utils.safe_create_dir(os.path.dirname(out_path))

    # Set up the path collection; an unset time bound becomes '*'.
    solr_address = config['solr_address']
    time_lower = time_lower or '*'
    time_upper = time_upper or '*'
    solr_username = config['solr_username']
    solr_password = config['solr_password']
    solr_batch_size = config['batch_size']
    remote_paths = solr_image_paths(solr_address, time_lower, time_upper,
                                    solr_username, solr_password,
                                    solr_batch_size)

    log.info("Writing file paths")
    progress = [0] * 7
    with open(out_path, 'w') as out_file:
        for remote_path in remote_paths:
            out_file.write(remote_path + '\n')
            bin_utils.report_progress(log.info, progress, 1.)

    # Final report
    progress[1] -= 1
    bin_utils.report_progress(log.info, progress, 0)
def main():
    """
    Query the Solr index for image paths and write them, one per line, to
    the output file given on the command line.

    :raises ValueError: No output paths file was specified.
    """
    # NOTE(review): ``description`` is not used below in this block.
    description = """
    Utility for fetching remotely stored image paths from the JPL Solr index.
    Files will be transferred with their entire containing directories. For
    example, if the file was stored in "/data/things/image.png" remotely, it
    will be transferred locally to "<output_dir>/data/things/image.png".

    Assumptions:
        - JPL MEMEX Solr index key structure
            - `id` == "file:<abs-filepath>"
            - `mainType` is the first component of the MIMETYPE
            - `indexedAt` timestamp
    """
    args = cli_parser().parse_args()
    config = bin_utils.utility_main_helper(default_config, args)
    log = logging.getLogger(__name__)

    out_path = args.paths_file
    time_lower = args.after_time
    time_upper = args.before_time

    # Validate the output location and make sure its directory exists.
    if out_path is None:
        raise ValueError("Need a file path to to output transferred file "
                         "paths!")
    file_utils.safe_create_dir(os.path.dirname(out_path))

    # Set up the path collection; an unset time bound becomes '*'.
    solr_address = config['solr_address']
    time_lower = time_lower or '*'
    time_upper = time_upper or '*'
    solr_username = config['solr_username']
    solr_password = config['solr_password']
    solr_batch_size = config['batch_size']
    remote_paths = solr_image_paths(solr_address, time_lower, time_upper,
                                    solr_username, solr_password,
                                    solr_batch_size)

    log.info("Writing file paths")
    progress = [0] * 7
    with open(out_path, 'w') as out_file:
        for remote_path in remote_paths:
            out_file.write(remote_path + '\n')
            bin_utils.report_progress(log.info, progress, 1.)

    # Final report
    progress[1] -= 1
    bin_utils.report_progress(log.info, progress, 0)
def fit(self, descriptors, use_multiprocessing=True):
    """
    Fit the random-projection model to the given descriptors and return
    their hash codes.

    A random projection matrix of shape ``(dim, self.bit_length)`` is drawn
    after seeding numpy's RNG with ``self.random_seed`` and is stored on
    ``self.rps``.

    :param descriptors: Iterable of ``DescriptorElement`` vectors to fit
        the model to.
    :type descriptors:
        collections.Iterable[smqtk.representation.DescriptorElement]

    :param use_multiprocessing: If multiprocessing should be used, as
        opposed to threading, for collecting descriptor vectors from the
        provided iterable.
    :type use_multiprocessing: bool

    :raises RuntimeError: There is already a model loaded

    :return: Result of ``self.get_hash`` on the descriptor matrix
        (documented upstream as hash codes for the descriptors, in order).
    :rtype: numpy.ndarray[bool]

    """
    if self.has_model():
        raise RuntimeError("Model components have already been loaded.")

    # Only report progress periodically when debug logging is active.
    debug_enabled = self.get_logger().getEffectiveLevel() <= logging.DEBUG
    dbg_report_interval = 1.0 if debug_enabled else None  # seconds

    if not hasattr(descriptors, "__len__"):
        self._log.info("Creating sequence from iterable")
        collected = []
        progress = [0] * 7
        for element in descriptors:
            collected.append(element)
            report_progress(self._log.debug, progress, dbg_report_interval)
        descriptors = collected

    self._log.info("Creating matrix of descriptors for fitting")
    x = elements_to_matrix(descriptors,
                           report_interval=dbg_report_interval,
                           use_multiprocessing=use_multiprocessing)
    self._log.debug("descriptor matrix shape: %s", x.shape)
    _, dim = x.shape

    self._log.debug("Generating random projections")
    np.random.seed(self.random_seed)
    self.rps = np.random.randn(dim, self.bit_length)

    self._log.debug("Info normalizing descriptors with norm type: %s",
                    self.normalize)
    return self.get_hash(x)
def main():
    """
    Add files found under the requested Girder folders, items and files to
    the configured data set, inserting in batches.
    """
    args = cli_parser().parse_args()
    config = bin_utils.utility_main_helper(default_config, args)
    log = logging.getLogger(__name__)

    tool_config = config['tool']
    api_root = tool_config['girder_api_root']
    api_key = tool_config['api_key']
    api_query_batch = tool_config['api_query_batch']
    insert_batch_size = tool_config['dataset_insert_batch_size']

    def extend_from_list_file(id_list, list_path):
        """ Append the stripped lines of ``list_path``, if given. """
        if list_path:
            with open(list_path) as list_file:
                id_list.extend([line.strip() for line in list_file])

    # IDs given directly on the command line, extended by any IDs listed in
    # the referenced list files.
    #: :type: list[str]
    ids_folder = args.folder
    #: :type: list[str]
    ids_item = args.item
    #: :type: list[str]
    ids_file = args.file
    extend_from_list_file(ids_folder, args.folder_list)
    extend_from_list_file(ids_item, args.item_list)
    extend_from_list_file(ids_file, args.file_list)

    #: :type: smqtk.representation.DataSet
    data_set = plugin.from_plugin_config(config['plugins']['data_set'],
                                         get_data_set_impls())

    pending = collections.deque()
    progress = [0] * 7
    girder_elements = find_girder_files(api_root, ids_folder, ids_item,
                                        ids_file, api_key, api_query_batch)
    for element in girder_elements:
        pending.append(element)
        # Flush once a full batch is collected (batching is disabled when
        # the configured size is falsy).
        if insert_batch_size and len(pending) >= insert_batch_size:
            data_set.add_data(*pending)
            pending.clear()
        bin_utils.report_progress(log.info, progress, 1.0)

    # Insert whatever is left over.
    if pending:
        data_set.add_data(*pending)

    log.info('Done')
def main():
    """
    Add files found under the requested Girder folders, items and files to
    the configured data set, inserting in batches.
    """
    args = cli_parser().parse_args()
    config = bin_utils.utility_main_helper(default_config, args)
    log = logging.getLogger(__name__)

    tool_config = config['tool']
    api_root = tool_config['girder_api_root']
    api_key = tool_config['api_key']
    api_query_batch = tool_config['api_query_batch']
    insert_batch_size = tool_config['dataset_insert_batch_size']

    def extend_from_list_file(id_list, list_path):
        """ Append the stripped lines of ``list_path``, if given. """
        if list_path:
            with open(list_path) as list_file:
                id_list.extend([line.strip() for line in list_file])

    # IDs given directly on the command line, extended by any IDs listed in
    # the referenced list files.
    #: :type: list[str]
    ids_folder = args.folder
    #: :type: list[str]
    ids_item = args.item
    #: :type: list[str]
    ids_file = args.file
    extend_from_list_file(ids_folder, args.folder_list)
    extend_from_list_file(ids_item, args.item_list)
    extend_from_list_file(ids_file, args.file_list)

    #: :type: smqtk.representation.DataSet
    data_set = plugin.from_plugin_config(config['plugins']['data_set'],
                                         get_data_set_impls())

    pending = collections.deque()
    progress = [0] * 7
    girder_elements = find_girder_files(api_root, ids_folder, ids_item,
                                        ids_file, api_key, api_query_batch)
    for element in girder_elements:
        pending.append(element)
        # Flush once a full batch is collected (batching is disabled when
        # the configured size is falsy).
        if insert_batch_size and len(pending) >= insert_batch_size:
            data_set.add_data(*pending)
            pending.clear()
        bin_utils.report_progress(log.info, progress, 1.0)

    # Insert whatever is left over.
    if pending:
        data_set.add_data(*pending)

    log.info('Done')
def main():
    """
    Build a ball-tree hash index from a pickled hash-to-UUIDs table.

    The keys of the loaded table are converted to bit vectors of length
    ``bit_len`` and used to build a ``SkLearnBallTreeHashIndex`` created
    with the given model file path.

    :raises AssertionError: The hash2uuids path is not a file, or the
        directory of the ball tree model path does not exist.
    """
    args = cli_parser().parse_args()

    initialize_logging(logging.getLogger('smqtk'), logging.DEBUG)
    initialize_logging(logging.getLogger('__main__'), logging.DEBUG)
    log = logging.getLogger(__name__)

    hash2uuids_fp = os.path.abspath(args.hash2uuids_fp)
    bit_len = args.bit_len
    leaf_size = args.leaf_size
    rand_seed = args.rand_seed
    balltree_model_fp = os.path.abspath(args.balltree_model_fp)

    assert os.path.isfile(hash2uuids_fp), "Bad path: '%s'" % hash2uuids_fp
    assert os.path.isdir(os.path.dirname(balltree_model_fp)), \
        "Bad path: %s" % balltree_model_fp

    log.debug("hash2uuids_fp    : %s", hash2uuids_fp)
    log.debug("bit_len          : %d", bit_len)
    log.debug("leaf_size        : %d", leaf_size)
    log.debug("rand_seed        : %d", rand_seed)
    log.debug("balltree_model_fp: %s", balltree_model_fp)

    log.info("Loading hash2uuids table")
    # Pickle data is binary: open in 'rb' so binary protocols are not
    # corrupted by text-mode newline translation (and so this also works on
    # interpreters where text-mode files yield ``str`` rather than bytes).
    # NOTE(review): unpickling executes arbitrary code; only point this at
    # trusted files.
    with open(hash2uuids_fp, 'rb') as f:
        hash2uuids = cPickle.load(f)

    log.info("Computing hash-code vectors")
    # Explicit loop (rather than a comprehension) so progress can be
    # reported while converting.
    hash_vectors = []
    rs = [0] * 7
    for h in hash2uuids:
        hash_vectors.append(int_to_bit_vector_large(h, bit_len))
        report_progress(log.debug, rs, 1.)

    log.info("Initializing ball tree")
    btree = SkLearnBallTreeHashIndex(balltree_model_fp, leaf_size, rand_seed)

    log.info("Building ball tree")
    btree.build_index(hash_vectors)
def fit(self, descriptors):
    """
    Fit the ITQ model to the given descriptors.

    Sets ``self.mean_vec`` and ``self.rotation`` and then calls
    ``self.save_model()``.

    :param descriptors: Iterable of ``DescriptorElement`` vectors to fit
        the model to.
    :type descriptors:
        collections.Iterable[smqtk.representation.DescriptorElement]

    :raises RuntimeError: There is already a model loaded

    :return: First value returned by ``self._find_itq_rotation``
        (documented upstream as the matrix of hash codes for the provided
        descriptors, in order).
    :rtype: numpy.ndarray[bool]

    """
    if self.has_model():
        raise RuntimeError("Model components have already been loaded.")

    # Only report progress periodically when debug logging is active.
    debug_enabled = self.logger().getEffectiveLevel() <= logging.DEBUG
    dbg_report_interval = 1.0 if debug_enabled else None  # seconds

    if not hasattr(descriptors, "__len__"):
        self._log.info("Creating sequence from iterable")
        collected = []
        progress = [0] * 7
        for element in descriptors:
            collected.append(element)
            report_progress(self._log.debug, progress, dbg_report_interval)
        descriptors = collected

    self._log.info("Creating matrix of descriptors for fitting")
    x = elements_to_matrix(descriptors,
                           report_interval=dbg_report_interval)
    self._log.debug("descriptor matrix shape: %s", x.shape)

    self._log.debug("Info normalizing descriptors by factor: %s",
                    self.normalize)
    x = self._norm_vector(x)

    self._log.info("Centering data")
    self.mean_vec = numpy.mean(x, axis=0)
    x -= self.mean_vec

    self._log.info("Computing PCA transformation")
    # ``numpy.cov`` treats rows as variables, hence the transpose of the
    # row-per-descriptor matrix.
    self._log.debug("-- computing covariance")
    covariance = numpy.cov(x.transpose())

    # Direct translation from UNC matlab code
    # - eigen vectors are the columns of ``eig_vectors``
    self._log.debug('-- computing linalg.eig')
    eig_values, eig_vectors = numpy.linalg.eig(covariance)

    # Pair each eigenvalue with its eigenvector, order by descending
    # eigenvalue and keep the first ``bit_length`` pairs.
    # (An SVD-based alternative was previously noted here but is not used.)
    self._log.debug('-- computing top pairs')
    ordered_pairs = sorted(zip(eig_values, eig_vectors.transpose()),
                           key=lambda pair: pair[0],
                           reverse=True)
    top_pairs = ordered_pairs[:self.bit_length]

    # Stack the kept eigenvectors back into columns.
    self._log.debug("-- top vector extraction")
    pc_top = numpy.array([pair[1] for pair in top_pairs]).transpose()
    self._log.debug("-- transform centered data by PC matrix")
    projected = numpy.dot(x, pc_top)

    self._log.info("Performing ITQ to find optimal rotation")
    codes, self.rotation = self._find_itq_rotation(projected,
                                                   self.itq_iterations)
    # De-adjust rotation with PC vector
    self.rotation = numpy.dot(pc_top, self.rotation)

    self.save_model()

    return codes
def mb_kmeans_build_apply(index, mbkm, initial_fit_size):
    """
    Fit the given MiniBatchKMeans model on the descriptors in the given
    index, then predict a cluster for every descriptor with the final model.

    Keys of the index are sorted and then shuffled after seeding numpy's RNG
    with ``mbkm.random_state``. When ``initial_fit_size`` is truthy, the
    first ``initial_fit_size`` keys are used for a full ``fit``; afterwards
    keys are consumed in groups of ``mbkm.batch_size`` with ``partial_fit``,
    with a final ``partial_fit`` on any remainder.

    :param index: Index of descriptors
    :type index: smqtk.representation.DescriptorIndex

    :param mbkm: Scikit-Learn MiniBatchKMeans instance to train and then
        use for prediction
    :type mbkm: sklearn.cluster.MiniBatchKMeans

    :param initial_fit_size: Number of descriptors to run an initial fit
        with. This brings the advantage of choosing a best initialization
        point from multiple.
    :type initial_fit_size: int

    :return: Dictionary of the cluster label (integer) to the set of
        descriptor UUIDs belonging to that cluster.
    :rtype: dict[int, set[collections.Hashable]]

    """
    log = logging.getLogger(__name__)

    log.info("Getting index keys (shuffled)")
    index_keys = sorted(six.iterkeys(index))
    numpy.random.seed(mbkm.random_state)
    numpy.random.shuffle(index_keys)

    def get_vectors(k_iter):
        """
        Get a 2D numpy array of the vectors of the descriptors for the
        given keys (order of rows is not cared about).
        """
        elements = index.get_many_descriptors(k_iter)
        vector_iter = parallel.parallel_map(lambda d: d.vector(), elements,
                                            use_multiprocessing=False)
        return numpy.array(list(vector_iter))

    pending = collections.deque()

    def fit_pending(fit_func):
        """
        Fit the pending keys' vectors using ``fit_func``, then clear the
        pending keys. Returns the number of keys that were pending.
        """
        log.info("- collecting vectors")
        vectors = get_vectors(pending)
        log.info("- fitting model")
        fit_func(vectors)
        log.info("- cleaning")
        count = len(pending)
        pending.clear()
        return count

    log.info("Collecting iteratively fitting model")
    ifit_completed = False
    d_fitted = 0
    rps = [0] * 7
    for key in index_keys:
        pending.append(key)
        bin_utils.report_progress(log.debug, rps, 1.)

        if initial_fit_size and not ifit_completed:
            # Still accumulating keys for the initial full fit.
            if len(pending) == initial_fit_size:
                log.info("Initial fit using %d descriptors", len(pending))
                d_fitted += fit_pending(mbkm.fit)
                ifit_completed = True
        elif len(pending) == mbkm.batch_size:
            log.info("Partial fit with batch size %d", len(pending))
            d_fitted += fit_pending(mbkm.partial_fit)

    # Final fit with any remaining descriptors
    if pending:
        log.info("Final partial fit of size %d", len(pending))
        d_fitted += fit_pending(mbkm.partial_fit)

    log.info("Computing descriptor classes with final KMeans model")
    mbkm.verbose = False

    def uuid_and_vector(d):
        return d.uuid(), d.vector()

    def uuid_and_class(u_v):
        # ``predict`` takes a 2D array, so lift the single vector to one row.
        return u_v[0], mbkm.predict(u_v[1][numpy.newaxis, :])[0]

    d_uv_iter = parallel.parallel_map(uuid_and_vector, index,
                                      use_multiprocessing=False,
                                      name="uv-collector")
    # TODO: Batch predict call inputs to something larger than one at a time.
    d_uc_iter = parallel.parallel_map(uuid_and_class, d_uv_iter,
                                      use_multiprocessing=False,
                                      name="uc-collector")

    d_classes = collections.defaultdict(set)
    rps = [0] * 7
    for uuid, label in d_uc_iter:
        d_classes[label].add(uuid)
        bin_utils.report_progress(log.debug, rps, 1.)
    # Final progress report.
    rps[1] -= 1
    bin_utils.report_progress(log.debug, rps, 0)

    return d_classes
def main():
    """
    Classify the descriptors named in a UUID list file and write the results
    to a CSV data file, with column labels written to a separate header
    file.

    Processing is a two-stage pipeline of parallel maps:
      - UUIDs are read from file and mapped to descriptors in the index
      - descriptors are mapped to classification elements

    :raises ValueError: The UUID list file was not given or is not a file,
        or either output path was not given.
    """
    args = cli_parser().parse_args()
    config = bin_utils.utility_main_helper(default_config, args)
    log = logging.getLogger(__name__)

    uuids_list_filepath = args.uuids_list
    output_csv_filepath = args.csv_data
    output_csv_header_filepath = args.csv_header

    utility_config = config['utility']
    classify_overwrite = utility_config['classify_overwrite']
    parallel_config = utility_config['parallel']
    p_use_multiprocessing = parallel_config['use_multiprocessing']
    p_index_extraction_cores = parallel_config['index_extraction_cores']
    p_classification_cores = parallel_config['classification_cores']

    if not uuids_list_filepath:
        raise ValueError("No uuids_list_filepath specified.")
    if not os.path.isfile(uuids_list_filepath):
        raise ValueError("Given uuids_list_filepath did not point to a file.")
    if output_csv_header_filepath is None:
        raise ValueError("Need a path to save CSV header labels")
    if output_csv_filepath is None:
        raise ValueError("Need a path to save CSV data.")

    #
    # Initialize configured plugins
    #

    log.info("Initializing descriptor index")
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(
        config['plugins']['descriptor_index'],
        get_descriptor_index_impls()
    )

    log.info("Initializing classification factory")
    c_factory = ClassificationElementFactory.from_config(
        config['plugins']['classification_factory']
    )

    log.info("Initializing classifier")
    #: :type: smqtk.algorithms.Classifier
    classifier = plugin.from_plugin_config(config['plugins']['classifier'],
                                           get_classifier_impls())

    #
    # Setup/Process
    #
    def iter_uuids():
        """ Yield each stripped line of the UUID list file. """
        with open(uuids_list_filepath) as uuid_file:
            for line in uuid_file:
                yield line.strip()

    def descr_for_uuid(uuid):
        """
        :type uuid: collections.Hashable
        :rtype: smqtk.representation.DescriptorElement
        """
        return descriptor_index.get_descriptor(uuid)

    def classify_descr(d):
        """
        :type d: smqtk.representation.DescriptorElement
        :rtype: smqtk.representation.ClassificationElement
        """
        return classifier.classify(d, c_factory, classify_overwrite)

    log.info("Initializing uuid-to-descriptor parallel map")
    #: :type: collections.Iterable[smqtk.representation.DescriptorElement]
    element_iter = parallel.parallel_map(
        descr_for_uuid, iter_uuids(),
        use_multiprocessing=p_use_multiprocessing,
        cores=p_index_extraction_cores,
        name="descr_for_uuid",
    )

    log.info("Initializing descriptor-to-classification parallel map")
    #: :type: collections.Iterable[smqtk.representation.ClassificationElement]
    classification_iter = parallel.parallel_map(
        classify_descr, element_iter,
        use_multiprocessing=p_use_multiprocessing,
        cores=p_classification_cores,
        name='classify_descr',
    )

    #
    # Write/Output files
    #

    c_labels = classifier.get_labels()

    def make_row(c):
        """
        Build the CSV row for a classification: its UUID followed by the
        value for each label, in ``c_labels`` order.

        :type c: smqtk.representation.ClassificationElement
        """
        label_values = c.get_classification()
        return [c.uuid] + [label_values[label] for label in c_labels]

    # column labels file
    log.info("Writing CSV column header file: %s", output_csv_header_filepath)
    file_utils.safe_create_dir(os.path.dirname(output_csv_header_filepath))
    with open(output_csv_header_filepath, 'wb') as header_file:
        csv.writer(header_file).writerow(['uuid'] + c_labels)

    # CSV file
    log.info("Writing CSV data file: %s", output_csv_filepath)
    file_utils.safe_create_dir(os.path.dirname(output_csv_filepath))
    progress = [0] * 7
    with open(output_csv_filepath, 'wb') as data_file:
        data_writer = csv.writer(data_file)
        for classification in classification_iter:
            data_writer.writerow(make_row(classification))
            bin_utils.report_progress(log.info, progress, 1.0)

    # Final report
    progress[1] -= 1
    bin_utils.report_progress(log.info, progress, 0)

    log.info("Done")
def run_file_list(c, filelist_filepath, checkpoint_filepath, batch_size=None,
                  check_image=False):
    """
    Top level function handling configuration and inputs/outputs.

    Descriptors are computed for every valid file listed in the file list,
    and a CSV row of ``file path, descriptor UUID`` is written to the
    checkpoint file for each computed descriptor. Valid data elements are
    also added to the optional data set when one is configured.

    :param c: Configuration dictionary (JSON)
    :type c: dict

    :param filelist_filepath: Path to a text file that lists paths to data
        files, separated by new lines.
    :type filelist_filepath: str

    :param checkpoint_filepath: Output file to which we write input filepath
        to SHA1 (UUID) relationships.
    :type checkpoint_filepath: str

    :param batch_size: Optional batch size (None default) of data elements
        to process / descriptors to compute at a time. This causes files and
        stores to be written to incrementally during processing instead of
        one single batch transaction at a time.
    :type batch_size: int | None

    :param check_image: Enable checking image loading from file before
        queueing that file for processing. If the check fails, the file is
        skipped instead of a halting exception being raised.
    :type check_image: bool

    """
    log = logging.getLogger(__name__)

    # Read the list inside a context manager so the handle is closed
    # deterministically instead of being left to garbage collection.
    with open(filelist_filepath) as filelist_file:
        file_paths = [line.strip() for line in filelist_file]

    log.info("Making descriptor factory")
    factory = DescriptorElementFactory.from_config(c['descriptor_factory'])

    log.info("Making descriptor index")
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(c['descriptor_index'],
                                                 get_descriptor_index_impls())

    data_set = None
    if c['optional_data_set']['type'] is None:
        log.info("Not saving loaded data elements to data set")
    else:
        log.info("Initializing data set to append to")
        #: :type: smqtk.representation.DataSet
        data_set = plugin.from_plugin_config(c['optional_data_set'],
                                             get_data_set_impls())

    log.info("Making descriptor generator '%s'",
             c['descriptor_generator']['type'])
    #: :type: smqtk.algorithms.DescriptorGenerator
    generator = plugin.from_plugin_config(c['descriptor_generator'],
                                          get_descriptor_generator_impls())

    def iter_valid_elements():
        """
        Yield a data element for each listed file that passes validation,
        adding yielded elements to the data set (in batches) if one is
        configured.
        """
        def is_valid(file_path):
            # Return the element itself when valid so the parallel map
            # output can be used directly; ``False`` marks a rejected file.
            dfe = DataFileElement(file_path)
            if is_valid_element(
                    dfe,
                    valid_content_types=generator.valid_content_types(),
                    check_image=check_image):
                return dfe
            else:
                return False

        data_elements = collections.deque()
        valid_files_filter = parallel.parallel_map(is_valid,
                                                   file_paths,
                                                   name="check-file-type",
                                                   use_multiprocessing=True)
        for dfe in valid_files_filter:
            if dfe:
                yield dfe
                if data_set is not None:
                    data_elements.append(dfe)
                    if batch_size and len(data_elements) == batch_size:
                        log.debug(
                            "Adding data element batch to set (size: %d)",
                            len(data_elements))
                        data_set.add_data(*data_elements)
                        data_elements.clear()
        # elements only collected if we have a data-set configured, so add
        # any still in the deque to the set
        if data_elements:
            log.debug("Adding data elements to set (size: %d)",
                      len(data_elements))
            data_set.add_data(*data_elements)

    log.info("Computing descriptors")
    m = compute_many_descriptors(iter_valid_elements(),
                                 generator,
                                 factory,
                                 descriptor_index,
                                 batch_size=batch_size,
                                 )

    # Recording computed file paths and associated file UUIDs (SHA1)
    with open(checkpoint_filepath, 'w') as cf:
        cf_writer = csv.writer(cf)
        rps = [0] * 7
        for fp, descr in m:
            cf_writer.writerow([fp, descr.uuid()])
            report_progress(log.debug, rps, 1.)

    log.info("Done")
def fit(self, descriptors):
    """
    Fit the ITQ model to the given descriptors.

    Sets ``self.mean_vec`` and ``self.rotation`` and then calls
    ``self.save_model()``.

    :param descriptors: Iterable of ``DescriptorElement`` vectors to fit
        the model to.
    :type descriptors:
        collections.Iterable[smqtk.representation.DescriptorElement]

    :raises RuntimeError: There is already a model loaded

    :return: First value returned by ``self._find_itq_rotation``
        (documented upstream as the matrix of hash codes for the provided
        descriptors, in order).
    :rtype: numpy.ndarray[bool]

    """
    if self.has_model():
        raise RuntimeError("Model components have already been loaded.")

    # Only report progress periodically when debug logging is active.
    debug_enabled = self.logger().getEffectiveLevel() <= logging.DEBUG
    dbg_report_interval = 1.0 if debug_enabled else None  # seconds

    if not hasattr(descriptors, "__len__"):
        self._log.info("Creating sequence from iterable")
        collected = []
        progress = [0] * 7
        for element in descriptors:
            collected.append(element)
            report_progress(self._log.debug, progress, dbg_report_interval)
        descriptors = collected

    self._log.info("Creating matrix of descriptors for fitting")
    x = elements_to_matrix(descriptors,
                           report_interval=dbg_report_interval)
    self._log.debug("descriptor matrix shape: %s", x.shape)

    self._log.debug("Info normalizing descriptors by factor: %s",
                    self.normalize)
    x = self._norm_vector(x)

    self._log.info("Centering data")
    self.mean_vec = numpy.mean(x, axis=0)
    x -= self.mean_vec

    self._log.info("Computing PCA transformation")
    # ``numpy.cov`` treats rows as variables, hence the transpose of the
    # row-per-descriptor matrix.
    self._log.debug("-- computing covariance")
    covariance = numpy.cov(x.transpose())

    # Direct translation from UNC matlab code
    # - eigen vectors are the columns of ``eig_vectors``
    self._log.debug('-- computing linalg.eig')
    eig_values, eig_vectors = numpy.linalg.eig(covariance)

    # Pair each eigenvalue with its eigenvector, order by descending
    # eigenvalue and keep the first ``bit_length`` pairs.
    # (An SVD-based alternative was previously noted here but is not used.)
    self._log.debug('-- computing top pairs')
    ordered_pairs = sorted(zip(eig_values, eig_vectors.transpose()),
                           key=lambda pair: pair[0],
                           reverse=True)
    top_pairs = ordered_pairs[:self.bit_length]

    # Stack the kept eigenvectors back into columns.
    self._log.debug("-- top vector extraction")
    pc_top = numpy.array([pair[1] for pair in top_pairs]).transpose()
    self._log.debug("-- transform centered data by PC matrix")
    projected = numpy.dot(x, pc_top)

    self._log.info("Performing ITQ to find optimal rotation")
    codes, self.rotation = self._find_itq_rotation(projected,
                                                   self.itq_iterations)
    # De-adjust rotation with PC vector
    self.rotation = numpy.dot(pc_top, self.rotation)

    self.save_model()

    return codes
def compute_descriptor_async(self, data_iter,
                             descr_factory=DFLT_DESCRIPTOR_FACTORY,
                             overwrite=False, procs=None, **kwds):
    """
    Asynchronously compute feature data for multiple data items.

    Elements whose descriptor already has a vector are skipped unless
    ``overwrite`` is set. The remaining UUIDs are processed through
    ``self._process_batch`` in groups of ``self.batch_size``, plus one
    smaller tail group if needed.

    :param data_iter: Iterable of data elements to compute features for.
        These must have UIDs assigned for feature association in return
        value.
    :type data_iter: collections.Iterable[smqtk.representation.DataElement]

    :param descr_factory: Factory instance to produce the wrapping
        descriptor element instance.
    :type descr_factory: smqtk.representation.DescriptorElementFactory

    :param overwrite: Whether or not to force re-computation of descriptor
        vectors for the given data even when the descriptor elements
        produced by the factory already have a vector.
    :type overwrite: bool

    :param procs: Optional specification of how many processors to use
        when pooling sub-tasks. Reduced to the number of data elements when
        that is smaller than ``procs`` (or than the CPU count when ``procs``
        is not given).
    :type procs: int | None

    :param kwds: Optional ``use_mp`` key (default True) is forwarded to
        ``self._process_batch``.

    :raises ValueError: An input DataElement was of a content type that we
        cannot handle, or no data elements were provided.

    :return: Mapping of input DataElement UUIDs to the computed descriptor
        element for that data.
    :rtype: dict[collections.Hashable,
                 smqtk.representation.DescriptorElement]

    """
    self._set_caffe_mode()

    # Create DescriptorElement instances for each data elem.
    #: :type: dict[collections.Hashable, smqtk.representation.DataElement]
    data_elements = {}
    #: :type: dict[collections.Hashable, smqtk.representation.DescriptorElement]
    descr_elements = {}
    self._log.debug("Checking content types; aggregating data/descriptor "
                    "elements.")
    prog_rep_state = [0] * 7
    for data in data_iter:
        ct = data.content_type()
        if ct not in self.valid_content_types():
            message = ("Cannot compute descriptor from content type "
                       "'%s' data: %s)" % (ct, data))
            self._log.error(message)
            raise ValueError(message)
        uid = data.uuid()
        data_elements[uid] = data
        descr_elements[uid] = descr_factory.new_descriptor(self.name, uid)
        report_progress(self._log.debug, prog_rep_state, 1.0)
    self._log.debug("Given %d unique data elements", len(data_elements))

    # Reduce procs down to the number of elements to process if its smaller
    if len(data_elements) < (procs or multiprocessing.cpu_count()):
        procs = len(data_elements)
    if procs == 0:
        raise ValueError("No data elements provided")

    # For thread safely, only use .append() and .popleft() (queue)
    uuid4proc = deque()

    def check_get_uuid(descriptor_elem):
        # Queue this element for computation unless a vector already
        # exists and overwriting was not requested.
        if overwrite or not descriptor_elem.has_vector():
            # noinspection PyUnresolvedReferences
            uuid4proc.append(descriptor_elem.uuid())

    # Using thread-pool due to in-line function + updating local deque
    pool = multiprocessing.pool.ThreadPool(procs)
    try:
        pool.map(check_get_uuid, six.itervalues(descr_elements))
    finally:
        pool.close()
        pool.join()
    del pool
    self._log.debug("%d descriptors already computed",
                    len(data_elements) - len(uuid4proc))

    if uuid4proc:
        self._log.debug("Converting deque to tuple for segmentation")
        uuid4proc = tuple(uuid4proc)

        # Split UUIDs into groups equal to our batch size, and an option
        # tail group that is less than our batch size.
        batch_size = self.batch_size
        tail_size = len(uuid4proc) % batch_size
        batch_groups = (len(uuid4proc) - tail_size) // batch_size
        self._log.debug("Processing %d batches of size %d", batch_groups,
                        batch_size)
        if tail_size:
            self._log.debug("Processing tail group of size %d", tail_size)

        for g in range(batch_groups):
            self._log.debug("Starting batch: %d of %d",
                            g + 1, batch_groups)
            start = g * batch_size
            batch_uuids = uuid4proc[start:start + batch_size]
            self._process_batch(batch_uuids, data_elements,
                                descr_elements, procs,
                                kwds.get('use_mp', True))

        if tail_size:
            batch_uuids = uuid4proc[-tail_size:]
            self._log.debug("Starting tail batch (size=%d)",
                            len(batch_uuids))
            self._process_batch(batch_uuids, data_elements,
                                descr_elements, procs,
                                kwds.get('use_mp', True))

    self._log.debug("forming output dict")
    return {
        data_elements[k].uuid(): descr_elements[k]
        for k in data_elements
    }
dl_image, iter_scan_meta(), name='image_download', use_multiprocessing=True, cores=cores ) # Write out log.info("Starting iteration/file-write") rp_state = [0] * 7 with open(scan_record, 'w') as record_file: for r in img_dl_records: if r is not None: cdr_id, local_path, uuid = r record_file.write('%s,%s,%s\n' % (cdr_id, local_path, uuid)) report_progress(log.debug, rp_state, 1.0) # Final report rp_state[1] -= 1 report_progress(log.debug, rp_state, 0) def default_config(): return { "image_types": ['jpeg', 'png', 'tiff'], "elastic_search": { "instance_address": "CHANGEME", "index": "CHANGEME", "username": "******", "password": "******", "batch_size": 10000, },
def compute_distance_kernel(m, dist_func, row_wise=False, parallel=True):
    """
    Compute the pairwise distance kernel of an array of vectors using a
    distance function that works on two supplied 1D arrays.

    For a valid distance function interface, see
    ``smqtk.utils.distance_functions.histogram_intersection_distance2``.

    :param m: An array of vectors to compute the pairwise distance kernel
        for. A 1D array is treated as a single row.
    :type m: numpy.core.multiarray.ndarray

    :param dist_func: Distance function
    :type dist_func: (ndarray, ndarray) -> ndarray[float] | float

    :param row_wise: If the given distance function can take a vector and a
        matrix, and computes pair-wise distances, returning a vector of
        distances between the given vector and each row of the matrix.
    :type row_wise: bool

    :param parallel: If distances should be calculated in parallel. This is
        true by default.
    :type parallel: bool

    :return: Computed symmetric distance kernel
    :rtype: numpy.core.multiarray.ndarray

    """
    # Work out a printable name of the distance function for the logger.
    if hasattr(dist_func, 'im_func'):
        # noinspection PyUnresolvedReferences
        name_parts = [dist_func.__module__,
                      dist_func.im_class.__name__,
                      dist_func.im_func.func_name]
        distance_name = '.'.join(name_parts)
    elif hasattr(dist_func, 'func_name'):
        # noinspection PyUnresolvedReferences
        distance_name = '.'.join([dist_func.__module__, dist_func.func_name])
    elif hasattr(dist_func, 'py_func') \
            and hasattr(dist_func.py_func, 'func_name'):
        distance_name = '.'.join([dist_func.__module__,
                                  dist_func.py_func.func_name])
    else:
        distance_name = "<unknown>"
    log = logging.getLogger('compute_distance_kernel[%s]' % distance_name)

    if m.ndim == 1:
        m = m[np.newaxis]

    log.info("Computing distance kernel")

    side = m.shape[0]
    mat = np.ndarray((side, side), dtype=float)

    def fill_row_wise(i):
        # One ``dist_func`` call fills the row and column beyond the
        # diagonal; the last row has nothing left to compute.
        mat[i, i] = 0.
        if i < (side - 1):
            mat[i + 1:, i] = mat[i, i + 1:] = dist_func(m[i, :],
                                                        m[i + 1:, :])

    def fill_element_wise(i):
        mat[i, i] = 0
        # cols to the left of diagonal index for this row
        for j in xrange(i):
            mat[i, j] = mat[j, i] = dist_func(m[i], m[j])

    if row_wise:
        log.debug("Computing row-wise distances")
        fill_func = fill_row_wise
    else:
        log.debug("Computing element-wise distances")
        fill_func = fill_element_wise

    if parallel:
        # Using threading for in-place modification
        progress = [0] * 7
        for _ in parallel_map(fill_func, xrange(side),
                              use_multiprocessing=False):
            report_progress(log.debug, progress, 1.)
    else:
        for i in xrange(side):
            fill_func(i)

    return mat
log.info("Initializing image download/record parallel iterator") img_dl_records = parallel_map(dl_image, iter_scan_meta(), name='image_download', use_multiprocessing=True, cores=cores) # Write out log.info("Starting iteration/file-write") rp_state = [0] * 7 with open(scan_record, 'w') as record_file: for r in img_dl_records: if r is not None: cdr_id, local_path, uuid = r record_file.write('%s,%s,%s\n' % (cdr_id, local_path, uuid)) report_progress(log.debug, rp_state, 1.0) # Final report rp_state[1] -= 1 report_progress(log.debug, rp_state, 0) def default_config(): return { "image_types": ['jpeg', 'png', 'tiff'], "elastic_search": { "instance_address": "CHANGEME", "index": "CHANGEME", "username": "******", "password": "******", "batch_size": 10000, },
def fit(self, descriptors, use_multiprocessing=True):
    """
    Fit the ITQ model given the input set of descriptors.

    :param descriptors: Iterable of ``DescriptorElement`` vectors to fit the
        model to.
    :type descriptors:
        collections.Iterable[smqtk.representation.DescriptorElement]

    :param use_multiprocessing: Forwarded to ``elements_to_matrix`` when
        building the descriptor matrix.
    :type use_multiprocessing: bool

    :raises RuntimeError: There is already a model loaded
    :raises ValueError: No descriptors were given, or the descriptors have
        fewer features than the configured bit length.

    :return: Matrix hash codes for provided descriptors in order.
    :rtype: numpy.ndarray[bool]

    """
    if self.has_model():
        raise RuntimeError("Model components have already been loaded.")

    # Only emit periodic progress reports when debug logging is active.
    dbg_report_interval = None
    if self.get_logger().getEffectiveLevel() <= logging.DEBUG:
        dbg_report_interval = 1.0  # seconds

    if not isinstance(descriptors, Sequence):
        self._log.info("Creating sequence from iterable")
        descriptors_l = []
        rs = [0] * 7
        for d in descriptors:
            descriptors_l.append(d)
            report_progress(self._log.debug, rs, dbg_report_interval)
        descriptors = descriptors_l

    # Fail with a clear message instead of an IndexError on the feature
    # count check below.
    if len(descriptors) == 0:
        raise ValueError("No descriptors given to fit the model to.")

    if len(descriptors[0].vector()) < self.bit_length:
        raise ValueError("Input descriptors have fewer features than "
                         "requested bit encoding. Hash codes will be "
                         "smaller than requested due to PCA decomposition "
                         "result being bound by number of features.")

    self._log.info("Creating matrix of descriptors for fitting")
    x = elements_to_matrix(descriptors,
                           report_interval=dbg_report_interval,
                           use_multiprocessing=use_multiprocessing)
    self._log.debug("descriptor matrix shape: %s", x.shape)

    self._log.debug("Info normalizing descriptors by factor: %s",
                    self.normalize)
    x = self._norm_vector(x)

    self._log.info("Centering data")
    self.mean_vec = numpy.mean(x, axis=0)
    x -= self.mean_vec

    self._log.info("Computing PCA transformation")
    self._log.debug("-- computing covariance")
    # ``cov`` wants each row to be a feature and each column an observation
    # of those features. Thus, each column should be a descriptor vector,
    # thus we need the transpose here.
    cov = numpy.cov(x.transpose())

    # Direct translation from UNC matlab code
    # - eigen vectors are the columns of ``pc``
    # NOTE(review): ``cov`` is a covariance matrix, so ``numpy.linalg.eigh``
    #   may be the better fit here; ``eig`` is kept so results do not
    #   change -- confirm before switching.
    self._log.debug('-- computing linalg.eig')
    l, pc = numpy.linalg.eig(cov)
    self._log.debug('-- ordering eigen vectors by descending eigen '
                    'value')
    l_pc_ordered = sorted(zip(l, pc.transpose()), key=lambda p: p[0],
                          reverse=True)

    self._log.debug("-- top vector extraction")
    # Only keep the top ``bit_length`` vectors after ordering by descending
    # value magnitude.
    # - Transposing vectors back to column-vectors.
    pc_top = numpy.array(
        [p[1] for p in l_pc_ordered[:self.bit_length]]
    ).transpose()
    self._log.debug("-- project centered data by PC matrix")
    v = numpy.dot(x, pc_top)

    self._log.info("Performing ITQ to find optimal rotation")
    codes, self.rotation = self._find_itq_rotation(v, self.itq_iterations)
    # De-adjust rotation with PC vector
    self.rotation = numpy.dot(pc_top, self.rotation)

    self.save_model()

    return codes
def main():
    """
    Classify the descriptors named in a UUID list file and write the
    results, plus a separate column-header file, as CSV.

    :raises ValueError: The UUID list path is missing or is not a file, or
        one of the CSV output paths was not given.

    """
    description = """
    Script for asynchronously computing classifications for DescriptorElements
    in a DescriptorIndex specified via a list of UUIDs. Results are output to a
    CSV file in the format:

        uuid, label1_confidence, label2_confidence, ...

    CSV columns labels are output to the given CSV header file path. Label
    columns will be in the order as reported by the classifier implementations
    ``get_labels`` method.

    Due to using an input file-list of UUIDs, we require that the UUIDs of
    indexed descriptors be strings, or equality comparable to the UUIDs' string
    representation.
    """

    args, config = bin_utils.utility_main_helper(
        default_config,
        description,
        extend_parser,
    )
    log = logging.getLogger(__name__)

    # - parallel_map UUIDs to load from the configured index
    # - classify iterated descriptors

    uuids_list_filepath = args.uuids_list
    output_csv_filepath = args.csv_data
    output_csv_header_filepath = args.csv_header
    classify_overwrite = config['utility']['classify_overwrite']

    p_use_multiprocessing = \
        config['utility']['parallel']['use_multiprocessing']
    p_index_extraction_cores = \
        config['utility']['parallel']['index_extraction_cores']
    p_classification_cores = \
        config['utility']['parallel']['classification_cores']

    if not uuids_list_filepath:
        raise ValueError("No uuids_list_filepath specified.")
    elif not os.path.isfile(uuids_list_filepath):
        raise ValueError("Given uuids_list_filepath did not point to a file.")
    if output_csv_header_filepath is None:
        raise ValueError("Need a path to save CSV header labels")
    if output_csv_filepath is None:
        raise ValueError("Need a path to save CSV data.")

    #
    # Initialize configured plugins
    #

    log.info("Initializing descriptor index")
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(
        config['plugins']['descriptor_index'],
        get_descriptor_index_impls()
    )

    log.info("Initializing classification factory")
    c_factory = ClassificationElementFactory.from_config(
        config['plugins']['classification_factory']
    )

    log.info("Initializing classifier")
    #: :type: smqtk.algorithms.Classifier
    classifier = plugin.from_plugin_config(
        config['plugins']['classifier'],
        get_classifier_impls()
    )

    #
    # Setup/Process
    #
    def iter_uuids():
        # One UUID per line of the input file, surrounding whitespace removed.
        with open(uuids_list_filepath) as f:
            for line in f:
                yield line.strip()

    def descr_for_uuid(uuid):
        """
        :type uuid: collections.Hashable
        :rtype: smqtk.representation.DescriptorElement
        """
        return descriptor_index.get_descriptor(uuid)

    def classify_descr(d):
        """
        :type d: smqtk.representation.DescriptorElement
        :rtype: smqtk.representation.ClassificationElement
        """
        return classifier.classify(d, c_factory, classify_overwrite)

    log.info("Initializing uuid-to-descriptor parallel map")
    #: :type: collections.Iterable[smqtk.representation.DescriptorElement]
    element_iter = parallel.parallel_map(
        descr_for_uuid, iter_uuids(),
        use_multiprocessing=p_use_multiprocessing,
        cores=p_index_extraction_cores,
        name="descr_for_uuid",
    )

    log.info("Initializing descriptor-to-classification parallel map")
    #: :type: collections.Iterable[smqtk.representation.ClassificationElement]
    classification_iter = parallel.parallel_map(
        classify_descr, element_iter,
        use_multiprocessing=p_use_multiprocessing,
        cores=p_classification_cores,
        name='classify_descr',
    )

    #
    # Write/Output files
    #

    c_labels = classifier.get_labels()

    def make_row(c):
        """
        :type c: smqtk.representation.ClassificationElement
        """
        c_m = c.get_classification()
        return [c.uuid] + [c_m[label] for label in c_labels]

    # column labels file
    # NOTE(review): binary mode is the Python 2 convention for the csv
    #   module; Python 3 would need text mode with ``newline=''`` -- confirm
    #   the target interpreter before changing.
    log.info("Writing CSV column header file: %s", output_csv_header_filepath)
    file_utils.safe_create_dir(os.path.dirname(output_csv_header_filepath))
    with open(output_csv_header_filepath, 'wb') as f_csv:
        w = csv.writer(f_csv)
        # ``list(...)`` so the header also works when the labels come back as
        # a non-list sequence (list + tuple would raise TypeError).
        w.writerow(['uuid'] + list(c_labels))

    # CSV file
    log.info("Writing CSV data file: %s", output_csv_filepath)
    file_utils.safe_create_dir(os.path.dirname(output_csv_filepath))
    r_state = [0] * 7
    with open(output_csv_filepath, 'wb') as f_csv:
        w = csv.writer(f_csv)
        for c in classification_iter:
            w.writerow(make_row(c))
            bin_utils.report_progress(log.info, r_state, 1.0)

    # Final report
    r_state[1] -= 1
    bin_utils.report_progress(log.info, r_state, 0)

    log.info("Done")
m = compute_many_descriptors(iter_valid_elements(), generator, factory, descriptor_index, batch_size=batch_size, ) # Recording computed file paths and associated file UUIDs (SHA1) cf = open(checkpoint_filepath, 'w') try: rps = [0] * 7 for fp, descr in m: cf.write("{:s},{:s}\n".format( fp, descr.uuid() )) report_progress(log.debug, rps, 1.) # Final report rps[1] -= 1 report_progress(log.debug, rps, 0.) finally: cf.close() log.info("Done") def extend_parser(parser): parser.add_argument('-b', '--batch-size', type=int, default=256, metavar='INT', help="Number of files to batch together into a single " "compute async call. This defines the " "granularity of the checkpoint file in regards "
def compute_descriptor_async(self, data_iter,
                             descr_factory=DFLT_DESCRIPTOR_FACTORY,
                             overwrite=False, procs=None, **kwds):
    """
    Asynchronously compute feature data for multiple data items.

    :param data_iter: Iterable of data elements to compute features for.
        These must have UIDs assigned for feature association in return
        value.
    :type data_iter: collections.Iterable[smqtk.representation.DataElement]

    :param descr_factory: Factory instance to produce the wrapping
        descriptor element instances. In-Memory descriptor factory by
        default.
    :type descr_factory: smqtk.representation.DescriptorElementFactory

    :param overwrite: Whether or not to force re-computation of a descriptor
        vectors for the given data even when there exists precomputed
        vectors in the generated DescriptorElements as generated from the
        provided factory. This will overwrite the persistently stored
        vectors if the provided factory produces a DescriptorElement
        implementation such storage.
    :type overwrite: bool

    :param procs: Optional specification of how many processors to use
        when pooling sub-tasks. If None, we attempt to use all available
        cores.
    :type procs: int

    :param kwds: Additional keyword arguments. Accepted for interface
        compatibility; not used by this method.

    :raises ValueError: An input DataElement was of a content type that we
        cannot handle.

    :return: Mapping of input DataElement instances to the computed
        descriptor element. DescriptorElement UUID's are congruent with the
        UUID of the data element it is the descriptor of. An empty mapping
        is returned when ``data_iter`` yields nothing.
    :rtype: dict[smqtk.representation.DataElement,
                 smqtk.representation.DescriptorElement]

    """
    # Create DescriptorElement instances for each data elem.
    #: :type: dict[collections.Hashable, smqtk.representation.DataElement]
    data_elements = {}
    #: :type: dict[collections.Hashable, smqtk.representation.DescriptorElement]
    descr_elements = {}
    self._log.debug("Checking content types; aggregating data/descriptor "
                    "elements.")
    prog_rep_state = [0] * 7
    # Looked up once instead of once per element.
    valid_types = self.valid_content_types()
    for d in data_iter:
        ct = d.content_type()
        if ct not in valid_types:
            raise ValueError("Cannot compute descriptor of content type "
                             "'%s', (DE: %s)" % (ct, d))
        d_uuid = d.uuid()
        data_elements[d_uuid] = d
        descr_elements[d_uuid] = \
            descr_factory.new_descriptor(self.name, d_uuid)
        report_progress(self._log.debug, prog_rep_state, 1.0)
    self._log.debug("Given %d unique data elements", len(data_elements))

    # Nothing to compute. Returning here also avoids asking for a thread pool
    # of size zero below, which the pool constructor rejects.
    if not data_elements:
        return {}

    # Reduce procs down to the number of elements to process if its smaller
    if len(data_elements) < (procs or multiprocessing.cpu_count()):
        procs = len(data_elements)

    # For thread safely, only use .append() and .popleft() (queue)
    uuid4proc = deque()

    def check_get_uuid(descr):
        # Queue descriptors that still need (or are forced to get) a vector.
        if overwrite or not descr.has_vector():
            # noinspection PyUnresolvedReferences
            uuid4proc.append(descr.uuid())

    p = multiprocessing.pool.ThreadPool(procs)
    try:
        p.map(check_get_uuid, descr_elements.values())
    finally:
        p.close()
        p.join()
    del p
    self._log.debug("%d descriptors already computed",
                    len(data_elements) - len(uuid4proc))

    if uuid4proc:
        self._log.debug("Converting deque to tuple for segmentation")
        uuid4proc = tuple(uuid4proc)

        # Split UUIDs into groups equal to our batch size, and an option
        # tail group that is less than our batch size.
        tail_size = len(uuid4proc) % self.batch_size
        batch_groups = (len(uuid4proc) - tail_size) // self.batch_size
        self._log.debug("Processing %d batches of size %d", batch_groups,
                        self.batch_size)
        if tail_size:
            self._log.debug("Processing tail group of size %d", tail_size)

        for g in range(batch_groups):
            self._log.debug("Starting batch: %d of %d",
                            g + 1, batch_groups)
            batch_uuids = \
                uuid4proc[g * self.batch_size:(g + 1) * self.batch_size]
            self._process_batch(batch_uuids, data_elements,
                                descr_elements, procs)

        if tail_size:
            batch_uuids = uuid4proc[-tail_size:]
            self._log.debug("Starting tail batch (size=%d)",
                            len(batch_uuids))
            self._process_batch(batch_uuids, data_elements,
                                descr_elements, procs)

    self._log.debug("forming output dict")
    return {data_elements[k]: descr_elements[k] for k in data_elements}
def compute_distance_kernel(m, dist_func, row_wise=False, parallel=True):
    """
    Build the symmetric pairwise distance kernel for an array of vectors.

    Every cell, including the diagonal, is produced by calling ``dist_func``.
    For a valid distance function interface, see
    ``smqtk.utils.distance_functions.histogram_intersection_distance2``.

    :param m: An array of vectors to compute the pairwise distance kernel
        for. A 1D array is treated as a single row vector.
    :type m: numpy.ndarray

    :param dist_func: Distance function
    :type dist_func: (ndarray, ndarray) -> ndarray[float] | float

    :param row_wise: If the given distance function can take a vector and a
        matrix, and computes pair-wise distances, returning a vector of
        distances between the given vector and each row of the matrix.
    :type row_wise: bool

    :param parallel: If distances should be calculated in parallel. This is
        true by default.
    :type parallel: bool

    :return: Computed symmetric distance kernel
    :rtype: numpy.ndarray

    """
    log = logging.getLogger(__name__)

    if m.ndim == 1:
        m = m[np.newaxis]

    log.info("Computing distance kernel")

    n_rows = m.shape[0]
    kernel = np.ndarray((n_rows, n_rows), dtype=float)
    progress = [0] * 7

    # Choose how a single row index is filled in; how the indices are driven
    # (threaded or serial) is decided afterwards.
    if row_wise:
        log.debug("Computing row-wise distances")

        # noinspection PyShadowingNames
        def fill_row(r):
            kernel[r, r] = dist_func(m[r], m[r])
            # The final row has nothing left to compute: its distances were
            # all filled in while handling the earlier rows.
            if r < (n_rows - 1):
                kernel[r + 1:, r] = kernel[r, r + 1:] = \
                    dist_func(m[r, :], m[r + 1:, :])
    else:
        log.debug("Computing element-wise distances")

        # noinspection PyShadowingNames
        def fill_row(r):
            kernel[r, r] = dist_func(m[r], m[r])
            # cols to the left of diagonal index for this row
            for c in range(r):
                kernel[r, c] = kernel[c, r] = dist_func(m[r], m[c])

    if parallel:
        # Using threading for in-place modification
        for _ in parallel_map(fill_row, range(n_rows),
                              use_multiprocessing=False):
            report_progress(log.debug, progress, 1.)
    else:
        for r in range(n_rows):
            fill_row(r)
            report_progress(log.debug, progress, 1.)

    return kernel
log.info("Computing descriptors") m = compute_many_descriptors( iter_valid_elements(), generator, factory, descriptor_index, batch_size=batch_size, ) # Recording computed file paths and associated file UUIDs (SHA1) cf = open(checkpoint_filepath, 'w') try: rps = [0] * 7 for fp, descr in m: cf.write("{:s},{:s}\n".format(fp, descr.uuid())) report_progress(log.debug, rps, 1.) finally: cf.close() log.info("Done") def cli_parser(): parser = basic_cli_parser(__doc__) parser.add_argument('-b', '--batch-size', type=int, default=0, metavar='INT', help="Number of files to batch together into a single "
def main():
    """
    Classify the descriptors named in a UUID list file and write the
    results, plus a separate column-header file, as CSV.

    :raises ValueError: The UUID list path is missing or is not a file, or
        one of the CSV output paths was not given.

    """
    args = cli_parser().parse_args()
    config = bin_utils.utility_main_helper(default_config, args)
    log = logging.getLogger(__name__)

    # Pipeline: UUID file -> descriptors (parallel) -> classifications
    # (parallel) -> CSV rows.

    uuids_list_filepath = args.uuids_list
    output_csv_filepath = args.csv_data
    output_csv_header_filepath = args.csv_header

    utility_cfg = config['utility']
    classify_overwrite = utility_cfg['classify_overwrite']
    parallel_cfg = utility_cfg['parallel']
    p_use_multiprocessing = parallel_cfg['use_multiprocessing']
    p_index_extraction_cores = parallel_cfg['index_extraction_cores']
    p_classification_cores = parallel_cfg['classification_cores']

    # Validate inputs up front.
    if not uuids_list_filepath:
        raise ValueError("No uuids_list_filepath specified.")
    if not os.path.isfile(uuids_list_filepath):
        raise ValueError("Given uuids_list_filepath did not point to a file.")
    if output_csv_header_filepath is None:
        raise ValueError("Need a path to save CSV header labels")
    if output_csv_filepath is None:
        raise ValueError("Need a path to save CSV data.")

    #
    # Initialize configured plugins
    #
    plugin_cfg = config['plugins']

    log.info("Initializing descriptor index")
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(
        plugin_cfg['descriptor_index'], get_descriptor_index_impls()
    )

    log.info("Initializing classification factory")
    c_factory = ClassificationElementFactory.from_config(
        plugin_cfg['classification_factory']
    )

    log.info("Initializing classifier")
    #: :type: smqtk.algorithms.Classifier
    classifier = plugin.from_plugin_config(
        plugin_cfg['classifier'], get_classifier_impls()
    )

    #
    # Setup/Process
    #
    def iter_uuids():
        # One UUID per line of the input file, surrounding whitespace removed.
        with open(uuids_list_filepath) as uuid_file:
            for line in uuid_file:
                yield line.strip()

    def descr_for_uuid(uuid):
        """
        :type uuid: collections.Hashable
        :rtype: smqtk.representation.DescriptorElement
        """
        return descriptor_index.get_descriptor(uuid)

    def classify_descr(d):
        """
        :type d: smqtk.representation.DescriptorElement
        :rtype: smqtk.representation.ClassificationElement
        """
        return classifier.classify(d, c_factory, classify_overwrite)

    log.info("Initializing uuid-to-descriptor parallel map")
    #: :type: collections.Iterable[smqtk.representation.DescriptorElement]
    element_iter = parallel.parallel_map(
        descr_for_uuid,
        iter_uuids(),
        use_multiprocessing=p_use_multiprocessing,
        cores=p_index_extraction_cores,
        name="descr_for_uuid",
    )

    log.info("Initializing descriptor-to-classification parallel map")
    #: :type: collections.Iterable[smqtk.representation.ClassificationElement]
    classification_iter = parallel.parallel_map(
        classify_descr,
        element_iter,
        use_multiprocessing=p_use_multiprocessing,
        cores=p_classification_cores,
        name='classify_descr',
    )

    #
    # Write/Output files
    #
    c_labels = classifier.get_labels()

    def make_row(e):
        """
        :type e: smqtk.representation.ClassificationElement
        """
        confidences = e.get_classification()
        row = [e.uuid]
        row.extend(confidences[label] for label in c_labels)
        return row

    # column labels file
    log.info("Writing CSV column header file: %s", output_csv_header_filepath)
    file_utils.safe_create_dir(os.path.dirname(output_csv_header_filepath))
    with open(output_csv_header_filepath, 'wb') as header_file:
        header_writer = csv.writer(header_file)
        header_writer.writerow(['uuid'] + list(map(str, c_labels)))

    # CSV file
    log.info("Writing CSV data file: %s", output_csv_filepath)
    file_utils.safe_create_dir(os.path.dirname(output_csv_filepath))
    r_state = [0] * 7
    with open(output_csv_filepath, 'wb') as data_file:
        data_writer = csv.writer(data_file)
        for classification in classification_iter:
            data_writer.writerow(make_row(classification))
            bin_utils.report_progress(log.info, r_state, 1.0)

    # Final report
    r_state[1] -= 1
    bin_utils.report_progress(log.info, r_state, 0)

    log.info("Done")