def fit(self, descriptors, use_multiprocessing=True):
    """
    Fit the ITQ model given the input set of descriptors.

    :param descriptors: Iterable of ``DescriptorElement`` vectors to fit
        the model to.
    :type descriptors:
        collections.Iterable[smqtk.representation.DescriptorElement]

    :param use_multiprocessing: If multiprocessing should be used, as
        opposed to threading, for collecting descriptor vectors from the
        provided iterable.
    :type use_multiprocessing: bool

    :raises RuntimeError: There is already a model loaded.

    :return: Matrix hash codes for provided descriptors in order.
    :rtype: numpy.ndarray[bool]
    """
    if self.has_model():
        raise RuntimeError("Model components have already been loaded.")

    dbg_report_interval = None
    if self.get_logger().getEffectiveLevel() <= logging.DEBUG:
        dbg_report_interval = 1.0  # seconds
    if not hasattr(descriptors, "__len__"):
        self._log.info("Creating sequence from iterable")
        descriptors_l = []
        pr = ProgressReporter(self._log.debug, dbg_report_interval).start()
        for d in descriptors:
            descriptors_l.append(d)
            dbg_report_interval and pr.increment_report()
        dbg_report_interval and pr.report()
        descriptors = descriptors_l
    self._log.info("Creating matrix of descriptors for fitting")
    x = elements_to_matrix(descriptors,
                           report_interval=dbg_report_interval,
                           use_multiprocessing=use_multiprocessing)
    self._log.debug("descriptor matrix shape: %s", x.shape)

    n, dim = x.shape

    self._log.debug("Generating random projections")
    np.random.seed(self.random_seed)
    self.rps = np.random.randn(dim, self.bit_length)

    self._log.debug("Info normalizing descriptors with norm type: %s",
                    self.normalize)
    return self.get_hash(x)
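
# --- Example (illustrative, not part of the class above) ---
# A self-contained sketch of the random-projection hashing that this
# ``fit`` sets up: a Gaussian (dim x bit_length) projection matrix, with
# the sign of each projected component taken as one bit of the code. The
# ``>= 0`` thresholding is an assumption about what ``get_hash`` does with
# the fitted ``rps`` matrix.
import numpy as np

rng = np.random.RandomState(0)
x_example = rng.rand(10, 128)              # 10 descriptors, 128 features each
rps_example = rng.randn(128, 64)           # random projections for 64-bit codes
codes = x_example.dot(rps_example) >= 0.0  # boolean hash codes, shape (10, 64)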
def _update_index(self, descriptors):
    """
    Internal method to be implemented by sub-classes to additively update
    the current index with the one or more descriptor elements given.

    If no index exists yet, a new one should be created using the given
    descriptors.

    :raises ReadOnlyError: This index is set to be read-only and cannot be
        modified.

    :param descriptors: Iterable of descriptor elements to add to this
        index.
    :type descriptors:
        collections.Iterable[smqtk.representation.DescriptorElement]
    """
    with self._model_lock:
        if self.read_only:
            raise ReadOnlyError("Cannot modify container attributes due "
                                "to being in read-only mode.")

        # Tee out the iterable for use in adding to the index as well as
        # hash code generation.
        d_for_index, d_for_hashing = itertools.tee(descriptors, 2)

        self._log.debug("Updating descriptor index.")
        self.descriptor_set.add_many_descriptors(d_for_index)

        self._log.debug("Generating hash codes for new descriptors")
        prog_reporter = ProgressReporter(self._log.debug, 1.0).start()
        #: :type: collections.deque[numpy.ndarray[bool]]
        hash_vectors = collections.deque()  # for updating hash_index
        # For updating the kv-store after collecting new hash codes.
        kvstore_update = {}
        for d in d_for_hashing:
            h_vec = self.lsh_functor.get_hash(d.vector())
            hash_vectors.append(h_vec)
            h_int = bit_vector_to_int_large(h_vec)
            # Get, update and reinsert the hash UUID set object.
            if h_int not in kvstore_update:
                #: :type: set
                kvstore_update[h_int] = \
                    self.hash2uuids_kvstore.get(h_int, set())
            kvstore_update[h_int] |= {d.uuid()}
            prog_reporter.increment_report()
        prog_reporter.report()

        self._log.debug("Updating kv-store with new hash codes")
        self.hash2uuids_kvstore.add_many(kvstore_update)
        del kvstore_update

        if self.hash_index is not None:
            self._log.debug("Updating hash index structure.")
            self.hash_index.update_index(hash_vectors)
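
# --- Example (illustrative) ---
# Minimal demonstration of the ``itertools.tee`` pattern used in
# ``_update_index``: a single-pass iterable is split so one copy feeds the
# descriptor set and the other feeds hash-code generation. Note that when
# the first copy is fully consumed before the second, ``tee`` buffers the
# whole sequence internally.
import itertools

def _gen():
    for i in range(5):
        yield i

a, b = itertools.tee(_gen(), 2)
for_index = list(a)      # stand-in for add_many_descriptors(d_for_index)
for_hashing = list(b)    # stand-in for the hash-generation loop
assert for_index == for_hashing == [0, 1, 2, 3, 4]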
def _build_index(self, descriptors):
    """
    Internal method to be implemented by sub-classes to build the index
    with the given descriptor data elements.

    Subsequent calls to this method should rebuild the current index. This
    method shall not add to the existing index nor raise an exception so
    as to protect the current index.

    :raises ReadOnlyError: This index is set to be read-only and cannot be
        modified.

    :param descriptors: Iterable of descriptor elements to build the index
        over.
    :type descriptors:
        collections.Iterable[smqtk.representation.DescriptorElement]
    """
    with self._model_lock:
        if self.read_only:
            raise ReadOnlyError("Cannot modify container attributes due "
                                "to being in read-only mode.")

        self._log.debug("Clearing and adding new descriptor elements")
        self.descriptor_index.clear()
        self.descriptor_index.add_many_descriptors(descriptors)

        self._log.debug("Generating hash codes")
        #: :type: collections.deque[numpy.ndarray[bool]]
        hash_vectors = collections.deque()
        self.hash2uuids_kvstore.clear()
        prog_reporter = ProgressReporter(self._log.debug, 1.0).start()
        # We just cleared the previous store, so aggregate the new
        # kv-mapping in ``kvstore_update`` for a single update after the
        # loop.
        kvstore_update = collections.defaultdict(set)
        for d in self.descriptor_index:
            h_vec = self.lsh_functor.get_hash(d.vector())
            hash_vectors.append(h_vec)
            h_int = bit_vector_to_int_large(h_vec)
            kvstore_update[h_int] |= {d.uuid()}
            prog_reporter.increment_report()
        prog_reporter.report()

        self.hash2uuids_kvstore.add_many(kvstore_update)
        del kvstore_update

        if self.hash_index is not None:
            self._log.debug("Clearing and building hash index of type %s",
                            type(self.hash_index))
            # A build is supposed to clear previous state.
            self.hash_index.build_index(hash_vectors)
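
# --- Example (illustrative) ---
# Hedged sketch of constructing an LSH index whose public ``build_index``
# and ``update_index`` methods delegate to the internal methods above. The
# module paths, constructor argument names, and in-memory implementations
# used here are assumptions; check the class's get_default_config() for the
# actual parameters in your SMQTK version.
from smqtk.algorithms.nn_index.lsh import LSHNearestNeighborIndex
from smqtk.algorithms.nn_index.lsh.functors.itq import ItqFunctor
from smqtk.representation.descriptor_set.memory import MemoryDescriptorSet
from smqtk.representation.key_value.memory import MemoryKeyValueStore

index = LSHNearestNeighborIndex(
    lsh_functor=ItqFunctor(bit_length=64),    # any LSH functor (assumed args)
    descriptor_set=MemoryDescriptorSet(),
    hash2uuids_kvstore=MemoryKeyValueStore(),
    read_only=False,
)
# ``initial_descriptors`` / ``additional_descriptors`` stand for iterables
# of DescriptorElement instances with vectors already set.
index.build_index(initial_descriptors)        # clears and rebuilds
index.update_index(additional_descriptors)    # additive update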
def run_file_list(c, filelist_filepath, checkpoint_filepath, batch_size=None,
                  check_image=False):
    """
    Top level function handling configuration and inputs/outputs.

    :param c: Configuration dictionary (JSON)
    :type c: dict

    :param filelist_filepath: Path to a text file that lists paths to data
        files, separated by new lines.
    :type filelist_filepath: str

    :param checkpoint_filepath: Output file to which we write input
        filepath to SHA1 (UUID) relationships.
    :type checkpoint_filepath: str

    :param batch_size: Optional batch size (None default) of data elements
        to process / descriptors to compute at a time. This causes files
        and stores to be written to incrementally during processing instead
        of in one single batch transaction.
    :type batch_size: int | None

    :param check_image: Enable checking image loading from file before
        queueing that file for processing. If the check fails, the file is
        skipped instead of a halting exception being raised.
    :type check_image: bool
    """
    log = logging.getLogger(__name__)

    file_paths = [line.strip() for line in open(filelist_filepath)]

    log.info("Making descriptor factory")
    factory = DescriptorElementFactory.from_config(c['descriptor_factory'])

    log.info("Making descriptor index")
    descriptor_set = cast(
        DescriptorSet,
        from_config_dict(c['descriptor_set'], DescriptorSet.get_impls())
    )

    # ``data_set`` is added to within the ``iter_valid_elements`` function.
    data_set: Optional[DataSet] = None
    if c['optional_data_set']['type'] is None:
        log.info("Not saving loaded data elements to data set")
    else:
        log.info("Initializing data set to append to")
        data_set = cast(
            DataSet,
            from_config_dict(c['optional_data_set'], DataSet.get_impls())
        )

    log.info("Making descriptor generator '%s'",
             c['descriptor_generator']['type'])
    generator = cast(
        DescriptorGenerator,
        from_config_dict(c['descriptor_generator'],
                         DescriptorGenerator.get_impls())
    )

    def iter_valid_elements():
        def is_valid(file_path):
            e = DataFileElement(file_path)
            if is_valid_element(
                    e, valid_content_types=generator.valid_content_types(),
                    check_image=check_image):
                return e
            else:
                return False

        data_elements: Deque[DataFileElement] = collections.deque()
        valid_files_filter = parallel.parallel_map(is_valid, file_paths,
                                                   name="check-file-type",
                                                   use_multiprocessing=True)
        for dfe in valid_files_filter:
            if dfe:
                yield dfe
                if data_set is not None:
                    data_elements.append(dfe)
                    if batch_size and len(data_elements) == batch_size:
                        log.debug("Adding data element batch to set "
                                  "(size: %d)", len(data_elements))
                        data_set.add_data(*data_elements)
                        data_elements.clear()
        # Elements are only collected if we have a data-set configured, so
        # add any still in the deque to the set.
        if data_set is not None and data_elements:
            log.debug("Adding data elements to set (size: %d)",
                      len(data_elements))
            data_set.add_data(*data_elements)

    log.info("Computing descriptors")
    m = compute_many_descriptors(
        iter_valid_elements(),
        generator,
        factory,
        descriptor_set,
        batch_size=batch_size,
    )

    # Record computed file paths and associated file UUIDs (SHA1).
    cf = open(checkpoint_filepath, 'w')
    cf_writer = csv.writer(cf)
    try:
        pr = ProgressReporter(log.debug, 1.0).start()
        for de, descr in m:
            # We know that we are using DataFileElements going into
            # compute_many_descriptors, so we can assume that's what comes
            # out of it as well.
            # noinspection PyProtectedMember
            cf_writer.writerow([de._filepath, descr.uuid()])
            pr.increment_report()
        pr.report()
    finally:
        del cf_writer
        cf.close()

    log.info("Done")
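
# --- Example (illustrative) ---
# Hypothetical shape of the configuration dictionary ``run_file_list``
# expects. Only the top-level keys and the ``type`` convention are taken
# from the function above; the nested implementation names and parameter
# blocks below are placeholders for whatever plugins a real deployment
# configures.
example_config = {
    "descriptor_factory": {
        "type": "DescriptorMemoryElement",          # placeholder
        "DescriptorMemoryElement": {},
    },
    "descriptor_set": {
        "type": "SomeDescriptorSetImpl",            # placeholder
        "SomeDescriptorSetImpl": {},
    },
    "optional_data_set": {
        "type": None,                               # skip persisting data
    },
    "descriptor_generator": {
        "type": "SomeDescriptorGeneratorImpl",      # placeholder
        "SomeDescriptorGeneratorImpl": {},
    },
}
# run_file_list(example_config, "file_list.txt", "checkpoint.csv",
#               batch_size=128, check_image=True)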
def compute_distance_kernel(m, dist_func, row_wise=False, parallel=True):
    """
    Method for computing the distance kernel of an array of vectors given a
    distance function that works on two supplied 1D arrays.

    For a valid distance function interface, see
    ``smqtk.utils.distance_functions.histogram_intersection_distance2``.

    :param m: An array of vectors to compute the pairwise distance kernel
        for.
    :type m: numpy.ndarray

    :param dist_func: Distance function
    :type dist_func: (ndarray, ndarray) -> ndarray[float] | float

    :param row_wise: If the given distance function can take a vector and a
        matrix, and computes pair-wise distances, returning a vector of
        distances between the given vector and each row of the matrix.
    :type row_wise: bool

    :param parallel: If distances should be calculated in parallel. This is
        true by default.
    :type parallel: bool

    :return: Computed symmetric distance kernel
    :rtype: numpy.ndarray
    """
    log = logging.getLogger(__name__)

    if m.ndim == 1:
        m = m[np.newaxis]

    log.info("Computing distance kernel")
    side = m.shape[0]
    mat = np.ndarray((side, side), dtype=float)
    pr = ProgressReporter(log.debug, 1.0)

    if row_wise:
        log.debug("Computing row-wise distances")
        # For all rows except the last one. We will have computed all
        # distances by the time we reach m[side-1].
        if parallel:
            # noinspection PyShadowingNames
            def work_func(i):
                mat[i, i] = dist_func(m[i], m[i])
                if i < (side - 1):
                    mat[i + 1:, i] = mat[i, i + 1:] = \
                        dist_func(m[i, :], m[i + 1:, :])

            # Using threading for in-place modification
            pr.start()
            for _ in parallel_map(work_func, range(side),
                                  use_multiprocessing=False):
                pr.increment_report()
        else:
            pr.start()
            for i in range(side):
                # Compute col/row wise distances
                mat[i, i] = dist_func(m[i], m[i])
                if i < (side - 1):
                    mat[i + 1:, i] = mat[i, i + 1:] = \
                        dist_func(m[i, :], m[i + 1:, :])
                pr.increment_report()
    else:
        log.debug("Computing element-wise distances")
        if parallel:
            # noinspection PyShadowingNames
            def work_func(i):
                mat[i, i] = dist_func(m[i], m[i])
                # Columns to the left of the diagonal index for this row.
                for j in range(i):
                    mat[i, j] = mat[j, i] = dist_func(m[i], m[j])

            # Using threading for in-place modification
            pr.start()
            for _ in parallel_map(work_func, range(side),
                                  use_multiprocessing=False):
                pr.increment_report()
        else:
            pr.start()
            for i in range(side):
                mat[i, i] = dist_func(m[i], m[i])
                # Columns to the left of the diagonal index for this row.
                for j in range(i):
                    mat[i, j] = mat[j, i] = dist_func(m[i], m[j])
                pr.increment_report()
    pr.report()
    return mat
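
# --- Example (illustrative) ---
# Computing a symmetric Euclidean distance kernel through the row-wise
# path. The distance function below handles both (1D, 1D) and (1D, 2D)
# inputs, which is what ``row_wise=True`` assumes.
import numpy as np

def euclidean_rowwise(a, b):
    # Broadcasts over the rows of ``b`` when it is a matrix.
    return np.sqrt(np.sum((b - a) ** 2, axis=-1))

m_example = np.random.rand(100, 32)
kernel = compute_distance_kernel(m_example, euclidean_rowwise,
                                 row_wise=True, parallel=True)
assert kernel.shape == (100, 100)
assert np.allclose(kernel, kernel.T)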
def fit(self, descriptors, use_multiprocessing=True):
    """
    Fit the ITQ model given the input set of descriptors.

    :param descriptors: Iterable of ``DescriptorElement`` vectors to fit
        the model to.
    :type descriptors:
        collections.Iterable[smqtk.representation.DescriptorElement]

    :param use_multiprocessing: If multiprocessing should be used, as
        opposed to threading, when collecting descriptor elements from the
        given iterable.
    :type use_multiprocessing: bool

    :raises RuntimeError: There is already a model loaded.

    :return: Matrix hash codes for provided descriptors in order.
    :rtype: numpy.ndarray[bool]
    """
    if self.has_model():
        raise RuntimeError("Model components have already been loaded.")

    dbg_report_interval = 1.0
    dbg_report = self.get_logger().getEffectiveLevel() <= logging.DEBUG
    if not isinstance(descriptors, Sequence):
        self._log.info("Creating sequence from iterable")
        descriptors_l = []
        pr = ProgressReporter(self._log.debug, dbg_report_interval).start()
        for d in descriptors:
            descriptors_l.append(d)
            dbg_report and pr.increment_report()
        dbg_report and pr.report()
        descriptors = descriptors_l
    if len(descriptors[0].vector()) < self.bit_length:
        raise ValueError("Input descriptors have fewer features than the "
                         "requested bit encoding. Hash codes will be "
                         "smaller than requested due to the PCA "
                         "decomposition result being bound by the number "
                         "of features.")

    self._log.info("Creating matrix of descriptors for fitting")
    x = elements_to_matrix(descriptors,
                           report_interval=dbg_report_interval,
                           use_multiprocessing=use_multiprocessing)
    self._log.debug("descriptor matrix shape: %s", x.shape)

    self._log.debug("Info normalizing descriptors by factor: %s",
                    self.normalize)
    x = self._norm_vector(x)

    self._log.info("Centering data")
    self.mean_vec = numpy.mean(x, axis=0)
    x -= self.mean_vec

    self._log.info("Computing PCA transformation")
    self._log.debug("-- computing covariance")
    # ``cov`` wants each row to be a feature and each column an observation
    # of those features. Thus, each column should be a descriptor vector,
    # so we need the transpose here.
    c = numpy.cov(x.transpose())

    # Direct translation from UNC matlab code
    # - eigen vectors are the columns of ``pc``
    self._log.debug('-- computing linalg.eig')
    l, pc = numpy.linalg.eig(c)
    self._log.debug('-- ordering eigen vectors by descending eigen value')

    # # Harry translation of original matlab code
    # # - Uses singular values / vectors, not eigen
    # # - singular vectors are the columns of pc
    # self._log.debug('-- computing linalg.svd')
    # pc, l, _ = numpy.linalg.svd(c)
    # self._log.debug('-- ordering singular vectors by descending '
    #                 'singular value')

    # Same ordering method for both eig/svd sources.
    l_pc_ordered = sorted(zip(l, pc.transpose()), key=lambda _p: _p[0],
                          reverse=True)

    self._log.debug("-- top vector extraction")
    # Only keep the top ``bit_length`` vectors after ordering by descending
    # value magnitude.
    # - Transposing vectors back to column-vectors.
    pc_top = numpy.array([p[1] for p in l_pc_ordered[:self.bit_length]]) \
        .transpose()
    self._log.debug("-- project centered data by PC matrix")
    v = numpy.dot(x, pc_top)

    self._log.info("Performing ITQ to find optimal rotation")
    c, self.rotation = self._find_itq_rotation(v, self.itq_iterations)
    # De-adjust rotation with PC vector
    self.rotation = numpy.dot(pc_top, self.rotation)

    self.save_model()

    return c
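
# --- Example (illustrative) ---
# Hedged usage sketch of fitting the ITQ functor on in-memory descriptors.
# The module paths and constructor parameters shown are assumptions about
# the surrounding SMQTK version; adjust them to the actual ItqFunctor
# signature where this ``fit`` method lives.
import numpy
from smqtk.algorithms.nn_index.lsh.functors.itq import ItqFunctor
from smqtk.representation.descriptor_element.local_elements import \
    DescriptorMemoryElement

descriptors = []
for i in range(1000):
    e = DescriptorMemoryElement('example', i)
    e.set_vector(numpy.random.rand(128))
    descriptors.append(e)

itq = ItqFunctor(bit_length=64, itq_iterations=50, random_seed=0)
codes = itq.fit(descriptors)   # hash codes for the fitted descriptors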
def compute_descriptor_async(self, data_iter,
                             descr_factory=DFLT_DESCRIPTOR_FACTORY,
                             overwrite=False, procs=None, **kwds):
    """
    Asynchronously compute feature data for multiple data items.

    :param data_iter: Iterable of data elements to compute features for.
        These must have UIDs assigned for feature association in the return
        value.
    :type data_iter: collections.Iterable[smqtk.representation.DataElement]

    :param descr_factory: Factory instance to produce the wrapping
        descriptor element instance. The default factory produces
        ``DescriptorMemoryElement`` instances.
    :type descr_factory: smqtk.representation.DescriptorElementFactory

    :param overwrite: Whether or not to force re-computation of descriptor
        vectors for the given data even when precomputed vectors already
        exist in the DescriptorElements generated from the provided
        factory. This will overwrite the persistently stored vectors if the
        provided factory produces a DescriptorElement implementation with
        such storage.
    :type overwrite: bool

    :param procs: Optional specification of how many processors to use
        when pooling sub-tasks. If None, we attempt to use all available
        cores.
    :type procs: int | None

    :raises ValueError: An input DataElement was of a content type that we
        cannot handle.

    :return: Mapping of input DataElement UUIDs to the computed descriptor
        element for that data. DescriptorElement UUIDs are congruent with
        the UUID of the data element it is the descriptor of.
    :rtype: dict[collections.Hashable,
                 smqtk.representation.DescriptorElement]
    """
    self._set_caffe_mode()

    # Create DescriptorElement instances for each data element.
    data_elements = {}
    descr_elements = {}
    self._log.debug("Checking content types; aggregating data/descriptor "
                    "elements.")
    pr = ProgressReporter(self._log.debug, 1.0).start()
    for data in data_iter:
        ct = data.content_type()
        if ct not in self.valid_content_types():
            self._log.error("Cannot compute descriptor from content type "
                            "'%s' data: %s" % (ct, data))
            raise ValueError("Cannot compute descriptor from content type "
                             "'%s' data: %s" % (ct, data))
        data_elements[data.uuid()] = data
        descr_elements[data.uuid()] = \
            descr_factory.new_descriptor(self.name, data.uuid())
        pr.increment_report()
    pr.report()
    self._log.debug("Given %d unique data elements", len(data_elements))

    # Reduce procs down to the number of elements to process if it is
    # smaller.
    if len(data_elements) < (procs or multiprocessing.cpu_count()):
        procs = len(data_elements)
    if procs == 0:
        raise ValueError("No data elements provided")

    # For thread safety, only use .append() and .popleft() (queue).
    uuid4proc = deque()

    def check_get_uuid(descriptor_elem):
        if overwrite or not descriptor_elem.has_vector():
            uuid4proc.append(descriptor_elem.uuid())

    # Using a thread-pool due to in-line function + updating local deque.
    p = multiprocessing.pool.ThreadPool(procs)
    try:
        p.map(check_get_uuid, six.itervalues(descr_elements))
    finally:
        p.close()
        p.join()
    del p
    self._log.debug("%d descriptors already computed",
                    len(data_elements) - len(uuid4proc))

    if uuid4proc:
        self._log.debug("Converting deque to tuple for segmentation")
        uuid4proc = tuple(uuid4proc)

        # Split UUIDs into groups equal to our batch size, and an optional
        # tail group that is less than our batch size.
        tail_size = len(uuid4proc) % self.batch_size
        batch_groups = (len(uuid4proc) - tail_size) // self.batch_size
        self._log.debug("Processing %d batches of size %d", batch_groups,
                        self.batch_size)
        if tail_size:
            self._log.debug("Processing tail group of size %d", tail_size)

        if batch_groups:
            for g in range(batch_groups):
                self._log.debug("Starting batch: %d of %d",
                                g + 1, batch_groups)
                batch_uuids = \
                    uuid4proc[g * self.batch_size:(g + 1) * self.batch_size]
                self._process_batch(batch_uuids, data_elements,
                                    descr_elements, procs,
                                    kwds.get('use_mp', True))

        if tail_size:
            batch_uuids = uuid4proc[-tail_size:]
            self._log.debug("Starting tail batch (size=%d)",
                            len(batch_uuids))
            self._process_batch(batch_uuids, data_elements, descr_elements,
                                procs, kwds.get('use_mp', True))

    self._log.debug("forming output dict")
    return dict((data_elements[k].uuid(), descr_elements[k])
                for k in data_elements)
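
# --- Example (illustrative) ---
# Hedged usage sketch for ``compute_descriptor_async`` with file-backed
# data elements. ``image_paths`` and a fully-constructed ``generator``
# (e.g. a Caffe-based DescriptorGenerator with its model files configured)
# are assumed to exist already.
from smqtk.representation.data_element.file_element import DataFileElement

data_elems = [DataFileElement(p) for p in image_paths]
uuid_to_descr = generator.compute_descriptor_async(data_elems, procs=4)
for uid, d_elem in uuid_to_descr.items():
    print(uid, d_elem.vector().shape)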
        except elasticsearch.ConnectionTimeout as ex:
            log.warning("ElasticSearch timed out (error = %s)", str(ex))
            restart = True
            log.debug("Restarting query from index %d", i)

    log.info("Initializing image download/record parallel iterator")
    img_dl_records = parallel_map(
        dl_image, iter_scan_meta(),
        name='image_download',
        use_multiprocessing=True,
        cores=cores
    )

    # Write out
    log.info("Starting iteration/file-write")
    with open(scan_record, 'w') as record_file:
        pr = ProgressReporter(log.debug, 1.0).start()
        for r in img_dl_records:
            if r is not None:
                cdr_id, local_path, uuid = r
                record_file.write('%s,%s,%s\n'
                                  % (cdr_id, local_path, uuid))
                pr.increment_report()
        pr.report()


def default_config():
    return {
        "image_types": ['jpeg', 'png', 'tiff'],
        "elastic_search": {
            "instance_address": "CHANGEME",
            "index": "CHANGEME",
            "username": "******",