def default_config(): return { "descriptor_generator": make_default_config(DescriptorGenerator.get_impls()), "descriptor_factory": DescriptorElementFactory.get_default_config(), "descriptor_set": make_default_config(DescriptorSet.get_impls()), "optional_data_set": make_default_config(DataSet.get_impls()) }
def default_config():
    return {
        'tool': {
            'girder_api_root': 'http://localhost:8080/api/v1',
            'api_key': None,
            'api_query_batch': 1000,
            'dataset_insert_batch_size': None,
        },
        'plugins': {
            'data_set': make_default_config(DataSet.get_impls()),
        }
    }
def get_default_config(cls):
    d = super(IqrSearch, cls).get_default_config()
    # Remove parent_app slot for later explicit specification.
    del d['parent_app']

    d['iqr_service_url'] = None

    # fill in plugin configs
    d['data_set'] = make_default_config(DataSet.get_impls())

    return d
def main():
    args = cli_parser().parse_args()
    config = cli.utility_main_helper(default_config, args)
    log = logging.getLogger(__name__)

    api_root = config['tool']['girder_api_root']
    api_key = config['tool']['api_key']
    api_query_batch = config['tool']['api_query_batch']
    insert_batch_size = config['tool']['dataset_insert_batch_size']

    # Collect N folder/item/file references on CL and any files referenced.
    #: :type: list[str]
    ids_folder = args.folder
    #: :type: list[str]
    ids_item = args.item
    #: :type: list[str]
    ids_file = args.file

    if args.folder_list:
        with open(args.folder_list) as f:
            ids_folder.extend([fid.strip() for fid in f])
    if args.item_list:
        with open(args.item_list) as f:
            ids_item.extend([iid.strip() for iid in f])
    if args.file_list:
        with open(args.file_list) as f:
            ids_file.extend([fid.strip() for fid in f])

    #: :type: smqtk.representation.DataSet
    data_set = from_config_dict(config['plugins']['data_set'],
                                DataSet.get_impls())

    batch = collections.deque()
    pr = cli.ProgressReporter(log.info, 1.0).start()
    for e in find_girder_files(api_root, ids_folder, ids_item, ids_file,
                               api_key, api_query_batch):
        batch.append(e)
        if insert_batch_size and len(batch) >= insert_batch_size:
            data_set.add_data(*batch)
            batch.clear()
        pr.increment_report()
    pr.report()

    if batch:
        data_set.add_data(*batch)

    log.info('Done')
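# A hedged example of a runtime configuration matching the default_config()
# structure for this tool, as it would presumably be loaded through
# cli.utility_main_helper(); the API key and batch values are placeholders and
# 'data_set' still needs a concrete implementation selected under "type":
#
#     {
#         "tool": {
#             "girder_api_root": "http://localhost:8080/api/v1",
#             "api_key": "<your-girder-api-key>",
#             "api_query_batch": 1000,
#             "dataset_insert_batch_size": 100
#         },
#         "plugins": {
#             "data_set": { "type": null }
#         }
#     }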
def main():
    parser = cli_parser()
    args = parser.parse_args()
    config = cli.utility_main_helper(default_config, args)
    log = logging.getLogger(__name__)
    log.debug("Script arguments:\n%s" % args)

    def iter_input_elements():
        for f in args.input_files:
            f = osp.expanduser(f)
            if osp.isfile(f):
                yield DataFileElement(f)
            else:
                log.debug("Expanding glob: %s" % f)
                for g in glob.glob(f):
                    yield DataFileElement(g)

    log.info("Adding elements to data set")
    ds = from_config_dict(config['data_set'], DataSet.get_impls())
    ds.add_data(*iter_input_elements())
def from_config(cls, config, parent_app):
    """
    Instantiate a new instance of this class given the configuration
    JSON-compliant dictionary encapsulating initialization arguments.

    :param config: JSON compliant dictionary encapsulating a configuration.
    :type config: dict

    :param parent_app: Parent containing flask app instance
    :type parent_app: smqtk.web.search_app.app.search_app

    :return: Constructed instance from the provided config.
    :rtype: IqrSearch

    """
    merged = cls.get_default_config()
    merged.update(config)

    # construct nested objects via configurations
    merged['data_set'] = \
        from_config_dict(merged['data_set'], DataSet.get_impls())

    return cls(parent_app, **merged)
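# A hedged usage sketch combining get_default_config() above with this
# constructor; `app` stands in for an existing smqtk.web.search_app instance,
# and the URL and implementation choice are illustrative assumptions:
#
#     cfg = IqrSearch.get_default_config()
#     cfg['iqr_service_url'] = 'http://localhost:5001'
#     cfg['data_set']['type'] = ...   # name of an available DataSet impl
#     search_mod = IqrSearch.from_config(cfg, parent_app=app)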
def run_file_list(c, filelist_filepath, checkpoint_filepath, batch_size=None,
                  check_image=False):
    """
    Top level function handling configuration and inputs/outputs.

    :param c: Configuration dictionary (JSON)
    :type c: dict

    :param filelist_filepath: Path to a text file that lists paths to data
        files, separated by new lines.
    :type filelist_filepath: str

    :param checkpoint_filepath: Output file to which we write input filepath
        to SHA1 (UUID) relationships.
    :type checkpoint_filepath: str

    :param batch_size: Optional batch size (None default) of data elements to
        process / descriptors to compute at a time. This causes files and
        stores to be written to incrementally during processing instead of
        one single batch transaction at a time.
    :type batch_size: int | None

    :param check_image: Enable checking image loading from file before
        queueing that file for processing. If the check fails, the file is
        skipped instead of a halting exception being raised.
    :type check_image: bool

    """
    log = logging.getLogger(__name__)

    file_paths = [line.strip() for line in open(filelist_filepath)]

    log.info("Making descriptor factory")
    factory = DescriptorElementFactory.from_config(c['descriptor_factory'])

    log.info("Making descriptor set")
    descriptor_set = cast(
        DescriptorSet,
        from_config_dict(c['descriptor_set'], DescriptorSet.get_impls())
    )

    # ``data_set`` added to within the ``iter_valid_elements`` function.
    data_set: Optional[DataSet] = None
    if c['optional_data_set']['type'] is None:
        log.info("Not saving loaded data elements to data set")
    else:
        log.info("Initializing data set to append to")
        data_set = cast(
            DataSet,
            from_config_dict(c['optional_data_set'], DataSet.get_impls())
        )

    log.info("Making descriptor generator '%s'",
             c['descriptor_generator']['type'])
    generator = cast(
        DescriptorGenerator,
        from_config_dict(c['descriptor_generator'],
                         DescriptorGenerator.get_impls())
    )

    def iter_valid_elements():
        def is_valid(file_path):
            e = DataFileElement(file_path)

            if is_valid_element(
                    e, valid_content_types=generator.valid_content_types(),
                    check_image=check_image):
                return e
            else:
                return False

        data_elements: Deque[DataFileElement] = collections.deque()
        valid_files_filter = parallel.parallel_map(is_valid, file_paths,
                                                   name="check-file-type",
                                                   use_multiprocessing=True)
        for dfe in valid_files_filter:
            if dfe:
                yield dfe
                if data_set is not None:
                    data_elements.append(dfe)
                    if batch_size and len(data_elements) == batch_size:
                        log.debug("Adding data element batch to set "
                                  "(size: %d)", len(data_elements))
                        data_set.add_data(*data_elements)
                        data_elements.clear()
        # Elements are only collected if we have a data-set configured, so add
        # any still in the deque to the set.
        if data_set is not None and data_elements:
            log.debug("Adding data elements to set (size: %d)",
                      len(data_elements))
            data_set.add_data(*data_elements)

    log.info("Computing descriptors")
    m = compute_many_descriptors(
        iter_valid_elements(),
        generator,
        factory,
        descriptor_set,
        batch_size=batch_size,
    )

    # Recording computed file paths and associated file UUIDs (SHA1)
    cf = open(checkpoint_filepath, 'w')
    cf_writer = csv.writer(cf)
    try:
        pr = ProgressReporter(log.debug, 1.0).start()
        for de, descr in m:
            # We know that we are using DataFileElements going into
            # compute_many_descriptors, so we can assume that's what comes out
            # of it as well.
            # noinspection PyProtectedMember
            cf_writer.writerow([de._filepath, descr.uuid()])
            pr.increment_report()
        pr.report()
    finally:
        del cf_writer
        cf.close()

    log.info("Done")
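# A hedged end-to-end sketch, assuming a JSON configuration shaped like the
# descriptor-computation default_config() at the top of this section and a
# newline-separated file list; "config.json", "files.txt" and "checkpoint.csv"
# are hypothetical paths, not values taken from this module:
#
#     import json
#     with open('config.json') as f:
#         c = json.load(f)
#     run_file_list(c, 'files.txt', 'checkpoint.csv',
#                   batch_size=256, check_image=True)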
def default_config(): return { "data_set": make_default_config(DataSet.get_impls()) }