def get_default_config(cls):
    """
    Generate and return a default configuration dictionary for this class.

    This will be primarily used for generating what the configuration
    dictionary would look like for this class without instantiating it.

    :return: Default configuration dictionary for the class.
    :rtype: dict
    """
    c = super(NearestNeighborServiceServer, cls).get_default_config()
    merge_dict(c, {
        "descriptor_factory":
            DescriptorElementFactory.get_default_config(),
        "descriptor_generator":
            plugin.make_config(get_descriptor_generator_impls()),
        "nn_index":
            plugin.make_config(get_nn_index_impls()),
        "descriptor_index":
            plugin.make_config(get_descriptor_index_impls()),
        "update_descriptor_index": False,
    })
    return c
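Purely as a usage sketch (not part of the source above): the class-level default configuration can be dumped to JSON to give users an editable template. The import path and output filename below are assumptions.

import json

# Hypothetical import path -- adjust to wherever NearestNeighborServiceServer
# actually lives in the SMQTK package.
from smqtk.web.nn_service import NearestNeighborServiceServer

# Build the default configuration skeleton without instantiating the server,
# then write it out for a user to fill in plugin choices and parameters.
config_skeleton = NearestNeighborServiceServer.get_default_config()
with open("nn_service.config.json", "w") as f:
    json.dump(config_skeleton, f, indent=4, sort_keys=True)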
def get_default_config(cls):
    c = super(SmqtkClassifierService, cls).get_default_config()

    c[cls.CONFIG_ENABLE_CLASSIFIER_REMOVAL] = False

    # Static classifier configurations
    c[cls.CONFIG_CLASSIFIER_COLLECTION] = \
        ClassifierCollection.get_default_config()
    # Classification element factory for new classification results.
    c[cls.CONFIG_CLASSIFICATION_FACTORY] = \
        ClassificationElementFactory.get_default_config()
    # Descriptor generator for new content
    c[cls.CONFIG_DESCRIPTOR_GENERATOR] = smqtk.utils.plugin.make_config(
        get_descriptor_generator_impls()
    )
    # Descriptor factory for new content descriptors
    c[cls.CONFIG_DESCRIPTOR_FACTORY] = \
        DescriptorElementFactory.get_default_config()
    # from-IQR-state *supervised* classifier configuration
    c[cls.CONFIG_IQR_CLASSIFIER] = smqtk.utils.plugin.make_config(
        get_classifier_impls(sub_interface=SupervisedClassifier)
    )
    c[cls.CONFIG_IMMUTABLE_LABELS] = []

    return c
def run_file_list(c, filelist_filepath, checkpoint_filepath, batch_size=None,
                  check_image=False):
    """
    Top level function handling configuration and inputs/outputs.

    :param c: Configuration dictionary (JSON)
    :type c: dict

    :param filelist_filepath: Path to a text file that lists paths to data
        files, separated by new lines.
    :type filelist_filepath: str

    :param checkpoint_filepath: Output file to which we write input filepath
        to SHA1 (UUID) relationships.
    :type checkpoint_filepath: str

    :param batch_size: Optional batch size (None default) of data elements to
        process / descriptors to compute at a time. This causes files and
        stores to be written to incrementally during processing instead of
        one single batch transaction at a time.
    :type batch_size: int | None

    :param check_image: Enable checking image loading from file before
        queueing that file for processing. If the check fails, the file is
        skipped instead of a halting exception being raised.
    :type check_image: bool
    """
    log = logging.getLogger(__name__)

    file_paths = [l.strip() for l in open(filelist_filepath)]

    log.info("Making descriptor factory")
    factory = DescriptorElementFactory.from_config(c['descriptor_factory'])

    log.info("Making descriptor index")
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(c['descriptor_index'],
                                                 get_descriptor_index_impls())

    log.info("Making descriptor generator '%s'",
             c['descriptor_generator']['type'])
    #: :type: smqtk.algorithms.DescriptorGenerator
    generator = plugin.from_plugin_config(c['descriptor_generator'],
                                          get_descriptor_generator_impls())

    def test_image_load(dfe):
        try:
            PIL.Image.open(io.BytesIO(dfe.get_bytes()))
            return True
        except IOError as ex:
            # noinspection PyProtectedMember
            log.warning("Failed to convert '%s' bytes into an image "
                        "(error: %s). Skipping", dfe._filepath, str(ex))
            return False
def default_config():
    return {
        "descriptor_generator":
            plugin.make_config(get_descriptor_generator_impls()),
        "descriptor_factory": DescriptorElementFactory.get_default_config(),
        "descriptor_index":
            plugin.make_config(get_descriptor_index_impls())
    }
def get_default_config():
    return {
        "descriptor_factory": DescriptorElementFactory.get_default_config(),
        "descriptor_generator":
            plugin.make_config(get_descriptor_generator_impls()),
        "classification_factory":
            ClassificationElementFactory.get_default_config(),
        "classifier": plugin.make_config(get_classifier_impls()),
    }
def get_default_config():
    return {
        "descriptor_factory": DescriptorElementFactory.get_default_config(),
        "descriptor_generator":
            plugin.make_config(get_descriptor_generator_impls()),
        "classification_factory":
            ClassificationElementFactory.get_default_config(),
        "classifier": plugin.make_config(get_classifier_impls()),
    }
def run_file_list(c, filelist_filepath, checkpoint_filepath, batch_size=None,
                  check_image=False):
    """
    Top level function handling configuration and inputs/outputs.

    :param c: Configuration dictionary (JSON)
    :type c: dict

    :param filelist_filepath: Path to a text file that lists paths to data
        files, separated by new lines.
    :type filelist_filepath: str

    :param checkpoint_filepath: Output file to which we write input filepath
        to SHA1 (UUID) relationships.
    :type checkpoint_filepath: str

    :param batch_size: Optional batch size (None default) of data elements to
        process / descriptors to compute at a time. This causes files and
        stores to be written to incrementally during processing instead of
        one single batch transaction at a time.
    :type batch_size: int | None

    :param check_image: Enable checking image loading from file before
        queueing that file for processing. If the check fails, the file is
        skipped instead of a halting exception being raised.
    :type check_image: bool
    """
    log = logging.getLogger(__name__)

    file_paths = [l.strip() for l in open(filelist_filepath)]

    log.info("Making descriptor factory")
    factory = DescriptorElementFactory.from_config(c['descriptor_factory'])

    log.info("Making descriptor index")
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(c['descriptor_index'],
                                                 get_descriptor_index_impls())

    log.info("Making descriptor generator '%s'",
             c['descriptor_generator']['type'])
    #: :type: smqtk.algorithms.DescriptorGenerator
    generator = plugin.from_plugin_config(c['descriptor_generator'],
                                          get_descriptor_generator_impls())

    def test_image_load(dfe):
        try:
            PIL.Image.open(io.BytesIO(dfe.get_bytes()))
            return True
        except IOError as ex:
            # noinspection PyProtectedMember
            log.warning("Failed to convert '%s' bytes into an image "
                        "(error: %s). Skipping", dfe._filepath, str(ex))
            return False
def get_default_config(cls):
    """
    Generate and return a default configuration dictionary for this class.

    This will be primarily used for generating what the configuration
    dictionary would look like for this class without instantiating it.

    :return: Default configuration dictionary for the class.
    :rtype: dict
    """
    c = super(NearestNeighborServiceServer, cls).get_default_config()
    merge_dict(c, {
        "descriptor_factory":
            DescriptorElementFactory.get_default_config(),
        "descriptor_generator":
            plugin.make_config(get_descriptor_generator_impls()),
        "nn_index":
            plugin.make_config(get_nn_index_impls()),
        "descriptor_index":
            plugin.make_config(get_descriptor_index_impls()),
        "update_descriptor_index": False,
    })
    return c
def classify_files(config, label, file_globs):
    log = logging.getLogger(__name__)

    #: :type: smqtk.algorithms.Classifier
    classifier = \
        plugin.from_plugin_config(config['classifier'], get_classifier_impls())

    def log_available_labels():
        log.info("Available classifier labels:")
        for l in classifier.get_labels():
            log.info("- %s", l)

    if label is None:
        log_available_labels()
        return
    elif label not in classifier.get_labels():
        log.error("Invalid classification label provided to compute and "
                  "filter on: '%s'", label)
        log_available_labels()
        return

    log.info("Collecting files from globs")
    #: :type: list[DataFileElement]
    data_elements = []
    uuid2filepath = {}
    for g in file_globs:
        if os.path.isfile(g):
            d = DataFileElement(g)
            data_elements.append(d)
            uuid2filepath[d.uuid()] = g
        else:
            log.debug("expanding glob: %s", g)
            for fp in glob.iglob(g):
                d = DataFileElement(fp)
                data_elements.append(d)
                uuid2filepath[d.uuid()] = fp
    if not data_elements:
        raise RuntimeError("No files provided for classification.")

    log.info("Computing descriptors")
    descriptor_factory = \
        DescriptorElementFactory.from_config(config['descriptor_factory'])
    #: :type: smqtk.algorithms.DescriptorGenerator
    descriptor_generator = \
        plugin.from_plugin_config(config['descriptor_generator'],
                                  get_descriptor_generator_impls())
    descr_map = descriptor_generator\
        .compute_descriptor_async(data_elements, descriptor_factory)

    log.info("Classifying descriptors")
    classification_factory = ClassificationElementFactory \
        .from_config(config['classification_factory'])
    classification_map = classifier\
        .classify_async(descr_map.values(), classification_factory)

    log.info("Printing input file paths that classified as the given label.")
    # Map of UUID to classification element:
    uuid2c = dict((c.uuid, c) for c in classification_map.values())
    for data in data_elements:
        if uuid2c[data.uuid()].max_label() == label:
            print(uuid2filepath[data.uuid()])
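A hedged driver sketch for the function above (file names and the label are illustrative only; the configuration is assumed to follow the get_default_config() skeleton shown earlier):

import json
import logging

logging.basicConfig(level=logging.INFO)

with open("classify_files.config.json") as f:  # assumed filled-in config
    config = json.load(f)

# With no label, the function only logs the labels the configured classifier
# reports and then returns.
classify_files(config, None, [])

# With a concrete label, file paths classified as that label are printed.
classify_files(config, "positive", ["/data/images/*.png"])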
def __init__(self, json_config):
    """
    Initialize application based on supplied JSON configuration.

    :param json_config: JSON configuration dictionary
    :type json_config: dict
    """
    super(NearestNeighborServiceServer, self).__init__(json_config)

    self.update_index = json_config['update_descriptor_index']

    # Descriptor factory setup
    self._log.info("Initializing DescriptorElementFactory")
    self.descr_elem_factory = DescriptorElementFactory.from_config(
        self.json_config['descriptor_factory'])

    #: :type: smqtk.representation.DescriptorIndex | None
    self.descr_index = None
    if self.update_index:
        self._log.info("Initializing DescriptorIndex to update")
        #: :type: smqtk.representation.DescriptorIndex | None
        self.descr_index = plugin.from_plugin_config(
            json_config['descriptor_index'],
            get_descriptor_index_impls())

    #: :type: smqtk.algorithms.NearestNeighborsIndex
    self.nn_index = plugin.from_plugin_config(json_config['nn_index'],
                                              get_nn_index_impls())

    #: :type: smqtk.algorithms.DescriptorGenerator
    self.descriptor_generator_inst = plugin.from_plugin_config(
        self.json_config['descriptor_generator'],
        get_descriptor_generator_impls())

    @self.route("/count", methods=['GET'])
    def count():
        """
        Return the number of elements represented in this index.
        """
        return flask.jsonify(**{
            "count": self.nn_index.count(),
        })

    @self.route("/compute/<path:uri>", methods=["POST"])
    def compute(uri):
        """
        Compute the descriptor for a URI-specified data element using the
        configured descriptor generator.

        See the ``compute_nearest_neighbors`` method docstring for accepted
        URI specifications.

        If a descriptor index was configured and updating is turned on, we
        add the computed descriptor to the index.

        JSON Return format::
            {
                "success": <bool>
                "message": <str>
                "descriptor": <None|list[float]>
                "reference_uri": <str>
            }

        :param uri: URI data specification.
        """
        descriptor = None
        try:
            descriptor = self.generate_descriptor_for_uri(uri)
            message = "Descriptor generated"
            descriptor = list(map(float, descriptor.vector()))
        except ValueError as ex:
            message = "Input value issue: %s" % str(ex)
        except RuntimeError as ex:
            message = "Descriptor extraction failure: %s" % str(ex)

        return flask.jsonify(
            success=descriptor is not None,
            message=message,
            descriptor=descriptor,
            reference_uri=uri,
        )

    @self.route("/nn/<path:uri>")
    @self.route("/nn/n=<int:n>/<path:uri>")
    @self.route("/nn/n=<int:n>/<int:start_i>:<int:end_i>/<path:uri>")
    def compute_nearest_neighbors(uri, n=10, start_i=None, end_i=None):
        """
        Data modes for upload/use:
            - local filepath
            - base64
            - http/s URL
            - existing data/descriptor UUID

        The following sub-sections detail how different URIs can be used.

        Local Filepath
        --------------
        The URI string must be prefixed with ``file://``, followed by the
        full path to the data file to describe.

        Base 64 data
        ------------
        The URI string must be prefixed with "base64://", followed by the
        base64-encoded string. This mode also requires an additional
        ``?content_type=`` to provide data content type information. This
        mode saves the encoded data to a temporary file for processing.

        HTTP/S address
        --------------
        This is the default mode when the URI prefix is none of the above.
        This uses the requests module to locally download a data file for
        processing.

        Existing Data/Descriptor by UUID
        --------------------------------
        When given a URI prefixed with "uuid://", we interpret the remainder
        of the URI as the UUID of a descriptor already present in the
        configured descriptor index. If the given UUID is not present in the
        index, a KeyError is raised.
        JSON Return format
        ------------------
            {
                "success": <bool>
                "message": <str>
                "neighbors": <None|list[float]>
                "reference_uri": <str>
            }

        :param n: Number of neighbors to query for.
        :param start_i: The starting index of the neighbor vectors to slice
            into for return.
        :param end_i: The ending index of the neighbor vectors to slice into
            for return.
        :type uri: str
        """
        descriptor = None
        try:
            descriptor = self.generate_descriptor_for_uri(uri)
            message = "descriptor computed"
        except ValueError as ex:
            message = "Input data issue: %s" % str(ex)
        except RuntimeError as ex:
            message = "Descriptor generation failure: %s" % str(ex)

        # Base pagination slicing based on provided start and end indices,
        # otherwise clamp to beginning/ending of queried neighbor sequence.
        page_slice = slice(start_i or 0, end_i or n)
        neighbors = []
        dists = []
        if descriptor is not None:
            try:
                neighbors, dists = \
                    self.nn_index.nn(descriptor, n)
            except ValueError as ex:
                message = "Descriptor or index related issue: %s" % str(ex)

        # TODO: Return the optional descriptor vectors for the neighbors
        # noinspection PyTypeChecker
        d = {
            "success": bool(descriptor is not None),
            "message": message,
            "neighbors": [n.uuid() for n in neighbors[page_slice]],
            "distances": dists[page_slice],
            "reference_uri": uri
        }
        return flask.jsonify(d)
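A client-side sketch against the routes registered above; host, port, and the query file path are assumptions, while the route shapes and response keys come from the handler's return dictionary:

import requests

BASE_URL = "http://localhost:5000"  # hypothetical service address

# Request the 5 nearest neighbors of a file readable by the service.
resp = requests.get(BASE_URL + "/nn/n=5/file:///data/images/query.png")
body = resp.json()
if body["success"]:
    for uuid, dist in zip(body["neighbors"], body["distances"]):
        print(uuid, dist)
else:
    print("Neighbor query failed:", body["message"])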
def __init__(self, json_config):
    """
    Initialize application based on supplied JSON configuration.

    :param json_config: JSON configuration dictionary
    :type json_config: dict
    """
    super(NearestNeighborServiceServer, self).__init__(json_config)

    self.update_index = json_config['update_descriptor_index']

    # Descriptor factory setup
    self._log.info("Initializing DescriptorElementFactory")
    self.descr_elem_factory = DescriptorElementFactory.from_config(
        self.json_config['descriptor_factory']
    )

    #: :type: smqtk.representation.DescriptorIndex | None
    self.descr_index = None
    if self.update_index:
        self._log.info("Initializing DescriptorIndex to update")
        #: :type: smqtk.representation.DescriptorIndex | None
        self.descr_index = plugin.from_plugin_config(
            json_config['descriptor_index'],
            get_descriptor_index_impls()
        )

    #: :type: smqtk.algorithms.NearestNeighborsIndex
    self.nn_index = plugin.from_plugin_config(
        json_config['nn_index'],
        get_nn_index_impls()
    )

    #: :type: smqtk.algorithms.DescriptorGenerator
    self.descriptor_generator_inst = plugin.from_plugin_config(
        self.json_config['descriptor_generator'],
        get_descriptor_generator_impls()
    )

    @self.route("/count", methods=['GET'])
    def count():
        """
        Return the number of elements represented in this index.
        """
        return flask.jsonify(**{
            "count": self.nn_index.count(),
        })

    @self.route("/compute/<path:uri>", methods=["POST"])
    def compute(uri):
        """
        Compute the descriptor for a URI-specified data element using the
        configured descriptor generator.

        If a descriptor index was configured and updating is turned on, we
        add the computed descriptor to the index.

        JSON Return format::
            {
                "success": <bool>
                "message": <str>
                "descriptor": <None|list[float]>
                "reference_uri": <str>
            }

        :param uri: URI data specification.
        """
        descriptor = None
        try:
            _, descriptor = self.generate_descriptor_for_uri(uri)
            message = "Descriptor generated"
            descriptor = list(map(float, descriptor.vector()))
        except ValueError as ex:
            message = "Input value issue: %s" % str(ex)
        except RuntimeError as ex:
            message = "Descriptor extraction failure: %s" % str(ex)
def main():
    parser = cli_parser()
    args = parser.parse_args()

    #
    # Setup logging
    #
    if not logging.getLogger().handlers:
        if args.verbose:
            bin_utils.initialize_logging(logging.getLogger(), logging.DEBUG)
        else:
            bin_utils.initialize_logging(logging.getLogger(), logging.INFO)
    log = logging.getLogger("smqtk.scripts.iqr_app_model_generation")

    search_app_config = json.loads(jsmin.jsmin(open(args.config).read()))

    #
    # Input parameters
    #
    # The following dictionaries are JSON configurations that are used to
    # configure the various data structures and algorithms needed for the
    # IQR demo application. Values here can be changed to suit your specific
    # data and algorithm needs.
    #
    # See algorithm implementation doc-strings for more information on
    # configuration parameters (see implementation class ``__init__`` method).
    #

    # Base actions on a specific IQR tab configuration (choose index here).
    if args.tab < 0 or args.tab > (len(search_app_config["iqr_tabs"]) - 1):
        log.error("Invalid tab number provided.")
        exit(1)

    search_app_iqr_config = search_app_config["iqr_tabs"][args.tab]

    # Configure DataSet implementation and parameters.
    data_set_config = search_app_iqr_config['data_set']

    # Configure DescriptorGenerator algorithm implementation, parameters and
    # persistent model component locations (if the implementation has any).
    descriptor_generator_config = search_app_iqr_config['descr_generator']

    # Configure NearestNeighborIndex algorithm implementation, parameters and
    # persistent model component locations (if the implementation has any).
    nn_index_config = search_app_iqr_config['nn_index']

    # Configure RelevancyIndex algorithm implementation, parameters and
    # persistent model component locations (if the implementation has any).
    #
    # The LibSvmHikRelevancyIndex implementation doesn't actually build a
    # persistent model (or doesn't have to, that is), but we're leaving this
    # block here in anticipation of other potential implementations in the
    # future.
    #
    rel_index_config = search_app_iqr_config['rel_index_config']

    # Configure DescriptorElementFactory instance, which defines what
    # implementation of DescriptorElement to use for storing generated
    # descriptor vectors below.
    descriptor_elem_factory_config = search_app_iqr_config['descriptor_factory']

    #
    # Initialize data/algorithms
    #
    # Constructing appropriate data structures and algorithms, needed for the
    # IQR demo application, in preparation for model training.
    #
    descriptor_elem_factory = \
        representation.DescriptorElementFactory \
        .from_config(descriptor_elem_factory_config)

    #: :type: representation.DataSet
    data_set = \
        plugin.from_plugin_config(data_set_config,
                                  representation.get_data_set_impls())
    #: :type: algorithms.DescriptorGenerator
    descriptor_generator = \
        plugin.from_plugin_config(descriptor_generator_config,
                                  algorithms.get_descriptor_generator_impls())
    #: :type: algorithms.NearestNeighborsIndex
    nn_index = \
        plugin.from_plugin_config(nn_index_config,
                                  algorithms.get_nn_index_impls())
    #: :type: algorithms.RelevancyIndex
    rel_index = \
        plugin.from_plugin_config(rel_index_config,
                                  algorithms.get_relevancy_index_impls())

    #
    # Build models
    #
    # Perform the actual building of the models.
    #

    # Add data files to DataSet.
    DataFileElement = representation.get_data_element_impls()["DataFileElement"]

    for fp in args.input_files:
        fp = osp.expanduser(fp)
        if osp.isfile(fp):
            data_set.add_data(DataFileElement(fp))
        else:
            log.debug("Expanding glob: %s" % fp)
            for g in glob.iglob(fp):
                data_set.add_data(DataFileElement(g))

    # Generate a model if the generator defines a known generation method.
    if hasattr(descriptor_generator, "generate_model"):
        descriptor_generator.generate_model(data_set)
    # Add other if-else cases for other known implementation-specific
    # generation method stubs.

    # Generate descriptors of data for building NN index.
    data2descriptor = descriptor_generator.compute_descriptor_async(
        data_set, descriptor_elem_factory)

    try:
        nn_index.build_index(six.itervalues(data2descriptor))
    except RuntimeError:
        # Already built model, so skipping this step.
        pass

    rel_index.build_index(six.itervalues(data2descriptor))
def main():
    args = cli_parser().parse_args()

    ui_config_filepath, iqr_config_filepath = args.config
    llevel = logging.DEBUG if args.verbose else logging.INFO
    tab = args.tab
    input_files_globs = args.input_files

    # Not using ``bin_utils.utility_main_helper`` due to deviating from
    # single-config-with-default usage.
    bin_utils.initialize_logging(logging.getLogger('smqtk'), llevel)
    bin_utils.initialize_logging(logging.getLogger('__main__'), llevel)
    log = logging.getLogger(__name__)

    log.info("Loading UI config: '{}'".format(ui_config_filepath))
    ui_config, ui_config_loaded = bin_utils.load_config(ui_config_filepath)
    log.info("Loading IQR config: '{}'".format(iqr_config_filepath))
    iqr_config, iqr_config_loaded = bin_utils.load_config(iqr_config_filepath)
    if not (ui_config_loaded and iqr_config_loaded):
        raise RuntimeError("One or both configuration files failed to load.")

    # Ensure the given "tab" exists in the UI configuration.
    if tab is None:
        log.error("No configuration tab provided to drive model generation.")
        exit(1)
    if tab not in ui_config["iqr_tabs"]:
        log.error("Invalid tab provided: '{}'. Available tabs: {}"
                  .format(tab, list(ui_config["iqr_tabs"])))
        exit(1)

    #
    # Gather Configurations
    #
    log.info("Extracting plugin configurations")

    ui_tab_config = ui_config["iqr_tabs"][tab]
    iqr_plugins_config = iqr_config['iqr_service']['plugins']

    # Configure DataSet implementation and parameters.
    data_set_config = ui_tab_config['data_set']

    # Configure DescriptorElementFactory instance, which defines what
    # implementation of DescriptorElement to use for storing generated
    # descriptor vectors below.
    descriptor_elem_factory_config = iqr_plugins_config['descriptor_factory']

    # Configure DescriptorGenerator algorithm implementation, parameters and
    # persistent model component locations (if the implementation has any).
    descriptor_generator_config = iqr_plugins_config['descriptor_generator']

    # Configure NearestNeighborIndex algorithm implementation, parameters and
    # persistent model component locations (if the implementation has any).
    nn_index_config = iqr_plugins_config['neighbor_index']

    #
    # Initialize data/algorithms
    #
    # Constructing appropriate data structures and algorithms, needed for the
    # IQR demo application, in preparation for model training.
    #
    log.info("Instantiating plugins")
    #: :type: representation.DataSet
    data_set = \
        plugin.from_plugin_config(data_set_config,
                                  representation.get_data_set_impls())
    descriptor_elem_factory = \
        representation.DescriptorElementFactory \
        .from_config(descriptor_elem_factory_config)
    #: :type: algorithms.DescriptorGenerator
    descriptor_generator = \
        plugin.from_plugin_config(descriptor_generator_config,
                                  algorithms.get_descriptor_generator_impls())
    #: :type: algorithms.NearestNeighborsIndex
    nn_index = \
        plugin.from_plugin_config(nn_index_config,
                                  algorithms.get_nn_index_impls())

    #
    # Build models
    #
    log.info("Adding files to dataset '{}'".format(data_set))
    for g in input_files_globs:
        g = osp.expanduser(g)
        if osp.isfile(g):
            data_set.add_data(DataFileElement(g, readonly=True))
        else:
            log.debug("Expanding glob: %s" % g)
            for fp in glob.iglob(g):
                data_set.add_data(DataFileElement(fp, readonly=True))

    # Generate a model if the generator defines a known generation method.
    try:
        log.debug("Descriptor generator has model to generate?")
        descriptor_generator.generate_model(data_set)
    except AttributeError as ex:
        log.debug("Descriptor generator has model to generate - Nope: {}"
                  .format(str(ex)))

    # Generate descriptors of data for building NN index.
log.info("Computing descriptors for data set with {}" .format(descriptor_generator)) data2descriptor = descriptor_generator.compute_descriptor_async( data_set, descriptor_elem_factory ) # Possible additional support steps before building NNIndex try: # Fit the LSH index functor log.debug("Has LSH Functor to fit?") nn_index.lsh_functor.fit(six.itervalues(data2descriptor)) except AttributeError as ex: log.debug("Has LSH Functor to fit - Nope: {}".format(str(ex))) log.info("Building nearest neighbors index {}".format(nn_index)) nn_index.build_index(six.itervalues(data2descriptor))
def __init__(self, json_config):
    """
    Initialize application based on supplied JSON configuration.

    :param json_config: JSON configuration dictionary
    :type json_config: dict
    """
    super(NearestNeighborServiceServer, self).__init__(json_config)

    self.update_index = json_config['update_descriptor_index']

    # Descriptor factory setup
    self._log.info("Initializing DescriptorElementFactory")
    self.descr_elem_factory = DescriptorElementFactory.from_config(
        self.json_config['descriptor_factory'])

    #: :type: smqtk.representation.DescriptorIndex | None
    self.descr_index = None
    if self.update_index:
        self._log.info("Initializing DescriptorIndex to update")
        #: :type: smqtk.representation.DescriptorIndex | None
        self.descr_index = plugin.from_plugin_config(
            json_config['descriptor_index'],
            get_descriptor_index_impls())

    #: :type: smqtk.algorithms.NearestNeighborsIndex
    self.nn_index = plugin.from_plugin_config(json_config['nn_index'],
                                              get_nn_index_impls())

    #: :type: smqtk.algorithms.DescriptorGenerator
    self.descriptor_generator_inst = plugin.from_plugin_config(
        self.json_config['descriptor_generator'],
        get_descriptor_generator_impls())

    @self.route("/count", methods=['GET'])
    def count():
        """
        Return the number of elements represented in this index.
        """
        return flask.jsonify(**{
            "count": self.nn_index.count(),
        })

    @self.route("/compute/<path:uri>", methods=["POST"])
    def compute(uri):
        """
        Compute the descriptor for a URI-specified data element using the
        configured descriptor generator.

        If a descriptor index was configured and updating is turned on, we
        add the computed descriptor to the index.

        JSON Return format::
            {
                "success": <bool>
                "message": <str>
                "descriptor": <None|list[float]>
                "reference_uri": <str>
            }

        :param uri: URI data specification.
        """
        descriptor = None
        try:
            _, descriptor = self.generate_descriptor_for_uri(uri)
            message = "Descriptor generated"
            descriptor = list(map(float, descriptor.vector()))
        except ValueError as ex:
            message = "Input value issue: %s" % str(ex)
        except RuntimeError as ex:
            message = "Descriptor extraction failure: %s" % str(ex)
def main():
    parser = cli_parser()
    args = parser.parse_args()

    #
    # Setup logging
    #
    if not logging.getLogger().handlers:
        if args.verbose:
            bin_utils.initialize_logging(logging.getLogger(), logging.DEBUG)
        else:
            bin_utils.initialize_logging(logging.getLogger(), logging.INFO)
    log = logging.getLogger("smqtk.scripts.iqr_app_model_generation")

    search_app_config = json.loads(jsmin.jsmin(open(args.config).read()))

    #
    # Input parameters
    #
    # The following dictionaries are JSON configurations that are used to
    # configure the various data structures and algorithms needed for the
    # IQR demo application. Values here can be changed to suit your specific
    # data and algorithm needs.
    #
    # See algorithm implementation doc-strings for more information on
    # configuration parameters (see implementation class ``__init__`` method).
    #

    # Base actions on a specific IQR tab configuration (choose index here).
    if args.tab < 0 or args.tab > (len(search_app_config["iqr_tabs"]) - 1):
        log.error("Invalid tab number provided.")
        exit(1)

    search_app_iqr_config = search_app_config["iqr_tabs"][args.tab]

    # Configure DataSet implementation and parameters.
    data_set_config = search_app_iqr_config['data_set']

    # Configure DescriptorGenerator algorithm implementation, parameters and
    # persistent model component locations (if the implementation has any).
    descriptor_generator_config = search_app_iqr_config['descr_generator']

    # Configure NearestNeighborIndex algorithm implementation, parameters and
    # persistent model component locations (if the implementation has any).
    nn_index_config = search_app_iqr_config['nn_index']

    # Configure RelevancyIndex algorithm implementation, parameters and
    # persistent model component locations (if the implementation has any).
    #
    # The LibSvmHikRelevancyIndex implementation doesn't actually build a
    # persistent model (or doesn't have to, that is), but we're leaving this
    # block here in anticipation of other potential implementations in the
    # future.
    #
    rel_index_config = search_app_iqr_config['rel_index_config']

    # Configure DescriptorElementFactory instance, which defines what
    # implementation of DescriptorElement to use for storing generated
    # descriptor vectors below.
    descriptor_elem_factory_config = search_app_iqr_config['descriptor_factory']

    #
    # Initialize data/algorithms
    #
    # Constructing appropriate data structures and algorithms, needed for the
    # IQR demo application, in preparation for model training.
    #
    descriptor_elem_factory = \
        representation.DescriptorElementFactory \
        .from_config(descriptor_elem_factory_config)

    #: :type: representation.DataSet
    data_set = \
        plugin.from_plugin_config(data_set_config,
                                  representation.get_data_set_impls())
    #: :type: algorithms.DescriptorGenerator
    descriptor_generator = \
        plugin.from_plugin_config(descriptor_generator_config,
                                  algorithms.get_descriptor_generator_impls())
    #: :type: algorithms.NearestNeighborsIndex
    nn_index = \
        plugin.from_plugin_config(nn_index_config,
                                  algorithms.get_nn_index_impls())
    #: :type: algorithms.RelevancyIndex
    rel_index = \
        plugin.from_plugin_config(rel_index_config,
                                  algorithms.get_relevancy_index_impls())

    #
    # Build models
    #
    # Perform the actual building of the models.
    #

    # Add data files to DataSet.
    DataFileElement = representation.get_data_element_impls()["DataFileElement"]

    for fp in args.input_files:
        fp = osp.expanduser(fp)
        if osp.isfile(fp):
            data_set.add_data(DataFileElement(fp))
        else:
            log.debug("Expanding glob: %s" % fp)
            for g in glob.iglob(fp):
                data_set.add_data(DataFileElement(g))

    # Generate a model if the generator defines a known generation method.
    if hasattr(descriptor_generator, "generate_model"):
        descriptor_generator.generate_model(data_set)
    # Add other if-else cases for other known implementation-specific
    # generation method stubs.

    # Generate descriptors of data for building NN index.
    data2descriptor = descriptor_generator.compute_descriptor_async(
        data_set, descriptor_elem_factory
    )

    try:
        nn_index.build_index(data2descriptor.values())
    except RuntimeError:
        # Already built model, so skipping this step.
        pass

    rel_index.build_index(data2descriptor.values())
def run_file_list(c, filelist_filepath, checkpoint_filepath, batch_size=None,
                  check_image=False):
    """
    Top level function handling configuration and inputs/outputs.

    :param c: Configuration dictionary (JSON)
    :type c: dict

    :param filelist_filepath: Path to a text file that lists paths to data
        files, separated by new lines.
    :type filelist_filepath: str

    :param checkpoint_filepath: Output file to which we write input filepath
        to SHA1 (UUID) relationships.
    :type checkpoint_filepath: str

    :param batch_size: Optional batch size (None default) of data elements to
        process / descriptors to compute at a time. This causes files and
        stores to be written to incrementally during processing instead of
        one single batch transaction at a time.
    :type batch_size: int | None

    :param check_image: Enable checking image loading from file before
        queueing that file for processing. If the check fails, the file is
        skipped instead of a halting exception being raised.
    :type check_image: bool
    """
    log = logging.getLogger(__name__)

    file_paths = [l.strip() for l in open(filelist_filepath)]

    log.info("Making descriptor factory")
    factory = DescriptorElementFactory.from_config(c['descriptor_factory'])

    log.info("Making descriptor index")
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(c['descriptor_index'],
                                                 get_descriptor_index_impls())

    data_set = None
    if c['optional_data_set']['type'] is None:
        log.info("Not saving loaded data elements to data set")
    else:
        log.info("Initializing data set to append to")
        #: :type: smqtk.representation.DataSet
        data_set = plugin.from_plugin_config(c['optional_data_set'],
                                             get_data_set_impls())

    log.info("Making descriptor generator '%s'",
             c['descriptor_generator']['type'])
    #: :type: smqtk.algorithms.DescriptorGenerator
    generator = plugin.from_plugin_config(c['descriptor_generator'],
                                          get_descriptor_generator_impls())

    def iter_valid_elements():
        def is_valid(file_path):
            dfe = DataFileElement(file_path)
            if is_valid_element(
                    dfe, valid_content_types=generator.valid_content_types(),
                    check_image=check_image):
                return dfe
            else:
                return False

        data_elements = collections.deque()
        valid_files_filter = parallel.parallel_map(is_valid, file_paths,
                                                   name="check-file-type",
                                                   use_multiprocessing=True)
        for dfe in valid_files_filter:
            if dfe:
                yield dfe
                if data_set is not None:
                    data_elements.append(dfe)
                    if batch_size and len(data_elements) == batch_size:
                        log.debug("Adding data element batch to set "
                                  "(size: %d)", len(data_elements))
                        data_set.add_data(*data_elements)
                        data_elements.clear()
        # Elements are only collected if we have a data-set configured, so add
        # any still in the deque to the set.
        if data_elements:
            log.debug("Adding data elements to set (size: %d)",
                      len(data_elements))
            data_set.add_data(*data_elements)

    log.info("Computing descriptors")
    m = compute_many_descriptors(
        iter_valid_elements(),
        generator,
        factory,
        descriptor_index,
        batch_size=batch_size,
    )

    # Recording computed file paths and associated file UUIDs (SHA1)
    cf = open(checkpoint_filepath, 'w')
    cf_writer = csv.writer(cf)
    try:
        rps = [0] * 7
        for fp, descr in m:
            cf_writer.writerow([fp, descr.uuid()])
            report_progress(log.debug, rps, 1.)
    finally:
        del cf_writer
        cf.close()

    log.info("Done")
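A hedged driver for the function above; the configuration and file names are assumptions, while the call signature comes from run_file_list itself:

import json
import logging

logging.basicConfig(level=logging.DEBUG)

with open("compute_many_descriptors.config.json") as f:  # assumed config
    c = json.load(f)

run_file_list(
    c,
    "image_paths.txt",   # one data file path per line
    "checkpoint.csv",    # output rows of (input filepath, descriptor UUID)
    batch_size=256,
    check_image=True,
)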
def __init__(self, json_config):
    """
    Initialize application based on supplied JSON configuration.

    :param json_config: JSON configuration dictionary
    :type json_config: dict
    """
    super(NearestNeighborServiceServer, self).__init__(json_config)

    self.update_index = json_config['update_descriptor_index']

    # Descriptor factory setup
    self._log.info("Initializing DescriptorElementFactory")
    self.descr_elem_factory = DescriptorElementFactory.from_config(
        self.json_config['descriptor_factory']
    )

    #: :type: smqtk.representation.DescriptorIndex | None
    self.descr_index = None
    if self.update_index:
        self._log.info("Initializing DescriptorIndex to update")
        #: :type: smqtk.representation.DescriptorIndex | None
        self.descr_index = plugin.from_plugin_config(
            json_config['descriptor_index'],
            get_descriptor_index_impls()
        )

    #: :type: smqtk.algorithms.NearestNeighborsIndex
    self.nn_index = plugin.from_plugin_config(
        json_config['nn_index'],
        get_nn_index_impls()
    )

    #: :type: smqtk.algorithms.DescriptorGenerator
    self.descriptor_generator_inst = plugin.from_plugin_config(
        self.json_config['descriptor_generator'],
        get_descriptor_generator_impls()
    )

    @self.route("/count", methods=['GET'])
    def count():
        """
        Return the number of elements represented in this index.
        """
        return flask.jsonify(**{
            "count": self.nn_index.count(),
        })

    @self.route("/compute/<path:uri>", methods=["POST"])
    def compute(uri):
        """
        Compute the descriptor for a URI-specified data element using the
        configured descriptor generator.

        See the ``compute_nearest_neighbors`` method docstring for accepted
        URI specifications.

        If a descriptor index was configured and updating is turned on, we
        add the computed descriptor to the index.

        JSON Return format::
            {
                "success": <bool>
                "message": <str>
                "descriptor": <None|list[float]>
                "reference_uri": <str>
            }

        :param uri: URI data specification.
        """
        descriptor = None
        try:
            descriptor = self.generate_descriptor_for_uri(uri)
            message = "Descriptor generated"
            descriptor = list(map(float, descriptor.vector()))
        except ValueError as ex:
            message = "Input value issue: %s" % str(ex)
        except RuntimeError as ex:
            message = "Descriptor extraction failure: %s" % str(ex)

        return flask.jsonify(
            success=descriptor is not None,
            message=message,
            descriptor=descriptor,
            reference_uri=uri,
        )

    @self.route("/nn/<path:uri>")
    @self.route("/nn/n=<int:n>/<path:uri>")
    @self.route("/nn/n=<int:n>/<int:start_i>:<int:end_i>/<path:uri>")
    def compute_nearest_neighbors(uri, n=10, start_i=None, end_i=None):
        """
        Data modes for upload/use:
            - local filepath
            - base64
            - http/s URL
            - existing data/descriptor UUID

        The following sub-sections detail how different URIs can be used.

        Local Filepath
        --------------
        The URI string must be prefixed with ``file://``, followed by the
        full path to the data file to describe.

        Base 64 data
        ------------
        The URI string must be prefixed with "base64://", followed by the
        base64-encoded string. This mode also requires an additional
        ``?content_type=`` to provide data content type information. This
        mode saves the encoded data to a temporary file for processing.

        HTTP/S address
        --------------
        This is the default mode when the URI prefix is none of the above.
        This uses the requests module to locally download a data file for
        processing.

        Existing Data/Descriptor by UUID
        --------------------------------
        When given a URI prefixed with "uuid://", we interpret the remainder
        of the URI as the UUID of a descriptor already present in the
        configured descriptor index. If the given UUID is not present in the
        index, a KeyError is raised.
        JSON Return format
        ------------------
            {
                "success": <bool>
                "message": <str>
                "neighbors": <None|list[float]>
                "reference_uri": <str>
            }

        :param n: Number of neighbors to query for.
        :param start_i: The starting index of the neighbor vectors to slice
            into for return.
        :param end_i: The ending index of the neighbor vectors to slice into
            for return.
        :type uri: str
        """
        descriptor = None
        try:
            descriptor = self.generate_descriptor_for_uri(uri)
            message = "descriptor computed"
        except ValueError as ex:
            message = "Input data issue: %s" % str(ex)
        except RuntimeError as ex:
            message = "Descriptor generation failure: %s" % str(ex)

        # Base pagination slicing based on provided start and end indices,
        # otherwise clamp to beginning/ending of queried neighbor sequence.
        page_slice = slice(start_i or 0, end_i or n)
        neighbors = []
        dists = []
        if descriptor is not None:
            try:
                neighbors, dists = \
                    self.nn_index.nn(descriptor, n)
            except ValueError as ex:
                message = "Descriptor or index related issue: %s" % str(ex)

        # TODO: Return the optional descriptor vectors for the neighbors
        # noinspection PyTypeChecker
        d = {
            "success": bool(descriptor is not None),
            "message": message,
            "neighbors": [n.uuid() for n in neighbors[page_slice]],
            "distances": dists[page_slice],
            "reference_uri": uri
        }
        return flask.jsonify(d)
def classify_files(config, label, file_globs):
    log = logging.getLogger(__name__)

    #: :type: smqtk.algorithms.Classifier
    classifier = \
        plugin.from_plugin_config(config['classifier'], get_classifier_impls())

    def log_available_labels():
        log.info("Available classifier labels:")
        for l in classifier.get_labels():
            log.info("- %s", l)

    if label is None:
        log_available_labels()
        return
    elif label not in classifier.get_labels():
        log.error("Invalid classification label provided to compute and "
                  "filter on: '%s'", label)
        log_available_labels()
        return

    log.info("Collecting files from globs")
    #: :type: list[DataFileElement]
    data_elements = []
    uuid2filepath = {}
    for g in file_globs:
        if os.path.isfile(g):
            d = DataFileElement(g)
            data_elements.append(d)
            uuid2filepath[d.uuid()] = g
        else:
            log.debug("expanding glob: %s", g)
            for fp in glob.iglob(g):
                d = DataFileElement(fp)
                data_elements.append(d)
                uuid2filepath[d.uuid()] = fp
    if not data_elements:
        raise RuntimeError("No files provided for classification.")

    log.info("Computing descriptors")
    descriptor_factory = \
        DescriptorElementFactory.from_config(config['descriptor_factory'])
    #: :type: smqtk.algorithms.DescriptorGenerator
    descriptor_generator = \
        plugin.from_plugin_config(config['descriptor_generator'],
                                  get_descriptor_generator_impls())
    descr_map = descriptor_generator\
        .compute_descriptor_async(data_elements, descriptor_factory)

    log.info("Classifying descriptors")
    classification_factory = ClassificationElementFactory \
        .from_config(config['classification_factory'])
    classification_map = classifier\
        .classify_async(list(descr_map.values()), classification_factory)

    log.info("Printing input file paths that classified as the given label.")
    # Map of UUID to classification element:
    uuid2c = dict((c.uuid, c) for c in six.itervalues(classification_map))
    for data in data_elements:
        d_uuid = data.uuid()
        log.debug("'{}' classification map: {}".format(
            uuid2filepath[d_uuid], uuid2c[d_uuid].get_classification()
        ))
        if uuid2c[d_uuid].max_label() == label:
            print(uuid2filepath[d_uuid])