Exemple #1
0
    def get_default_config(cls):
        """
        Generate and return a default configuration dictionary for this class.
        This will be primarily used for generating what the configuration
        dictionary would look like for this class without instantiating it.

        :return: Default configuration dictionary for the class.
        :rtype: dict

        """
        c = super(NearestNeighborServiceServer, cls).get_default_config()
        merge_dict(
            c, {
                "descriptor_factory":
                DescriptorElementFactory.get_default_config(),
                "descriptor_generator":
                plugin.make_config(get_descriptor_generator_impls()),
                "nn_index":
                plugin.make_config(get_nn_index_impls()),
                "descriptor_index":
                plugin.make_config(get_descriptor_index_impls()),
                "update_descriptor_index":
                False,
            })
        return c
Exemple #2
0
    def get_default_config(cls):
        c = super(SmqtkClassifierService, cls).get_default_config()

        c[cls.CONFIG_ENABLE_CLASSIFIER_REMOVAL] = False

        # Static classifier configurations
        c[cls.CONFIG_CLASSIFIER_COLLECTION] = \
            ClassifierCollection.get_default_config()
        # Classification element factory for new classification results.
        c[cls.CONFIG_CLASSIFICATION_FACTORY] = \
            ClassificationElementFactory.get_default_config()
        # Descriptor generator for new content
        c[cls.CONFIG_DESCRIPTOR_GENERATOR] = smqtk.utils.plugin.make_config(
            get_descriptor_generator_impls()
        )
        # Descriptor factory for new content descriptors
        c[cls.CONFIG_DESCRIPTOR_FACTORY] = \
            DescriptorElementFactory.get_default_config()
        # from-IQR-state *supervised* classifier configuration
        c[cls.CONFIG_IQR_CLASSIFIER] = smqtk.utils.plugin.make_config(
            get_classifier_impls(
                sub_interface=SupervisedClassifier
            )
        )
        c[cls.CONFIG_IMMUTABLE_LABELS] = []

        return c
Exemple #3
0
def run_file_list(c,
                  filelist_filepath,
                  checkpoint_filepath,
                  batch_size=None,
                  check_image=False):
    """
    Top level function handling configuration and inputs/outputs.

    :param c: Configuration dictionary (JSON)
    :type c: dict

    :param filelist_filepath: Path to a text file that lists paths to data
        files, separated by new lines.
    :type filelist_filepath: str

    :param checkpoint_filepath: Output file to which we write input filepath to
        SHA1 (UUID) relationships.
    :type checkpoint_filepath:

    :param batch_size: Optional batch size (None default) of data elements to
        process / descriptors to compute at a time. This causes files and
        stores to be written to incrementally during processing instead of
        one single batch transaction at a time.
    :type batch_size:

    :param check_image: Enable checking image loading from file before queueing
        that file for processing. If the check fails, the file is skipped
        instead of a halting exception being raised.
    :type check_image: bool

    """
    log = logging.getLogger(__name__)

    file_paths = [l.strip() for l in open(filelist_filepath)]

    log.info("Making descriptor factory")
    factory = DescriptorElementFactory.from_config(c['descriptor_factory'])

    log.info("Making descriptor index")
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(c['descriptor_index'],
                                                 get_descriptor_index_impls())

    log.info("Making descriptor generator '%s'",
             c['descriptor_generator']['type'])
    #: :type: smqtk.algorithms.DescriptorGenerator
    generator = plugin.from_plugin_config(c['descriptor_generator'],
                                          get_descriptor_generator_impls())

    def test_image_load(dfe):
        try:
            PIL.Image.open(io.BytesIO(dfe.get_bytes()))
            return True
        except IOError, ex:
            # noinspection PyProtectedMember
            log.warn(
                "Failed to convert '%s' bytes into an image "
                "(error: %s). Skipping", dfe._filepath, str(ex))
            return False
Exemple #4
0
def default_config():
    return {
        "descriptor_generator":
            plugin.make_config(get_descriptor_generator_impls()),
        "descriptor_factory": DescriptorElementFactory.get_default_config(),
        "descriptor_index":
            plugin.make_config(get_descriptor_index_impls())
    }
Exemple #5
0
def get_default_config():
    return {
        "descriptor_factory":
        DescriptorElementFactory.get_default_config(),
        "descriptor_generator":
        plugin.make_config(get_descriptor_generator_impls()),
        "classification_factory":
        ClassificationElementFactory.get_default_config(),
        "classifier":
        plugin.make_config(get_classifier_impls()),
    }
Exemple #6
0
def get_default_config():
    return {
        "descriptor_factory":
            DescriptorElementFactory.get_default_config(),
        "descriptor_generator":
            plugin.make_config(get_descriptor_generator_impls()),
        "classification_factory":
            ClassificationElementFactory.get_default_config(),
        "classifier":
            plugin.make_config(get_classifier_impls()),
    }
def run_file_list(c, filelist_filepath, checkpoint_filepath, batch_size=None,
                  check_image=False):
    """
    Top level function handling configuration and inputs/outputs.

    :param c: Configuration dictionary (JSON)
    :type c: dict

    :param filelist_filepath: Path to a text file that lists paths to data
        files, separated by new lines.
    :type filelist_filepath: str

    :param checkpoint_filepath: Output file to which we write input filepath to
        SHA1 (UUID) relationships.
    :type checkpoint_filepath:

    :param batch_size: Optional batch size (None default) of data elements to
        process / descriptors to compute at a time. This causes files and
        stores to be written to incrementally during processing instead of
        one single batch transaction at a time.
    :type batch_size:

    """
    log = logging.getLogger(__name__)

    file_paths = [l.strip() for l in open(filelist_filepath)]

    log.info("Making descriptor factory")
    factory = DescriptorElementFactory.from_config(c['descriptor_factory'])

    log.info("Making descriptor index")
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(c['descriptor_index'],
                                                 get_descriptor_index_impls())

    log.info("Making descriptor generator '%s'",
             c['descriptor_generator']['type'])
    #: :type: smqtk.algorithms.DescriptorGenerator
    generator = plugin.from_plugin_config(c['descriptor_generator'],
                                          get_descriptor_generator_impls())

    def test_image_load(dfe):
        try:
            PIL.Image.open(io.BytesIO(dfe.get_bytes()))
            return True
        except IOError, ex:
            # noinspection PyProtectedMember
            log.warn("Failed to convert '%s' bytes into an image "
                     "(error: %s). Skipping",
                     dfe._filepath, str(ex))
            return False
Exemple #8
0
    def get_default_config(cls):
        """
        Generate and return a default configuration dictionary for this class.
        This will be primarily used for generating what the configuration
        dictionary would look like for this class without instantiating it.

        :return: Default configuration dictionary for the class.
        :rtype: dict

        """
        c = super(NearestNeighborServiceServer, cls).get_default_config()
        merge_dict(c, {
            "descriptor_factory": DescriptorElementFactory.get_default_config(),
            "descriptor_generator":
                plugin.make_config(get_descriptor_generator_impls()),
            "nn_index": plugin.make_config(get_nn_index_impls()),
            "descriptor_index":
                plugin.make_config(get_descriptor_index_impls()),
            "update_descriptor_index": False,
        })
        return c
Exemple #9
0
def classify_files(config, label, file_globs):
    log = logging.getLogger(__name__)

    #: :type: smqtk.algorithms.Classifier
    classifier = \
        plugin.from_plugin_config(config['classifier'],
                                  get_classifier_impls())

    def log_avaialable_labels():
        log.info("Available classifier labels:")
        for l in classifier.get_labels():
            log.info("- %s", l)

    if label is None:
        log_avaialable_labels()
        return
    elif label not in classifier.get_labels():
        log.error(
            "Invalid classification label provided to compute and filter "
            "on: '%s'", label)
        log_avaialable_labels()
        return

    log.info("Collecting files from globs")
    #: :type: list[DataFileElement]
    data_elements = []
    uuid2filepath = {}
    for g in file_globs:
        if os.path.isfile(g):
            d = DataFileElement(g)
            data_elements.append(d)
            uuid2filepath[d.uuid()] = g
        else:
            log.debug("expanding glob: %s", g)
            for fp in glob.iglob(g):
                d = DataFileElement(fp)
                data_elements.append(d)
                uuid2filepath[d.uuid()] = fp
    if not data_elements:
        raise RuntimeError("No files provided for classification.")

    log.info("Computing descriptors")
    descriptor_factory = \
        DescriptorElementFactory.from_config(config['descriptor_factory'])
    #: :type: smqtk.algorithms.DescriptorGenerator
    descriptor_generator = \
        plugin.from_plugin_config(config['descriptor_generator'],
                                  get_descriptor_generator_impls())
    descr_map = descriptor_generator\
        .compute_descriptor_async(data_elements, descriptor_factory)

    log.info("Classifying descriptors")
    classification_factory = ClassificationElementFactory \
        .from_config(config['classification_factory'])
    classification_map = classifier\
        .classify_async(descr_map.values(), classification_factory)

    log.info("Printing input file paths that classified as the given label.")
    # map of UUID to filepath:
    uuid2c = dict((c.uuid, c) for c in classification_map.itervalues())
    for data in data_elements:
        if uuid2c[data.uuid()].max_label() == label:
            print uuid2filepath[data.uuid()]
Exemple #10
0
    def __init__(self, json_config):
        """
        Initialize application based of supplied JSON configuration

        :param json_config: JSON configuration dictionary
        :type json_config: dict

        """
        super(NearestNeighborServiceServer, self).__init__(json_config)

        self.update_index = json_config['update_descriptor_index']

        # Descriptor factory setup
        self._log.info("Initializing DescriptorElementFactory")
        self.descr_elem_factory = DescriptorElementFactory.from_config(
            self.json_config['descriptor_factory'])

        #: :type: smqtk.representation.DescriptorIndex | None
        self.descr_index = None
        if self.update_index:
            self._log.info("Initializing DescriptorIndex to update")
            #: :type: smqtk.representation.DescriptorIndex | None
            self.descr_index = plugin.from_plugin_config(
                json_config['descriptor_index'], get_descriptor_index_impls())

        #: :type: smqtk.algorithms.NearestNeighborsIndex
        self.nn_index = plugin.from_plugin_config(json_config['nn_index'],
                                                  get_nn_index_impls())

        #: :type: smqtk.algorithms.DescriptorGenerator
        self.descriptor_generator_inst = plugin.from_plugin_config(
            self.json_config['descriptor_generator'],
            get_descriptor_generator_impls())

        @self.route("/count", methods=['GET'])
        def count():
            """
            Return the number of elements represented in this index.
            """
            return flask.jsonify(**{
                "count": self.nn_index.count(),
            })

        @self.route("/compute/<path:uri>", methods=["POST"])
        def compute(uri):
            """
            Compute the descriptor for a URI specified data element using the
            configured descriptor generator.

            See ``compute_nearest_neighbors`` method docstring for URI
            specifications accepted.

            If the a descriptor index was configured and update was turned on,
            we add the computed descriptor to the index.

            JSON Return format::
                {
                    "success": <bool>

                    "message": <str>

                    "descriptor": <None|list[float]>

                    "reference_uri": <str>
                }

            :param uri: URI data specification.

            """
            descriptor = None
            try:
                descriptor = self.generate_descriptor_for_uri(uri)
                message = "Descriptor generated"
                descriptor = list(map(float, descriptor.vector()))
            except ValueError as ex:
                message = "Input value issue: %s" % str(ex)
            except RuntimeError as ex:
                message = "Descriptor extraction failure: %s" % str(ex)

            return flask.jsonify(
                success=descriptor is not None,
                message=message,
                descriptor=descriptor,
                reference_uri=uri,
            )

        @self.route("/nn/<path:uri>")
        @self.route("/nn/n=<int:n>/<path:uri>")
        @self.route("/nn/n=<int:n>/<int:start_i>:<int:end_i>/<path:uri>")
        def compute_nearest_neighbors(uri, n=10, start_i=None, end_i=None):
            """
            Data modes for upload/use:

                - local filepath
                - base64
                - http/s URL
                - existing data/descriptor UUID

            The following sub-sections detail how different URI's can be used.

            Local Filepath
            --------------
            The URI string must be prefixed with ``file://``, followed by the
            full path to the data file to describe.

            Base 64 data
            ------------
            The URI string must be prefixed with "base64://", followed by the
            base64 encoded string. This mode also requires an additional
            ``?content_type=`` to provide data content type information. This
            mode saves the encoded data to temporary file for processing.

            HTTP/S address
            --------------
            This is the default mode when the URI prefix is none of the above.
            This uses the requests module to locally download a data file
            for processing.

            Existing Data/Descriptor by UUID
            --------------------------------
            When given a uri prefixed with "uuid://", we interpret the remainder
            of the uri as the UUID of a descriptor already present in the
            configured descriptor index. If the given UUID is not present in the
            index, a KeyError is raised.

            JSON Return format
            ------------------
                {
                    "success": <bool>

                    "message": <str>

                    "neighbors": <None|list[float]>

                    "reference_uri": <str>
                }

            :param n: Number of neighbors to query for
            :param start_i: The starting index of the neighbor vectors to slice
                into for return.
            :param end_i: The ending index of the neighbor vectors to slice
                into for return.
            :type uri: str

            """
            descriptor = None
            try:
                descriptor = self.generate_descriptor_for_uri(uri)
                message = "descriptor computed"
            except ValueError as ex:
                message = "Input data issue: %s" % str(ex)
            except RuntimeError as ex:
                message = "Descriptor generation failure: %s" % str(ex)

            # Base pagination slicing based on provided start and end indices,
            # otherwise clamp to beginning/ending of queried neighbor sequence.
            page_slice = slice(start_i or 0, end_i or n)
            neighbors = []
            dists = []
            if descriptor is not None:
                try:
                    neighbors, dists = \
                        self.nn_index.nn(descriptor, n)
                except ValueError as ex:
                    message = "Descriptor or index related issue: %s" % str(ex)

            # TODO: Return the optional descriptor vectors for the neighbors
            # noinspection PyTypeChecker
            d = {
                "success": bool(descriptor is not None),
                "message": message,
                "neighbors": [n.uuid() for n in neighbors[page_slice]],
                "distances": dists[page_slice],
                "reference_uri": uri
            }
            return flask.jsonify(d)
Exemple #11
0
    def __init__(self, json_config):
        """
        Initialize application based of supplied JSON configuration

        :param json_config: JSON configuration dictionary
        :type json_config: dict

        """
        super(NearestNeighborServiceServer, self).__init__(json_config)

        self.update_index = json_config['update_descriptor_index']

        # Descriptor factory setup
        self._log.info("Initializing DescriptorElementFactory")
        self.descr_elem_factory = DescriptorElementFactory.from_config(
            self.json_config['descriptor_factory']
        )

        #: :type: smqtk.representation.DescriptorIndex | None
        self.descr_index = None
        if self.update_index:
            self._log.info("Initializing DescriptorIndex to update")
            #: :type: smqtk.representation.DescriptorIndex | None
            self.descr_index = plugin.from_plugin_config(
                json_config['descriptor_index'],
                get_descriptor_index_impls()
            )

        #: :type: smqtk.algorithms.NearestNeighborsIndex
        self.nn_index = plugin.from_plugin_config(
            json_config['nn_index'],
            get_nn_index_impls()
        )

        #: :type: smqtk.algorithms.DescriptorGenerator
        self.descriptor_generator_inst = plugin.from_plugin_config(
            self.json_config['descriptor_generator'],
            get_descriptor_generator_impls()
        )

        @self.route("/count", methods=['GET'])
        def count():
            """
            Return the number of elements represented in this index.
            """
            return flask.jsonify(**{
                "count": self.nn_index.count(),
            })

        @self.route("/compute/<path:uri>", methods=["POST"])
        def compute(uri):
            """
            Compute the descriptor for a URI specified data element using the
            configured descriptor generator.

            If the a descriptor index was configured and update was turned on,
            we add the computed descriptor to the index.

            JSON Return format::
                {
                    "success": <bool>

                    "message": <str>

                    "descriptor": <None|list[float]>

                    "reference_uri": <str>
                }

            :param uri: URI data specification.

            """
            descriptor = None
            try:
                _, descriptor = self.generate_descriptor_for_uri(uri)
                message = "Descriptor generated"
                descriptor = map(float, descriptor.vector())
            except ValueError, ex:
                message = "Input value issue: %s" % str(ex)
            except RuntimeError, ex:
                message = "Descriptor extraction failure: %s" % str(ex)
Exemple #12
0
def main():
    parser = cli_parser()
    args = parser.parse_args()

    #
    # Setup logging
    #
    if not logging.getLogger().handlers:
        if args.verbose:
            bin_utils.initialize_logging(logging.getLogger(), logging.DEBUG)
        else:
            bin_utils.initialize_logging(logging.getLogger(), logging.INFO)
    log = logging.getLogger("smqtk.scripts.iqr_app_model_generation")

    search_app_config = json.loads(jsmin.jsmin(open(args.config).read()))

    #
    # Input parameters
    #
    # The following dictionaries are JSON configurations that are used to
    # configure the various data structures and algorithms needed for the IQR demo
    # application. Values here can be changed to suit your specific data and
    # algorithm needs.
    #
    # See algorithm implementation doc-strings for more information on configuration
    # parameters (see implementation class ``__init__`` method).
    #

    # base actions on a specific IQR tab configuration (choose index here)
    if args.tab < 0 or args.tab > (len(search_app_config["iqr_tabs"]) - 1):
        log.error("Invalid tab number provided.")
        exit(1)

    search_app_iqr_config = search_app_config["iqr_tabs"][args.tab]

    # Configure DataSet implementation and parameters
    data_set_config = search_app_iqr_config['data_set']

    # Configure DescriptorGenerator algorithm implementation, parameters and
    # persistant model component locations (if implementation has any).
    descriptor_generator_config = search_app_iqr_config['descr_generator']

    # Configure NearestNeighborIndex algorithm implementation, parameters and
    # persistant model component locations (if implementation has any).
    nn_index_config = search_app_iqr_config['nn_index']

    # Configure RelevancyIndex algorithm implementation, parameters and
    # persistant model component locations (if implementation has any).
    #
    # The LibSvmHikRelevancyIndex implementation doesn't actually build a persistant
    # model (or doesn't have to that is), but we're leaving this block here in
    # anticipation of other potential implementations in the future.
    #
    rel_index_config = search_app_iqr_config['rel_index_config']

    # Configure DescriptorElementFactory instance, which defines what implementation
    # of DescriptorElement to use for storing generated descriptor vectors below.
    descriptor_elem_factory_config = search_app_iqr_config[
        'descriptor_factory']

    #
    # Initialize data/algorithms
    #
    # Constructing appropriate data structures and algorithms, needed for the IQR
    # demo application, in preparation for model training.
    #

    descriptor_elem_factory = \
        representation.DescriptorElementFactory \
        .from_config(descriptor_elem_factory_config)

    #: :type: representation.DataSet
    data_set = \
        plugin.from_plugin_config(data_set_config,
                                  representation.get_data_set_impls())
    #: :type: algorithms.DescriptorGenerator
    descriptor_generator = \
        plugin.from_plugin_config(descriptor_generator_config,
                                  algorithms.get_descriptor_generator_impls())

    #: :type: algorithms.NearestNeighborsIndex
    nn_index = \
        plugin.from_plugin_config(nn_index_config,
                                  algorithms.get_nn_index_impls())

    #: :type: algorithms.RelevancyIndex
    rel_index = \
        plugin.from_plugin_config(rel_index_config,
                                  algorithms.get_relevancy_index_impls())

    #
    # Build models
    #
    # Perform the actual building of the models.
    #

    # Add data files to DataSet
    DataFileElement = representation.get_data_element_impls(
    )["DataFileElement"]

    for fp in args.input_files:
        fp = osp.expanduser(fp)
        if osp.isfile(fp):
            data_set.add_data(DataFileElement(fp))
        else:
            log.debug("Expanding glob: %s" % fp)
            for g in glob.iglob(fp):
                data_set.add_data(DataFileElement(g))

    # Generate a mode if the generator defines a known generation method.
    if hasattr(descriptor_generator, "generate_model"):
        descriptor_generator.generate_model(data_set)
    # Add other if-else cases for other known implementation-specific generation
    # methods stubs

    # Generate descriptors of data for building NN index.
    data2descriptor = descriptor_generator.compute_descriptor_async(
        data_set, descriptor_elem_factory)

    try:
        nn_index.build_index(six.itervalues(data2descriptor))
    except RuntimeError:
        # Already built model, so skipping this step
        pass

    rel_index.build_index(six.itervalues(data2descriptor))
Exemple #13
0
def main():
    args = cli_parser().parse_args()

    ui_config_filepath, iqr_config_filepath = args.config
    llevel = logging.DEBUG if args.verbose else logging.INFO
    tab = args.tab
    input_files_globs = args.input_files

    # Not using `bin_utils.utility_main_helper`` due to deviating from single-
    # config-with-default usage.
    bin_utils.initialize_logging(logging.getLogger('smqtk'), llevel)
    bin_utils.initialize_logging(logging.getLogger('__main__'), llevel)
    log = logging.getLogger(__name__)

    log.info("Loading UI config: '{}'".format(ui_config_filepath))
    ui_config, ui_config_loaded = bin_utils.load_config(ui_config_filepath)
    log.info("Loading IQR config: '{}'".format(iqr_config_filepath))
    iqr_config, iqr_config_loaded = bin_utils.load_config(iqr_config_filepath)
    if not (ui_config_loaded and iqr_config_loaded):
        raise RuntimeError("One or both configuration files failed to load.")

    # Ensure the given "tab" exists in UI configuration.
    if tab is None:
        log.error("No configuration tab provided to drive model generation.")
        exit(1)
    if tab not in ui_config["iqr_tabs"]:
        log.error("Invalid tab provided: '{}'. Available tags: {}"
                  .format(tab, list(ui_config["iqr_tabs"])))
        exit(1)

    #
    # Gather Configurations
    #
    log.info("Extracting plugin configurations")

    ui_tab_config = ui_config["iqr_tabs"][tab]
    iqr_plugins_config = iqr_config['iqr_service']['plugins']

    # Configure DataSet implementation and parameters
    data_set_config = ui_tab_config['data_set']

    # Configure DescriptorElementFactory instance, which defines what
    # implementation of DescriptorElement to use for storing generated
    # descriptor vectors below.
    descriptor_elem_factory_config = iqr_plugins_config['descriptor_factory']

    # Configure DescriptorGenerator algorithm implementation, parameters and
    # persistent model component locations (if implementation has any).
    descriptor_generator_config = iqr_plugins_config['descriptor_generator']

    # Configure NearestNeighborIndex algorithm implementation, parameters and
    # persistent model component locations (if implementation has any).
    nn_index_config = iqr_plugins_config['neighbor_index']

    #
    # Initialize data/algorithms
    #
    # Constructing appropriate data structures and algorithms, needed for the
    # IQR demo application, in preparation for model training.
    #
    log.info("Instantiating plugins")
    #: :type: representation.DataSet
    data_set = \
        plugin.from_plugin_config(data_set_config,
                                  representation.get_data_set_impls())
    descriptor_elem_factory = \
        representation.DescriptorElementFactory \
        .from_config(descriptor_elem_factory_config)
    #: :type: algorithms.DescriptorGenerator
    descriptor_generator = \
        plugin.from_plugin_config(descriptor_generator_config,
                                  algorithms.get_descriptor_generator_impls())
    #: :type: algorithms.NearestNeighborsIndex
    nn_index = \
        plugin.from_plugin_config(nn_index_config,
                                  algorithms.get_nn_index_impls())

    #
    # Build models
    #
    log.info("Adding files to dataset '{}'".format(data_set))
    for g in input_files_globs:
        g = osp.expanduser(g)
        if osp.isfile(g):
            data_set.add_data(DataFileElement(g, readonly=True))
        else:
            log.debug("Expanding glob: %s" % g)
            for fp in glob.iglob(g):
                data_set.add_data(DataFileElement(fp, readonly=True))

    # Generate a model if the generator defines a known generation method.
    try:
        log.debug("descriptor generator as model to generate?")
        descriptor_generator.generate_model(data_set)
    except AttributeError as ex:
        log.debug("descriptor generator as model to generate - Nope: {}"
                  .format(str(ex)))

    # Generate descriptors of data for building NN index.
    log.info("Computing descriptors for data set with {}"
             .format(descriptor_generator))
    data2descriptor = descriptor_generator.compute_descriptor_async(
        data_set, descriptor_elem_factory
    )

    # Possible additional support steps before building NNIndex
    try:
        # Fit the LSH index functor
        log.debug("Has LSH Functor to fit?")
        nn_index.lsh_functor.fit(six.itervalues(data2descriptor))
    except AttributeError as ex:
        log.debug("Has LSH Functor to fit - Nope: {}".format(str(ex)))

    log.info("Building nearest neighbors index {}".format(nn_index))
    nn_index.build_index(six.itervalues(data2descriptor))
Exemple #14
0
    def __init__(self, json_config):
        """
        Initialize application based of supplied JSON configuration

        :param json_config: JSON configuration dictionary
        :type json_config: dict

        """
        super(NearestNeighborServiceServer, self).__init__(json_config)

        self.update_index = json_config['update_descriptor_index']

        # Descriptor factory setup
        self._log.info("Initializing DescriptorElementFactory")
        self.descr_elem_factory = DescriptorElementFactory.from_config(
            self.json_config['descriptor_factory'])

        #: :type: smqtk.representation.DescriptorIndex | None
        self.descr_index = None
        if self.update_index:
            self._log.info("Initializing DescriptorIndex to update")
            #: :type: smqtk.representation.DescriptorIndex | None
            self.descr_index = plugin.from_plugin_config(
                json_config['descriptor_index'], get_descriptor_index_impls())

        #: :type: smqtk.algorithms.NearestNeighborsIndex
        self.nn_index = plugin.from_plugin_config(json_config['nn_index'],
                                                  get_nn_index_impls())

        #: :type: smqtk.algorithms.DescriptorGenerator
        self.descriptor_generator_inst = plugin.from_plugin_config(
            self.json_config['descriptor_generator'],
            get_descriptor_generator_impls())

        @self.route("/count", methods=['GET'])
        def count():
            """
            Return the number of elements represented in this index.
            """
            return flask.jsonify(**{
                "count": self.nn_index.count(),
            })

        @self.route("/compute/<path:uri>", methods=["POST"])
        def compute(uri):
            """
            Compute the descriptor for a URI specified data element using the
            configured descriptor generator.

            If the a descriptor index was configured and update was turned on,
            we add the computed descriptor to the index.

            JSON Return format::
                {
                    "success": <bool>

                    "message": <str>

                    "descriptor": <None|list[float]>

                    "reference_uri": <str>
                }

            :param uri: URI data specification.

            """
            descriptor = None
            try:
                _, descriptor = self.generate_descriptor_for_uri(uri)
                message = "Descriptor generated"
                descriptor = map(float, descriptor.vector())
            except ValueError, ex:
                message = "Input value issue: %s" % str(ex)
            except RuntimeError, ex:
                message = "Descriptor extraction failure: %s" % str(ex)
def main():
    parser = cli_parser()
    args = parser.parse_args()

    #
    # Setup logging
    #
    if not logging.getLogger().handlers:
        if args.verbose:
            bin_utils.initialize_logging(logging.getLogger(), logging.DEBUG)
        else:
            bin_utils.initialize_logging(logging.getLogger(), logging.INFO)
    log = logging.getLogger("smqtk.scripts.iqr_app_model_generation")

    search_app_config = json.loads(jsmin.jsmin(open(args.config).read()))

    #
    # Input parameters
    #
    # The following dictionaries are JSON configurations that are used to
    # configure the various data structures and algorithms needed for the IQR demo
    # application. Values here can be changed to suit your specific data and
    # algorithm needs.
    #
    # See algorithm implementation doc-strings for more information on configuration
    # parameters (see implementation class ``__init__`` method).
    #

    # base actions on a specific IQR tab configuration (choose index here)
    if args.tab <  0 or args.tab > (len(search_app_config["iqr_tabs"]) - 1):
        log.error("Invalid tab number provided.")
        exit(1)

    search_app_iqr_config = search_app_config["iqr_tabs"][args.tab]

    # Configure DataSet implementation and parameters
    data_set_config = search_app_iqr_config['data_set']

    # Configure DescriptorGenerator algorithm implementation, parameters and
    # persistant model component locations (if implementation has any).
    descriptor_generator_config = search_app_iqr_config['descr_generator']

    # Configure NearestNeighborIndex algorithm implementation, parameters and
    # persistant model component locations (if implementation has any).
    nn_index_config = search_app_iqr_config['nn_index']

    # Configure RelevancyIndex algorithm implementation, parameters and
    # persistant model component locations (if implementation has any).
    #
    # The LibSvmHikRelevancyIndex implementation doesn't actually build a persistant
    # model (or doesn't have to that is), but we're leaving this block here in
    # anticipation of other potential implementations in the future.
    #
    rel_index_config = search_app_iqr_config['rel_index_config']

    # Configure DescriptorElementFactory instance, which defines what implementation
    # of DescriptorElement to use for storing generated descriptor vectors below.
    descriptor_elem_factory_config = search_app_iqr_config['descriptor_factory']

    #
    # Initialize data/algorithms
    #
    # Constructing appropriate data structures and algorithms, needed for the IQR
    # demo application, in preparation for model training.
    #

    descriptor_elem_factory = \
        representation.DescriptorElementFactory \
        .from_config(descriptor_elem_factory_config)

    #: :type: representation.DataSet
    data_set = \
        plugin.from_plugin_config(data_set_config,
                                  representation.get_data_set_impls())
    #: :type: algorithms.DescriptorGenerator
    descriptor_generator = \
        plugin.from_plugin_config(descriptor_generator_config,
                                  algorithms.get_descriptor_generator_impls())

    #: :type: algorithms.NearestNeighborsIndex
    nn_index = \
        plugin.from_plugin_config(nn_index_config,
                                  algorithms.get_nn_index_impls())

    #: :type: algorithms.RelevancyIndex
    rel_index = \
        plugin.from_plugin_config(rel_index_config,
                                  algorithms.get_relevancy_index_impls())

    #
    # Build models
    #
    # Perform the actual building of the models.
    #

    # Add data files to DataSet
    DataFileElement = representation.get_data_element_impls()["DataFileElement"]

    for fp in args.input_files:
        fp = osp.expanduser(fp)
        if osp.isfile(fp):
            data_set.add_data(DataFileElement(fp))
        else:
            log.debug("Expanding glob: %s" % fp)
            for g in glob.iglob(fp):
                data_set.add_data(DataFileElement(g))

    # Generate a mode if the generator defines a known generation method.
    if hasattr(descriptor_generator, "generate_model"):
        descriptor_generator.generate_model(data_set)
    # Add other if-else cases for other known implementation-specific generation
    # methods stubs

    # Generate descriptors of data for building NN index.
    data2descriptor = descriptor_generator.compute_descriptor_async(
        data_set, descriptor_elem_factory
    )

    try:
        nn_index.build_index(data2descriptor.itervalues())
    except RuntimeError:
        # Already built model, so skipping this step
        pass

    rel_index.build_index(data2descriptor.itervalues())
Exemple #16
0
def run_file_list(c,
                  filelist_filepath,
                  checkpoint_filepath,
                  batch_size=None,
                  check_image=False):
    """
    Top level function handling configuration and inputs/outputs.

    :param c: Configuration dictionary (JSON)
    :type c: dict

    :param filelist_filepath: Path to a text file that lists paths to data
        files, separated by new lines.
    :type filelist_filepath: str

    :param checkpoint_filepath: Output file to which we write input filepath to
        SHA1 (UUID) relationships.
    :type checkpoint_filepath:

    :param batch_size: Optional batch size (None default) of data elements to
        process / descriptors to compute at a time. This causes files and
        stores to be written to incrementally during processing instead of
        one single batch transaction at a time.
    :type batch_size:

    :param check_image: Enable checking image loading from file before queueing
        that file for processing. If the check fails, the file is skipped
        instead of a halting exception being raised.
    :type check_image: bool

    """
    log = logging.getLogger(__name__)

    file_paths = [l.strip() for l in open(filelist_filepath)]

    log.info("Making descriptor factory")
    factory = DescriptorElementFactory.from_config(c['descriptor_factory'])

    log.info("Making descriptor index")
    #: :type: smqtk.representation.DescriptorIndex
    descriptor_index = plugin.from_plugin_config(c['descriptor_index'],
                                                 get_descriptor_index_impls())

    data_set = None
    if c['optional_data_set']['type'] is None:
        log.info("Not saving loaded data elements to data set")
    else:
        log.info("Initializing data set to append to")
        #: :type: smqtk.representation.DataSet
        data_set = plugin.from_plugin_config(c['optional_data_set'],
                                             get_data_set_impls())

    log.info("Making descriptor generator '%s'",
             c['descriptor_generator']['type'])
    #: :type: smqtk.algorithms.DescriptorGenerator
    generator = plugin.from_plugin_config(c['descriptor_generator'],
                                          get_descriptor_generator_impls())

    def iter_valid_elements():
        def is_valid(file_path):
            dfe = DataFileElement(file_path)

            if is_valid_element(
                    dfe,
                    valid_content_types=generator.valid_content_types(),
                    check_image=check_image):
                return dfe
            else:
                return False

        data_elements = collections.deque()
        valid_files_filter = parallel.parallel_map(is_valid,
                                                   file_paths,
                                                   name="check-file-type",
                                                   use_multiprocessing=True)
        for dfe in valid_files_filter:
            if dfe:
                yield dfe
                if data_set is not None:
                    data_elements.append(dfe)
                    if batch_size and len(data_elements) == batch_size:
                        log.debug(
                            "Adding data element batch to set (size: %d)",
                            len(data_elements))
                        data_set.add_data(*data_elements)
                        data_elements.clear()
        # elements only collected if we have a data-set configured, so add any
        # still in the deque to the set
        if data_elements:
            log.debug("Adding data elements to set (size: %d",
                      len(data_elements))
            data_set.add_data(*data_elements)

    log.info("Computing descriptors")
    m = compute_many_descriptors(
        iter_valid_elements(),
        generator,
        factory,
        descriptor_index,
        batch_size=batch_size,
    )

    # Recording computed file paths and associated file UUIDs (SHA1)
    cf = open(checkpoint_filepath, 'w')
    cf_writer = csv.writer(cf)
    try:
        rps = [0] * 7
        for fp, descr in m:
            cf_writer.writerow([fp, descr.uuid()])
            report_progress(log.debug, rps, 1.)
    finally:
        del cf_writer
        cf.close()

    log.info("Done")
Exemple #17
0
    def __init__(self, json_config):
        """
        Initialize application based of supplied JSON configuration

        :param json_config: JSON configuration dictionary
        :type json_config: dict

        """
        super(NearestNeighborServiceServer, self).__init__(json_config)

        self.update_index = json_config['update_descriptor_index']

        # Descriptor factory setup
        self._log.info("Initializing DescriptorElementFactory")
        self.descr_elem_factory = DescriptorElementFactory.from_config(
            self.json_config['descriptor_factory']
        )

        #: :type: smqtk.representation.DescriptorIndex | None
        self.descr_index = None
        if self.update_index:
            self._log.info("Initializing DescriptorIndex to update")
            #: :type: smqtk.representation.DescriptorIndex | None
            self.descr_index = plugin.from_plugin_config(
                json_config['descriptor_index'],
                get_descriptor_index_impls()
            )

        #: :type: smqtk.algorithms.NearestNeighborsIndex
        self.nn_index = plugin.from_plugin_config(
            json_config['nn_index'],
            get_nn_index_impls()
        )

        #: :type: smqtk.algorithms.DescriptorGenerator
        self.descriptor_generator_inst = plugin.from_plugin_config(
            self.json_config['descriptor_generator'],
            get_descriptor_generator_impls()
        )

        @self.route("/count", methods=['GET'])
        def count():
            """
            Return the number of elements represented in this index.
            """
            return flask.jsonify(**{
                "count": self.nn_index.count(),
            })

        @self.route("/compute/<path:uri>", methods=["POST"])
        def compute(uri):
            """
            Compute the descriptor for a URI specified data element using the
            configured descriptor generator.

            See ``compute_nearest_neighbors`` method docstring for URI
            specifications accepted.

            If the a descriptor index was configured and update was turned on,
            we add the computed descriptor to the index.

            JSON Return format::
                {
                    "success": <bool>

                    "message": <str>

                    "descriptor": <None|list[float]>

                    "reference_uri": <str>
                }

            :param uri: URI data specification.

            """
            descriptor = None
            try:
                descriptor = self.generate_descriptor_for_uri(uri)
                message = "Descriptor generated"
                descriptor = list(map(float, descriptor.vector()))
            except ValueError as ex:
                message = "Input value issue: %s" % str(ex)
            except RuntimeError as ex:
                message = "Descriptor extraction failure: %s" % str(ex)

            return flask.jsonify(
                success=descriptor is not None,
                message=message,
                descriptor=descriptor,
                reference_uri=uri,
            )

        @self.route("/nn/<path:uri>")
        @self.route("/nn/n=<int:n>/<path:uri>")
        @self.route("/nn/n=<int:n>/<int:start_i>:<int:end_i>/<path:uri>")
        def compute_nearest_neighbors(uri, n=10, start_i=None, end_i=None):
            """
            Data modes for upload/use:

                - local filepath
                - base64
                - http/s URL
                - existing data/descriptor UUID

            The following sub-sections detail how different URI's can be used.

            Local Filepath
            --------------
            The URI string must be prefixed with ``file://``, followed by the
            full path to the data file to describe.

            Base 64 data
            ------------
            The URI string must be prefixed with "base64://", followed by the
            base64 encoded string. This mode also requires an additional
            ``?content_type=`` to provide data content type information. This
            mode saves the encoded data to temporary file for processing.

            HTTP/S address
            --------------
            This is the default mode when the URI prefix is none of the above.
            This uses the requests module to locally download a data file
            for processing.

            Existing Data/Descriptor by UUID
            --------------------------------
            When given a uri prefixed with "uuid://", we interpret the remainder
            of the uri as the UUID of a descriptor already present in the
            configured descriptor index. If the given UUID is not present in the
            index, a KeyError is raised.

            JSON Return format
            ------------------
                {
                    "success": <bool>

                    "message": <str>

                    "neighbors": <None|list[float]>

                    "reference_uri": <str>
                }

            :param n: Number of neighbors to query for
            :param start_i: The starting index of the neighbor vectors to slice
                into for return.
            :param end_i: The ending index of the neighbor vectors to slice
                into for return.
            :type uri: str

            """
            descriptor = None
            try:
                descriptor = self.generate_descriptor_for_uri(uri)
                message = "descriptor computed"
            except ValueError as ex:
                message = "Input data issue: %s" % str(ex)
            except RuntimeError as ex:
                message = "Descriptor generation failure: %s" % str(ex)

            # Base pagination slicing based on provided start and end indices,
            # otherwise clamp to beginning/ending of queried neighbor sequence.
            page_slice = slice(start_i or 0, end_i or n)
            neighbors = []
            dists = []
            if descriptor is not None:
                try:
                    neighbors, dists = \
                        self.nn_index.nn(descriptor, n)
                except ValueError as ex:
                    message = "Descriptor or index related issue: %s" % str(ex)

            # TODO: Return the optional descriptor vectors for the neighbors
            # noinspection PyTypeChecker
            d = {
                "success": bool(descriptor is not None),
                "message": message,
                "neighbors": [n.uuid() for n in neighbors[page_slice]],
                "distances": dists[page_slice],
                "reference_uri": uri
            }
            return flask.jsonify(d)
Exemple #18
0
def classify_files(config, label, file_globs):
    log = logging.getLogger(__name__)

    #: :type: smqtk.algorithms.Classifier
    classifier = \
        plugin.from_plugin_config(config['classifier'],
                                  get_classifier_impls())

    def log_avaialable_labels():
        log.info("Available classifier labels:")
        for l in classifier.get_labels():
            log.info("- %s", l)

    if label is None:
        log_avaialable_labels()
        return
    elif label not in classifier.get_labels():
        log.error("Invalid classification label provided to compute and filter "
                  "on: '%s'", label)
        log_avaialable_labels()
        return

    log.info("Collecting files from globs")
    #: :type: list[DataFileElement]
    data_elements = []
    uuid2filepath = {}
    for g in file_globs:
        if os.path.isfile(g):
            d = DataFileElement(g)
            data_elements.append(d)
            uuid2filepath[d.uuid()] = g
        else:
            log.debug("expanding glob: %s", g)
            for fp in glob.iglob(g):
                d = DataFileElement(fp)
                data_elements.append(d)
                uuid2filepath[d.uuid()] = fp
    if not data_elements:
        raise RuntimeError("No files provided for classification.")

    log.info("Computing descriptors")
    descriptor_factory = \
        DescriptorElementFactory.from_config(config['descriptor_factory'])
    #: :type: smqtk.algorithms.DescriptorGenerator
    descriptor_generator = \
        plugin.from_plugin_config(config['descriptor_generator'],
                                  get_descriptor_generator_impls())
    descr_map = descriptor_generator\
        .compute_descriptor_async(data_elements, descriptor_factory)

    log.info("Classifying descriptors")
    classification_factory = ClassificationElementFactory \
        .from_config(config['classification_factory'])
    classification_map = classifier\
        .classify_async(list(descr_map.values()), classification_factory)

    log.info("Printing input file paths that classified as the given label.")
    # map of UUID to filepath:
    uuid2c = dict((c.uuid, c) for c in six.itervalues(classification_map))
    for data in data_elements:
        d_uuid = data.uuid()
        log.debug("'{}' classification map: {}".format(
            uuid2filepath[d_uuid], uuid2c[d_uuid].get_classification()
        ))
        if uuid2c[d_uuid].max_label() == label:
            print(uuid2filepath[d_uuid])