Example #1
0
    def __setstate__(self, state):
        """
        Restore instance attributes from a pickled ``state`` dictionary.

        The data elements are re-created from their URIs.  When the state
        carries an in-memory model (marked by the ``'__LOCAL__'`` key), the
        serialized model bytes are written to a temporary file and loaded
        back through libSVM; otherwise the model is reloaded via
        ``_reload_model``.

        :param state: Dictionary of attributes to restore on this instance.
        """
        self.__dict__.update(state)

        model_uri = self.svm_model_uri
        self.svm_model_elem = from_uri(model_uri) if model_uri else model_uri
        label_map_uri = self.svm_label_map_uri
        self.svm_label_map_elem = \
            from_uri(label_map_uri) if label_map_uri else label_map_uri

        # C libraries/pointers don't survive across processes.
        if '__LOCAL__' not in state:
            self.svm_model = None
            self._reload_model()
            return

        # These keys were copied into the instance dict by the update above.
        # The instance doesn't need to keep them around after this.
        for transient_key in ('__LOCAL__',
                              '__LOCAL_LABELS__',
                              '__LOCAL_MODEL__'):
            del self.__dict__[transient_key]

        handle, model_path = tempfile.mkstemp()
        try:
            # Only the path is needed; the file is re-opened for writing.
            os.close(handle)

            self.svm_label_map = state['__LOCAL_LABELS__']

            # Write the model to file, then load it via libSVM.
            with open(model_path, 'wb') as out_file:
                out_file.write(state['__LOCAL_MODEL__'])

            self.svm_model = \
                svmutil.svm_load_model(model_path.encode('utf8'))
        finally:
            os.remove(model_path)
    def test_from_uri_plugin_level(self) -> None:
        """
        ``from_uri`` yields a ``DataFileElement`` for both a plain file path
        and the same path given with a ``file://`` prefix.
        """
        # will be absolute path
        test_file_path = os.path.join(TEST_DATA_DIR, "test_file.dat")
        print("Test file path:", test_file_path)

        # Bare path first, then the explicit file-scheme form.
        for uri in (test_file_path, 'file://' + test_file_path):
            e = from_uri(uri)
            assert isinstance(e, DataFileElement)
            self.assertEqual(e._filepath, test_file_path)
            self.assertEqual(e.get_bytes(), b"")
    def __init__(self, index_to_label_uri: str):
        """
        Build the classifier from a URI naming the label listing.

        Each line yielded by the resolved element's buffered reader is
        stripped and stored, in order, as one entry of ``label_vector``.

        :param index_to_label_uri: URI of the line-separated label sequence.
        """
        super().__init__()

        self.index_to_label_uri = index_to_label_uri

        # Resolve the URI to a data element and read its labels line by line.
        label_reader = from_uri(index_to_label_uri).to_buffered_reader()
        self.label_vector = [raw_label.strip() for raw_label in label_reader]
Example #4
0
    def __init__(
        self,
        svm_model_uri: Optional[str] = None,
        svm_label_map_uri: Optional[str] = None,
        train_params: SVM_PARAM_MAPPING_T = {
            '-s': 0,  # C-SVC, assumed default if not provided
            '-t': 0,  # linear kernel
            '-b': 1,  # enable probability estimates
            '-c': 2,  # SVM parameter C
            # '-g': 0.0078125,  # initial gamma (1 / 128)
        },
        normalize: Optional[Union[int, float, str]] = None,
        n_jobs: Optional[int] = 4,
    ):
        """
        Set up the classifier, optionally backed by model/label-map URIs.

        :param svm_model_uri: Optional URI of the libSVM model.
        :param svm_label_map_uri: Optional URI of the model's label map.
        :param train_params: libSVM parameter flags and values.  A shallow
            copy is stored, so the caller's mapping is not modified.
        :param normalize: Normalization setting; when not ``None`` it is
            validated immediately by normalizing a random vector.
        :param n_jobs: Stored on the instance as ``n_jobs``.
        """
        super(LibSvmClassifier, self).__init__()

        self.svm_model_uri = svm_model_uri
        self.svm_label_map_uri = svm_label_map_uri

        # A falsy URI (``None`` by default) is carried through unchanged
        # instead of being resolved into a data element.
        #: :type: None | smqtk.representation.DataElement
        self.svm_model_elem = (
            from_uri(svm_model_uri) if svm_model_uri else svm_model_uri
        )
        #: :type: None | smqtk.representation.DataElement
        self.svm_label_map_elem = (
            from_uri(svm_label_map_uri)
            if svm_label_map_uri else svm_label_map_uri
        )

        # Shallow copy so later changes never touch the input mapping.
        self.train_params = dict(train_params)
        self.normalize = normalize
        self.n_jobs = n_jobs
        if normalize is not None:
            # Fail fast on an invalid normalization parameter by applying
            # it to a throw-away random vector.
            self._norm_vector(numpy.random.rand(8))

        # Generated state: the loaded/trained model and the mapping from
        # SVM integer labels to semantic labels.
        self.svm_model: Optional[svm.svm_model] = None
        self.svm_label_map: Dict[int, Hashable] = {}

        self._reload_model()
    def test_one_resolvable_option(self) -> None:
        """
        A URI is resolved when at least one of the offered plugins can
        handle it, while an unsupported URI still raises an error.
        """
        def candidate_impls() -> Iterable:
            return {UnresolvableElement, ResolvableElement}

        # URI that can be resolved by ResolvableElement
        resolved = from_uri("resolvable://data", candidate_impls)
        self.assertIsInstance(resolved, ResolvableElement)

        # bad URI even though something can resolve it
        with self.assertRaises(InvalidUriError):
            from_uri('not_resolvable', candidate_impls)
Example #6
0
    def __init__(self, index_to_label_uri):
        """
        Create a "classifier" that assigns labels to the indices of an input
        vector.

        The URI is expected to reference a new-line separated text file in
        which every line is one label, listed in order and matching the
        dimensionality of an input descriptor.

        :param index_to_label_uri: URI to new-line separated sequence of
            labels.
        :type index_to_label_uri: str

        """
        super(IndexLabelClassifier, self).__init__()

        self.index_to_label_uri = index_to_label_uri

        # Collect the labels locally so the attribute is only assigned once
        # the whole listing has been read.
        labels = []
        for raw_line in from_uri(index_to_label_uri).to_buffered_reader():
            labels.append(raw_line.strip())
        self.label_vector = labels
Example #7
0
    def __init__(
        self,
        svm_model_uri=None,
        svm_label_map_uri=None,
        train_params={
            '-s': 0,  # C-SVC, assumed default if not provided
            '-t': 0,  # linear kernel
            '-b': 1,  # enable probability estimates
            '-c': 2,  # SVM parameter C
            # '-g': 0.0078125,  # initial gamma (1 / 128)
        },
        normalize=None,
        n_jobs=4,
    ):
        """
        Initialize the classifier with an empty or existing model.

        Model file paths are optional. If they are given and the file(s) exist,
        we will load them. If they do not, we treat the path(s) as the output
        path(s) for saving a model after calling ``train``. If this is None
        (default), no model is loaded nor output via training, thus any model
        trained will only exist in memory during the lifetime of this instance.

        :param svm_model_uri: Path to the libSVM model file.
        :type svm_model_uri: None | str

        :param svm_label_map_uri: Path to the pickle file containing this
            model's output labels.
        :type svm_label_map_uri: None | str

        :param train_params: SVM parameters used for training. See libSVM
            documentation for parameter flags and values. A shallow copy of
            this mapping is stored, so neither the caller's mapping nor the
            shared default is modified through this instance.
        :type train_params: dict[str, int|float]

        :param normalize: Normalize input vectors to training and
            classification methods using ``numpy.linalg.norm``. This may either
            be  ``None``, disabling normalization, or any valid value that
            could be passed to the ``ord`` parameter in ``numpy.linalg.norm``
            for 1D arrays. This is ``None`` by default (no normalization).
        :type normalize: None | int | float | str

        :param int|None n_jobs:
            Number of processes to use to parallelize prediction. If None or a
            negative value, all cores are used.

        """
        super(LibSvmClassifier, self).__init__()

        self.svm_model_uri = svm_model_uri
        self.svm_label_map_uri = svm_label_map_uri

        # Elements will be None if input URI is None
        #: :type: None | smqtk.representation.DataElement
        self.svm_model_elem = \
            svm_model_uri and from_uri(svm_model_uri)
        #: :type: None | smqtk.representation.DataElement
        self.svm_label_map_elem = \
            svm_label_map_uri and from_uri(svm_label_map_uri)

        # Shallow copy to shield from modifying input.  Without the copy,
        # every instance built with the default would share (and could
        # mutate) the single default dictionary object.
        self.train_params = dict(train_params)
        self.normalize = normalize
        self.n_jobs = n_jobs
        # Validate normalization parameter by trying it on a random vector
        if normalize is not None:
            self._norm_vector(numpy.random.rand(8))

        # generated parameters
        #: :type: svm.svm_model
        self.svm_model = None
        # dictionary mapping SVM integer labels to semantic labels
        #: :type: dict[int, collections.abc.Hashable]
        self.svm_label_map = {}

        self._reload_model()
Example #8
0
    def __init__(
            self,
            index_uri: Optional[str] = None,
            parameters_uri: Optional[str] = None,
            descriptor_cache_uri: Optional[str] = None,
            # Parameters for building an index
            autotune: bool = False,
            target_precision: float = 0.95,
            sample_fraction: float = 0.1,
            distance_method: str = 'hik',
            random_seed: Optional[int] = None):
        """
        Initialize FLANN index properties. Does not contain a query-able index
        until one is built via the ``build_index`` method, or loaded from
        existing model files.

        When using this algorithm in a multiprocessing environment, the model
        file path parameters must be specified due to needing to reload the
        FLANN index on separate processes. This is because FLANN is in C and
        its instances are not copied into processes.

        Documentation on index building parameters and their meaning can be
        found in the FLANN documentation PDF:

            http://www.cs.ubc.ca/research/flann/uploads/FLANN/flann_manual-1.8.4.pdf

        See the MATLAB section for detailed descriptions (python section will
        just point you to the MATLAB section).

        :param index_uri: Optional URI to where to load/store FLANN index
            when initialized and/or built. If not configured, no model files
            are written to or loaded from disk.
        :param parameters_uri: Optional file location to load/save FLANN
            index parameters determined at build time. If not configured, no
            model files are written to or loaded from disk.
        :param descriptor_cache_uri: Optional file location to load/store
            DescriptorElements in this index. If not configured, no model files
            are written to or loaded from disk.
        :param autotune: Whether or not to perform parameter auto-tuning when
            building the index. If this is False, then the `target_precision`
            and `sample_fraction` parameters are not used.
        :param target_precision: Target estimation accuracy when determining
            nearest neighbor when tuning parameters. This should be between
            [0,1] and represents percentage accuracy.
        :param sample_fraction: Sub-sample percentage of the total index to use
            when performing auto-tuning. Value should be in the range of [0,1]
            and represents percentage.
        :param distance_method: Method label of the distance function to use.
            See FLANN documentation manual for available methods. Common
            methods include "hik" (default), "chi_square", and "euclidean".
            When loading an existing index, this value is ignored in
            preference for the distance method used to build the loaded index.
        :param random_seed: Integer to use as the random number generator seed.
            Any integer, including ``0``, is honored; ``None`` (default)
            leaves the seed unset.

        """
        warnings.warn(
            "This FLANN implementation is deprecated. Please utilize a more "
            "recent and supported plugin NearestNeighborsIndex, like the "
            "FaissNearestNeighborsIndex plugin.",
            category=DeprecationWarning)

        super(FlannNearestNeighborsIndex, self).__init__()

        self._index_uri = index_uri
        self._index_param_uri = parameters_uri
        self._descr_cache_uri = descriptor_cache_uri

        # Elements will be None if input URI is None
        self._index_elem = cast(Optional[DataElement], self._index_uri
                                and from_uri(self._index_uri))
        self._index_param_elem = cast(
            Optional[DataElement], self._index_param_uri
            and from_uri(self._index_param_uri))
        self._descr_cache_elem = cast(
            Optional[DataElement], self._descr_cache_uri
            and from_uri(self._descr_cache_uri))

        # parameters for building an index
        self._build_autotune = autotune
        self._build_target_precision = float(target_precision)
        self._build_sample_frac = float(sample_fraction)
        self._distance_method = str(distance_method)

        # Lock for model component access.  Using a multiprocessing lock due
        # to possible cases where another thread/process attempts to restore
        # a model before it's fully written.  A reordering of _build_index
        # could lessen the requirement to a `threading.RLock`.
        self._model_lock = multiprocessing.RLock()

        # In-order cache of descriptors we're indexing over.
        # - flann.nn_index will spit out indices to list
        self._descr_cache: List[DescriptorElement] = []

        # The flann instance with a built index. None before index load/build.
        self._flann: Optional[pyflann.index.FLANN] = None
        # Flann index parameters determined during building. None before index
        # load/build.
        self._flann_build_params = None

        # Compare against None explicitly: a plain truthiness test would
        # silently discard a seed of 0.
        #: :type: None | int
        self._rand_seed = None
        if random_seed is not None:
            self._rand_seed = int(random_seed)

        # The process ID that the currently set FLANN instance was built/loaded
        # on. If this differs from the current process ID, the index should be
        # reloaded from cache.
        self._pid: Optional[int] = None

        # Load the index/parameters if one exists
        if self._has_model_data():
            LOG.info("Found existing model data. Loading.")
            self._load_flann_model()