def __setstate__(self, state):
    """
    Restore this instance's attributes from a pickled ``state`` mapping.

    All entries of ``state`` are first copied into the instance dictionary
    and the model / label-map elements are rebuilt from their URI
    attributes. If ``state`` carries the ``'__LOCAL__'`` marker, the SVM
    model is restored from the bytes stored under ``'__LOCAL_MODEL__'`` (by
    way of a temporary file) and the label map from ``'__LOCAL_LABELS__'``.
    Otherwise the in-memory model is cleared and ``self._reload_model()``
    is invoked.

    :param state: Mapping of attribute names to values to restore.
    """
    self.__dict__.update(state)

    # Rebuild the element wrappers from the restored URIs. A falsy URI is
    # carried through unchanged instead of being resolved.
    model_uri = self.svm_model_uri
    self.svm_model_elem = from_uri(model_uri) if model_uri else model_uri
    label_map_uri = self.svm_label_map_uri
    self.svm_label_map_elem = \
        from_uri(label_map_uri) if label_map_uri else label_map_uri

    # C libraries/pointers don't survive across processes.
    if '__LOCAL__' not in state:
        self.svm_model = None
        self._reload_model()
        return

    # The update above copied the transport-only entries onto the instance;
    # they are not needed as attributes, so drop them again.
    for transient_key in ('__LOCAL__', '__LOCAL_LABELS__',
                          '__LOCAL_MODEL__'):
        del self.__dict__[transient_key]

    handle, tmp_path = tempfile.mkstemp()
    try:
        # Only the path is needed; the file is re-opened by name below.
        os.close(handle)

        self.svm_label_map = state['__LOCAL_LABELS__']

        # Dump the model bytes to disk so libSVM can load them by path.
        with open(tmp_path, 'wb') as out_file:
            out_file.write(state['__LOCAL_MODEL__'])

        self.svm_model = svmutil.svm_load_model(tmp_path.encode('utf8'))
    finally:
        os.remove(tmp_path)
def test_from_uri_plugin_level(self) -> None:
    """
    ``from_uri`` yields a ``DataFileElement`` for both a bare file path and
    the same path given with a ``file://`` prefix.
    """
    # will be absolute path
    test_file_path = os.path.join(TEST_DATA_DIR, "test_file.dat")
    print("Test file path:", test_file_path)

    # Both spellings must resolve to the same file-backed element, whose
    # content is expected to be empty.
    for uri in (test_file_path, 'file://' + test_file_path):
        e = from_uri(uri)
        assert isinstance(e, DataFileElement)
        self.assertEqual(e._filepath, test_file_path)
        self.assertEqual(e.get_bytes(), b"")
def __init__(self, index_to_label_uri: str):
    """
    Construct the instance and load its label vector from a URI.

    :param index_to_label_uri: URI resolved via ``from_uri``; each line read
        from the resulting element becomes one label, with surrounding
        whitespace stripped.
    """
    super().__init__()

    self.index_to_label_uri = index_to_label_uri

    # Read the labels line by line, keeping them in file order.
    labels = []
    for raw_line in from_uri(index_to_label_uri).to_buffered_reader():
        labels.append(raw_line.strip())
    self.label_vector = labels
def __init__(
    self,
    svm_model_uri: Optional[str] = None,
    svm_label_map_uri: Optional[str] = None,
    train_params: SVM_PARAM_MAPPING_T = {
        '-s': 0,  # C-SVC, assumed default if not provided
        '-t': 0,  # linear kernel
        '-b': 1,  # enable probability estimates
        '-c': 2,  # SVM parameter C
        # '-g': 0.0078125,  # initial gamma (1 / 128)
    },
    normalize: Optional[Union[int, float, str]] = None,
    n_jobs: Optional[int] = 4,
):
    """
    Initialize the classifier, optionally backed by model / label-map URIs.

    :param svm_model_uri: Optional URI of the SVM model. When truthy it is
        resolved into an element via ``from_uri``.
    :param svm_label_map_uri: Optional URI of the label map. When truthy it
        is resolved into an element via ``from_uri``.
    :param train_params: Mapping of SVM training parameter flags to values.
        A shallow copy is stored on the instance.
    :param normalize: Optional normalization setting. When not ``None`` it
        is checked immediately by normalizing a random 8-element vector
        with ``self._norm_vector``.
    :param n_jobs: Stored on the instance as ``self.n_jobs``.
    """
    super(LibSvmClassifier, self).__init__()

    self.svm_model_uri = svm_model_uri
    self.svm_label_map_uri = svm_label_map_uri

    # A falsy URI is stored as the element value unchanged (``None`` by
    # default); otherwise the URI is resolved into an element.
    #: :type: None | smqtk.representation.DataElement
    self.svm_model_elem = \
        from_uri(svm_model_uri) if svm_model_uri else svm_model_uri
    #: :type: None | smqtk.representation.DataElement
    self.svm_label_map_elem = \
        from_uri(svm_label_map_uri) if svm_label_map_uri \
        else svm_label_map_uri

    # Copy so that later changes never reach the caller's (or the shared
    # default) mapping.
    self.train_params = dict(train_params)
    self.normalize = normalize
    self.n_jobs = n_jobs

    # Fail fast on an invalid normalization setting by exercising it once.
    if normalize is not None:
        self._norm_vector(numpy.random.rand(8))

    # Model state; starts empty before the reload call below.
    self.svm_model: Optional[svm.svm_model] = None
    # SVM integer label -> semantic label
    self.svm_label_map: Dict[int, Hashable] = {}

    self._reload_model()
def test_one_resolvable_option(self) -> None:
    """
    ``from_uri`` succeeds when at least one candidate implementation can
    resolve the URI, and raises ``InvalidUriError`` for ``'not_resolvable'``.
    """
    def candidate_impls() -> Iterable:
        return {ResolvableElement, UnresolvableElement}

    # URI that can be resolved by ResolvableElement
    resolved = from_uri("resolvable://data", candidate_impls)
    self.assertIsInstance(resolved, ResolvableElement)

    # bad URI even though something can resolve it
    with self.assertRaises(InvalidUriError):
        from_uri('not_resolvable', candidate_impls)
def __init__(self, index_to_label_uri):
    """
    Create a "classifier" that attaches labels to input vector indices.

    The given URI must point at a new-line separated text file holding one
    label per line, in order, with as many lines as an input descriptor has
    dimensions.

    :param index_to_label_uri: URI to new-line separated sequence of
        labels.
    :type index_to_label_uri: str

    """
    super(IndexLabelClassifier, self).__init__()

    self.index_to_label_uri = index_to_label_uri

    # One label per line of the resolved element, whitespace-stripped and
    # kept in file order.
    label_reader = from_uri(index_to_label_uri).to_buffered_reader()
    self.label_vector = [entry.strip() for entry in label_reader]
def __init__(
    self,
    svm_model_uri=None,
    svm_label_map_uri=None,
    train_params={
        '-s': 0,  # C-SVC, assumed default if not provided
        '-t': 0,  # linear kernel
        '-b': 1,  # enable probability estimates
        '-c': 2,  # SVM parameter C
        # '-g': 0.0078125,  # initial gamma (1 / 128)
    },
    normalize=None,
    n_jobs=4,
):
    """
    Initialize the classifier with an empty or existing model.

    Model file paths are optional. If they are given and the file(s) exist,
    we will load them. If they do not, we treat the path(s) as the output
    path(s) for saving a model after calling ``train``. If this is None
    (default), no model is loaded nor output via training, thus any model
    trained will only exist in memory during the lifetime of this instance.

    :param svm_model_uri: Path to the libSVM model file.
    :type svm_model_uri: None | str

    :param svm_label_map_uri: Path to the pickle file containing this
        model's output labels.
    :type svm_label_map_uri: None | str

    :param train_params: SVM parameters used for training. See libSVM
        documentation for parameter flags and values. A shallow copy is
        stored, so neither the caller's mapping nor the shared default is
        aliased by the instance.
    :type train_params: dict[str, int|float]

    :param normalize: Normalize input vectors to training and
        classification methods using ``numpy.linalg.norm``. This may either
        be ``None``, disabling normalization, or any valid value that
        could be passed to the ``ord`` parameter in ``numpy.linalg.norm``
        for 1D arrays. This is ``None`` by default (no normalization).
    :type normalize: None | int | float | str

    :param int|None n_jobs:
        Number of processes to use to parallelize prediction. If None or a
        negative value, all cores are used.

    """
    super(LibSvmClassifier, self).__init__()

    self.svm_model_uri = svm_model_uri
    self.svm_label_map_uri = svm_label_map_uri

    # Elements will be None if input URI is None
    #: :type: None | smqtk.representation.DataElement
    self.svm_model_elem = \
        svm_model_uri and from_uri(svm_model_uri)
    #: :type: None | smqtk.representation.DataElement
    self.svm_label_map_elem = \
        svm_label_map_uri and from_uri(svm_label_map_uri)

    # Shallow copy: the default value is a single dict object created once
    # at function definition time. Storing it directly would let a mutation
    # on one instance leak into every other instance using the default.
    self.train_params = dict(train_params)
    self.normalize = normalize
    self.n_jobs = n_jobs
    # Validate normalization parameter by trying it on a random vector
    if normalize is not None:
        self._norm_vector(numpy.random.rand(8))

    # generated parameters
    #: :type: svm.svm_model
    self.svm_model = None
    # dictionary mapping SVM integer labels to semantic labels
    #: :type: dict[int, collections.abc.Hashable]
    self.svm_label_map = {}

    self._reload_model()
def __init__(
        self,
        index_uri: Optional[str] = None,
        parameters_uri: Optional[str] = None,
        descriptor_cache_uri: Optional[str] = None,
        # Parameters for building an index
        autotune: bool = False,
        target_precision: float = 0.95,
        sample_fraction: float = 0.1,
        distance_method: str = 'hik',
        random_seed: Optional[int] = None):
    """
    Initialize FLANN index properties. Does not contain a query-able index
    until one is built via the ``build_index`` method, or loaded from
    existing model files.

    When using this algorithm in a multiprocessing environment, the model
    file path parameters must be specified due to needing to reload the
    FLANN index on separate processes. This is because FLANN is in C and
    its instances are not copied into processes.

    Documentation on index building parameters and their meaning can be
    found in the FLANN documentation PDF:

        http://www.cs.ubc.ca/research/flann/uploads/FLANN/flann_manual-1.8.4.pdf

    See the MATLAB section for detailed descriptions (python section will
    just point you to the MATLAB section).

    :param index_uri: Optional URI to where to load/store FLANN index
        when initialized and/or built.
        If not configured, no model files are written to or loaded from
        disk.
    :param parameters_uri: Optional file location to load/save FLANN
        index parameters determined at build time.
        If not configured, no model files are written to or loaded from
        disk.
    :param descriptor_cache_uri: Optional file location to load/store
        DescriptorElements in this index.
        If not configured, no model files are written to or loaded from
        disk.
    :param autotune: Whether or not to perform parameter auto-tuning when
        building the index. If this is False, then the `target_precision`
        and `sample_fraction` parameters are not used.
    :param target_precision: Target estimation accuracy when determining
        nearest neighbor when tuning parameters. This should be between
        [0,1] and represents percentage accuracy.
    :param sample_fraction: Sub-sample percentage of the total index to use
        when performing auto-tuning. Value should be in the range of [0,1]
        and represents percentage.
    :param distance_method: Method label of the distance function to use.
        See FLANN documentation manual for available methods. Common
        methods include "hik" (default), "chi_square", and "euclidean".
        When loading an existing index, this value is ignored in
        preference for the distance method used to build the loaded index.
    :param random_seed: Integer to use as the random number generator seed.
        ``None`` (default) leaves the seed unset; any other value,
        including ``0``, is converted with ``int`` and stored.

    """
    warnings.warn(
        "This FLANN implementation is deprecated. Please utilize a more "
        "recent and supported plugin NearestNeighborsIndex, like the "
        "FaissNearestNeighborsIndex plugin.",
        category=DeprecationWarning)

    super(FlannNearestNeighborsIndex, self).__init__()

    self._index_uri = index_uri
    self._index_param_uri = parameters_uri
    self._descr_cache_uri = descriptor_cache_uri

    # Elements will be None if input URI is None
    self._index_elem = cast(Optional[DataElement],
                            self._index_uri and from_uri(self._index_uri))
    self._index_param_elem = cast(
        Optional[DataElement],
        self._index_param_uri and from_uri(self._index_param_uri))
    self._descr_cache_elem = cast(
        Optional[DataElement],
        self._descr_cache_uri and from_uri(self._descr_cache_uri))

    # parameters for building an index
    self._build_autotune = autotune
    self._build_target_precision = float(target_precision)
    self._build_sample_frac = float(sample_fraction)

    self._distance_method = str(distance_method)

    # Lock for model component access. Using a multiprocessing lock due to
    # possible cases where another thread/process attempts to restore a
    # model before it is fully written. A reordering of _build_index could
    # lessen the requirement to a `threading.RLock`.
    self._model_lock = multiprocessing.RLock()

    # In-order cache of descriptors we're indexing over.
    # - flann.nn_index will spit out indices to list
    self._descr_cache: List[DescriptorElement] = []

    # The flann instance with a built index. None before index load/build.
    self._flann: pyflann.index.FLANN = None
    # Flann index parameters determined during building. None before index
    # load/build.
    self._flann_build_params = None

    # Compare against None explicitly: a plain truthiness test would
    # silently discard a seed of 0, which is a legitimate seed value.
    #: :type: None | int
    self._rand_seed = None
    if random_seed is not None:
        self._rand_seed = int(random_seed)

    # The process ID that the currently set FLANN instance was built/loaded
    # on. If this differs from the current process ID, the index should be
    # reloaded from cache.
    self._pid: Optional[int] = None

    # Load the index/parameters if one exists
    if self._has_model_data():
        LOG.info("Found existing model data. Loading.")
        self._load_flann_model()