def check_memory(memory):
    """Check that ``memory`` is joblib.Memory-like.

    joblib.Memory-like means that ``memory`` can be converted into a
    joblib.Memory instance (typically a str denoting the ``location``)
    or has the same interface (has a ``cache`` method).

    Parameters
    ----------
    memory : None, str or object with the joblib.Memory interface

    Returns
    -------
    memory : object with the joblib.Memory interface

    Raises
    ------
    ValueError
        If ``memory`` is not joblib.Memory-like.
    """
    if memory is None or isinstance(memory, six.string_types):
        if LooseVersion(joblib_version) < '0.12':
            memory = Memory(cachedir=memory, verbose=0)
        else:
            memory = Memory(location=memory, verbose=0)
    elif not hasattr(memory, 'cache'):
        raise ValueError("'memory' should be None, a string or have the same"
                         " interface as joblib.Memory."
                         " Got memory='{}' instead.".format(memory))
    return memory
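# A minimal usage sketch of check_memory (illustration only, not part of the
# original module): None, a cache-directory string, and an existing
# joblib.Memory instance are all normalized to an object exposing a ``cache``
# method; anything else raises ValueError.
from tempfile import mkdtemp

_cachedir = mkdtemp()
_mem_none = check_memory(None)       # Memory with caching disabled
_mem_str = check_memory(_cachedir)   # Memory built from a location string
_mem_same = check_memory(_mem_str)   # already Memory-like, returned unchanged
assert _mem_same is _mem_str and hasattr(_mem_none, 'cache')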
def test_make_pipeline_memory():
    cachedir = mkdtemp()
    if LooseVersion(joblib_version) < LooseVersion('0.12'):
        # Deal with change of API in joblib
        memory = Memory(cachedir=cachedir, verbose=10)
    else:
        memory = Memory(location=cachedir, verbose=10)
    pipeline = make_pipeline(DummyTransf(), SVC(), memory=memory)
    assert pipeline.memory is memory
    pipeline = make_pipeline(DummyTransf(), SVC())
    assert pipeline.memory is None

    shutil.rmtree(cachedir)
def test_pipeline_memory():
    iris = load_iris()
    X = iris.data
    y = iris.target
    cachedir = mkdtemp()
    try:
        if LooseVersion(joblib_version) < LooseVersion('0.12'):
            # Deal with change of API in joblib
            memory = Memory(cachedir=cachedir, verbose=10)
        else:
            memory = Memory(location=cachedir, verbose=10)
        # Test with Transformer + SVC
        clf = SVC(probability=True, random_state=0)
        transf = DummyTransf()
        pipe = Pipeline([('transf', clone(transf)), ('svc', clf)])
        cached_pipe = Pipeline([('transf', transf), ('svc', clf)],
                               memory=memory)

        # Memoize the transformer at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the transformer in the cached pipeline
        ts = cached_pipe.named_steps['transf'].timestamp_
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X),
                           cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        assert not hasattr(transf, 'means_')
        # Check that we are reading the cache while fitting
        # a second time
        cached_pipe.fit(X, y)
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X),
                           cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        assert_equal(ts, cached_pipe.named_steps['transf'].timestamp_)
        # Create a new pipeline with cloned estimators
        # Check that even changing the step name does not affect the cache hit
        clf_2 = SVC(probability=True, random_state=0)
        transf_2 = DummyTransf()
        cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)

        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X))
        assert_array_equal(pipe.predict_proba(X),
                           cached_pipe_2.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe_2.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe_2.named_steps['transf_2'].means_)
        assert_equal(ts, cached_pipe_2.named_steps['transf_2'].timestamp_)
    finally:
        shutil.rmtree(cachedir)
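# A condensed sketch of the caching pattern exercised by the test above
# (illustration only, assuming standard scikit-learn estimators): when
# ``memory`` is set on a Pipeline, the fitted transformer is memoized on disk,
# so refitting with identical data and parameters reuses the cached result
# instead of recomputing it.
from shutil import rmtree
from tempfile import mkdtemp

from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

X_demo, y_demo = load_iris(return_X_y=True)
demo_cachedir = mkdtemp()
demo_pipe = Pipeline([('scale', StandardScaler()), ('svc', SVC())],
                     memory=demo_cachedir)
demo_pipe.fit(X_demo, y_demo)   # transformer fit is computed and cached
demo_pipe.fit(X_demo, y_demo)   # transformer fit is read back from the cache
rmtree(demo_cachedir)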
def fetch_pins_people(resize=.5,
                      min_faces_per_person=0,
                      color=False,
                      slice_=(slice(25, 275), slice(25, 275)),
                      download_if_missing=True):
    """Load the PINS dataset.

    Uses the PINS faces dataset provided on Kaggle and leverages the
    scikit-learn memory optimizations.

    Args:
        resize (float, optional): Image resize factor. Defaults to .5.
        min_faces_per_person (int, optional): Minimal number of images per
            person. Defaults to 0.
        color (bool, optional): Whether images should be loaded in RGB or as
            a single channel. Defaults to False.
        slice_ (tuple, optional): A rectangle to which images are sliced.
            Defaults to (slice(25, 275), slice(25, 275)).
        download_if_missing (bool, optional): Whether the dataset should be
            downloaded if it is not present on the machine. Defaults to True.

    Returns:
        sklearn.utils.Bunch: Collection of the data set.
    """
    from kaggle import KaggleApi

    # Resolve the paths of the ZIP archive and of the extracted dataset
    kaggle_api = KaggleApi()
    kaggle_home = kaggle_api.read_config_file()['path']
    path_to_zip = os.path.join(kaggle_home, 'datasets', PINS_DATASET['name'],
                               PINS_DATASET['zip'])
    path_to_files = os.path.join(kaggle_home, 'datasets', PINS_DATASET['name'],
                                 PINS_DATASET['folder'])

    # Download if missing
    if download_if_missing and not os.path.exists(path_to_zip):
        kaggle_api.authenticate()
        kaggle_api.dataset_download_files(PINS_DATASET['name'], quiet=False)

    # Extract the ZIP archive if it has not been extracted yet
    if not os.path.exists(path_to_files):
        with ZipFile(path_to_zip, 'r') as zipObj:
            extraction_path = os.path.join(kaggle_home, 'datasets',
                                           PINS_DATASET['name'])
            zipObj.extractall(extraction_path)

    # Load data in memory, caching the expensive image-loading step
    m = Memory(location=kaggle_home, compress=6, verbose=0)
    load_func = m.cache(_fetch_lfw_people)
    faces, target, target_names = load_func(
        path_to_files, resize=resize,
        min_faces_per_person=min_faces_per_person, color=color, slice_=slice_)

    X = faces.reshape(len(faces), -1)

    # Fix names: strip the 'pins '/' face' markers and title-case the labels
    with np.nditer(target_names, op_flags=['readwrite']) as it:
        for x in it:
            x[...] = np.core.defchararray.replace(x, 'pins ', '')
            x[...] = np.core.defchararray.replace(x, ' face', '')
            x[...] = np.core.defchararray.title(x)

    # Pack the results as a Bunch instance
    return Bunch(data=X, images=faces, target=target,
                 target_names=target_names)
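# A hypothetical usage sketch of fetch_pins_people (illustration only; assumes
# Kaggle credentials are configured and the PINS_DATASET constant used above
# is defined): load faces of people with at least 30 images and inspect the
# returned Bunch.
if __name__ == '__main__':
    pins = fetch_pins_people(resize=0.5, min_faces_per_person=30)
    print(pins.data.shape)         # (n_samples, n_features) flattened images
    print(pins.images.shape)       # (n_samples, height, width[, 3])
    print(pins.target_names[:5])   # cleaned-up person names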