Beispiel #1
0
def check_memory(memory):
    """Check that ``memory`` is joblib.Memory-like.

    joblib.Memory-like means that ``memory`` can be converted into a
    joblib.Memory instance (typically a str denoting the ``location``)
    or has the same interface (has a ``cache`` method).

    Parameters
    ----------
    memory : None, str or object with the joblib.Memory interface

    Returns
    -------
    memory : object with the joblib.Memory interface

    Raises
    ------
    ValueError
        If ``memory`` is not joblib.Memory-like.
    """

    if memory is None or isinstance(memory, six.string_types):
        if LooseVersion(joblib_version) < '0.12':
            memory = Memory(cachedir=memory, verbose=0)
        else:
            memory = Memory(location=memory, verbose=0)
    elif not hasattr(memory, 'cache'):
        raise ValueError("'memory' should be None, a string or have the same"
                         " interface as joblib.Memory."
                         " Got memory='{}' instead.".format(memory))
    return memory
def test_make_pipeline_memory():
    cachedir = mkdtemp()
    if LooseVersion(joblib_version) < LooseVersion('0.12'):
        # Deal with change of API in joblib
        memory = Memory(cachedir=cachedir, verbose=10)
    else:
        memory = Memory(location=cachedir, verbose=10)
    pipeline = make_pipeline(DummyTransf(), SVC(), memory=memory)
    assert pipeline.memory is memory
    pipeline = make_pipeline(DummyTransf(), SVC())
    assert pipeline.memory is None

    shutil.rmtree(cachedir)
Beispiel #3
0
def test_pipeline_memory():
    iris = load_iris()
    X = iris.data
    y = iris.target
    cachedir = mkdtemp()
    try:
        if LooseVersion(joblib_version) < LooseVersion('0.12'):
            # Deal with change of API in joblib
            memory = Memory(cachedir=cachedir, verbose=10)
        else:
            memory = Memory(location=cachedir, verbose=10)
        # Test with Transformer + SVC
        clf = SVC(probability=True, random_state=0)
        transf = DummyTransf()
        pipe = Pipeline([('transf', clone(transf)), ('svc', clf)])
        cached_pipe = Pipeline([('transf', transf), ('svc', clf)],
                               memory=memory)

        # Memoize the transformer at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the transformer in the cached pipeline
        ts = cached_pipe.named_steps['transf'].timestamp_
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        assert not hasattr(transf, 'means_')
        # Check that we are reading the cache while fitting
        # a second time
        cached_pipe.fit(X, y)
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        assert_equal(ts, cached_pipe.named_steps['transf'].timestamp_)
        # Create a new pipeline with cloned estimators
        # Check that even changing the name step does not affect the cache hit
        clf_2 = SVC(probability=True, random_state=0)
        transf_2 = DummyTransf()
        cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)

        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X))
        assert_array_equal(pipe.predict_proba(X),
                           cached_pipe_2.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe_2.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe_2.named_steps['transf_2'].means_)
        assert_equal(ts, cached_pipe_2.named_steps['transf_2'].timestamp_)
    finally:
        shutil.rmtree(cachedir)
Beispiel #4
0
def fetch_pins_people(resize=.5,
                      min_faces_per_person=0,
                      color=False,
                      slice_=(slice(25, 275), slice(25, 275)),
                      download_if_missing=True):
    """Load PINS dataset.

    Use a PINS dataset provided by Kaggle, everage the scikit-learn memory
    optimizations.

    Args:
        resize (float, optional): Image resize factor. Defaults to .5.
        min_faces_per_person (int, optional): Minimal number of images per
            person. Defaults to 0.
        color (bool): Toggle is images should be in RGB or 1 channel.
            Defaults to False.
        slice_ (tuple, optional): A rectangle to which images are sliced.
            Defaults to (slice(70, 195), slice(78, 172)).
        download_if_missing (bool, optional): Set if the dataset should be
            downloaded if not present on the machine. Defaults to True.

    Returns:
        sklearn.utils.Bunch: Collection of data set
    """
    from kaggle import KaggleApi

    # Extract ZIP dataset
    kaggle_api = KaggleApi()
    kaggle_home = kaggle_api.read_config_file()['path']
    path_to_zip = os.path.join(kaggle_home, 'datasets', PINS_DATASET['name'],
                               PINS_DATASET['zip'])
    path_to_files = os.path.join(kaggle_home, 'datasets', PINS_DATASET['name'],
                                 PINS_DATASET['folder'])

    # Download if missing
    if download_if_missing and not os.path.exists(path_to_zip):
        kaggle_api.authenticate()
        kaggle_api.dataset_download_files(PINS_DATASET['name'], quiet=False)

    if not os.path.exists(path_to_files):
        with ZipFile(path_to_zip, 'r') as zipObj:
            extraction_path = os.path.join(kaggle_home, 'datasets',
                                           PINS_DATASET['name'])
            zipObj.extractall(extraction_path)

    # Load data in memory
    m = Memory(location=kaggle_home, compress=6, verbose=0)
    load_func = m.cache(_fetch_lfw_people)

    faces, target, target_names = load_func(
        path_to_files,
        resize=resize,
        min_faces_per_person=min_faces_per_person,
        color=color,
        slice_=slice_)

    X = faces.reshape(len(faces), -1)

    # Fix names
    with np.nditer(target_names, op_flags=['readwrite']) as it:
        for x in it:
            x[...] = np.core.defchararray.replace(x, 'pins ', '')
            x[...] = np.core.defchararray.replace(x, ' face', '')
            x[...] = np.core.defchararray.title(x)

    # pack the results as a Bunch instance
    return Bunch(data=X,
                 images=faces,
                 target=target,
                 target_names=target_names)