Example #1
    def fit(self, X, y):
        """Fit the model using X, y as training data.
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training data.
        y : array-like, shape = [n_samples]
            Target values. Will be cast to X's dtype if necessary
        Returns
        -------
        self : object
               Returns an instance of self.
        """
        X, y = check_X_y(X,
                         y, ['csr', 'csc'],
                         y_numeric=True,
                         ensure_min_samples=2,
                         estimator=self)
        X = as_float_array(X, copy=False)
        n_samples, n_features = X.shape

        X, y, X_offset, y_offset, X_scale = \
            self._preprocess_data(X, y, self.fit_intercept, self.normalize)

        estimator_func, params = self._make_estimator_and_params(X, y)
        memory = self.memory
        if memory is None:
            memory = Memory(cachedir=None, verbose=0)
        elif isinstance(memory, six.string_types):
            memory = Memory(cachedir=memory, verbose=0)
        elif not isinstance(memory, Memory):
            raise ValueError("'memory' should either be a string or"
                             " a sklearn.utils.Memory"
                             " instance, got 'memory={!r}' instead.".format(
                                 type(memory)))

        scores_ = memory.cache(_resample_model,
                               ignore=['verbose', 'n_jobs', 'pre_dispatch'])(
            estimator_func, X, y,
            scaling=self.scaling,
            n_resampling=self.n_resampling,
            n_jobs=self.n_jobs,
            verbose=self.verbose,
            pre_dispatch=self.pre_dispatch,
            random_state=self.random_state,
            sample_fraction=self.sample_fraction,
            **params)

        if scores_.ndim == 1:
            scores_ = scores_[:, np.newaxis]
        self.all_scores_ = scores_
        self.scores_ = np.max(self.all_scores_, axis=1)
        return self
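The heart of this fit method is the `memory.cache(func, ignore=[...])(...)` pattern: joblib hashes the arguments, stores the result on disk, and leaves the arguments listed in `ignore` out of the cache key. A minimal sketch of that pattern with a hypothetical `expensive_resampling` stand-in (not part of the code above), assuming a recent joblib where the constructor takes `location`:

import numpy as np
from joblib import Memory

memory = Memory(location='/tmp/joblib_cache_demo', verbose=0)

def expensive_resampling(X, y, n_resampling=200, verbose=0):
    # stand-in for a costly computation whose result is worth caching
    rng = np.random.RandomState(0)
    return rng.rand(X.shape[1], n_resampling)

X = np.random.rand(20, 5)
y = np.random.rand(20)
# 'verbose' is excluded from the cache key, so changing it alone does not
# trigger a recomputation
scores = memory.cache(expensive_resampling, ignore=['verbose'])(X, y, verbose=1)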
Example #2
def test_make_pipeline_memory():
    cachedir = mkdtemp()
    if LooseVersion(joblib_version) < LooseVersion('0.12'):
        # Deal with change of API in joblib
        memory = Memory(cachedir=cachedir, verbose=10)
    else:
        memory = Memory(location=cachedir, verbose=10)
    pipeline = make_pipeline(DummyTransf(), SVC(), memory=memory)
    assert_true(pipeline.memory is memory)
    pipeline = make_pipeline(DummyTransf(), SVC())
    assert_true(pipeline.memory is None)

    shutil.rmtree(cachedir)
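The version check above exists because joblib renamed the `cachedir` constructor argument to `location` in 0.12. A small compatibility helper in the same spirit (a sketch; `make_memory` is an illustrative name):

from distutils.version import LooseVersion

import joblib

def make_memory(cachedir, verbose=0):
    # joblib < 0.12 only understands `cachedir`; newer releases use `location`
    if LooseVersion(joblib.__version__) < LooseVersion('0.12'):
        return joblib.Memory(cachedir=cachedir, verbose=verbose)
    return joblib.Memory(location=cachedir, verbose=verbose)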
Example #3
def test_pipeline_memory():
    iris = load_iris()
    X = iris.data
    y = iris.target
    cachedir = mkdtemp()
    try:
        memory = Memory(cachedir=cachedir, verbose=10)
        # Test with Transformer + SVC
        clf = SVC(gamma='scale', probability=True, random_state=0)
        transf = DummyTransf()
        pipe = Pipeline([('transf', clone(transf)), ('svc', clf)])
        cached_pipe = Pipeline([('transf', transf), ('svc', clf)],
                               memory=memory)

        # Memoize the transformer at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the transformer in the cached pipeline
        ts = cached_pipe.named_steps['transf'].timestamp_
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        assert_false(hasattr(transf, 'means_'))
        # Check that we are reading the cache while fitting
        # a second time
        cached_pipe.fit(X, y)
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        assert_equal(ts, cached_pipe.named_steps['transf'].timestamp_)
        # Create a new pipeline with cloned estimators
        # Check that even changing the name step does not affect the cache hit
        clf_2 = SVC(gamma='scale', probability=True, random_state=0)
        transf_2 = DummyTransf()
        cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)

        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X))
        assert_array_equal(pipe.predict_proba(X),
                           cached_pipe_2.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe_2.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe_2.named_steps['transf_2'].means_)
        assert_equal(ts, cached_pipe_2.named_steps['transf_2'].timestamp_)
    finally:
        shutil.rmtree(cachedir)
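`DummyTransf` is not defined in this excerpt; a plausible sketch consistent with the attributes the test inspects (`means_` and `timestamp_`) would look roughly like this:

import time

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class DummyTransf(BaseEstimator, TransformerMixin):
    """Store the column means and a fit timestamp; pass X through unchanged."""

    def fit(self, X, y=None):
        self.means_ = np.mean(X, axis=0)
        # the timestamp lets the test tell a real re-fit from a cache hit
        self.timestamp_ = time.time()
        return self

    def transform(self, X):
        return X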
Example #4
def test_make_pipeline_memory():
    cachedir = mkdtemp()
    memory = Memory(cachedir=cachedir)
    pipeline = make_pipeline(DummyTransf(), SVC(), memory=memory)
    assert_true(pipeline.memory is memory)
    pipeline = make_pipeline(DummyTransf(), SVC())
    assert_true(pipeline.memory is None)

    shutil.rmtree(cachedir)
Example #5
for x in X:  # smooth data
    x[:] = ndimage.gaussian_filter(x.reshape(size, size), sigma=1.0).ravel()
X -= X.mean(axis=0)
X /= X.std(axis=0)

y = np.dot(X, coef.ravel())
noise = np.random.randn(y.shape[0])
noise_coef = (linalg.norm(y, 2) / np.exp(snr / 20.)) / linalg.norm(noise, 2)
y += noise_coef * noise  # add noise

# #############################################################################
# Compute the coefs of a Bayesian Ridge with GridSearch
cv = KFold(2)  # cross-validation generator for model selection
ridge = BayesianRidge()
cachedir = tempfile.mkdtemp()
mem = Memory(cachedir=cachedir, verbose=1)

# Ward agglomeration followed by BayesianRidge
connectivity = grid_to_graph(n_x=size, n_y=size)
ward = FeatureAgglomeration(n_clusters=10,
                            connectivity=connectivity,
                            memory=mem)
clf = Pipeline([('ward', ward), ('ridge', ridge)])
# Select the optimal number of parcels with grid search
clf = GridSearchCV(clf, {'ward__n_clusters': [10, 20, 30]}, n_jobs=1, cv=cv)
clf.fit(X, y)  # set the best parameters
coef_ = clf.best_estimator_.steps[-1][1].coef_
coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_)
coef_agglomeration_ = coef_.reshape(size, size)

# Anova univariate feature selection followed by BayesianRidge
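The snippet stops at the comment above. In the corresponding scikit-learn example the ANOVA branch reuses the same `mem` object to cache the scoring function, roughly as follows (a sketch, assuming `feature_selection` is imported from sklearn):

from sklearn import feature_selection

f_regression = mem.cache(feature_selection.f_regression)  # cache the scoring function
anova = feature_selection.SelectPercentile(f_regression)
clf = Pipeline([('anova', anova), ('ridge', ridge)])
# Select the optimal percentage of features with grid search
clf = GridSearchCV(clf, {'anova__percentile': [5, 10, 20]}, n_jobs=1, cv=cv)
clf.fit(X, y)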
Example #6
import matplotlib.pyplot as plt
import pandas

from sklearn.utils.testing import ignore_warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition.nmf import NMF
from sklearn.decomposition.nmf import _initialize_nmf
from sklearn.decomposition.nmf import _beta_divergence
from sklearn.decomposition.nmf import INTEGER_TYPES, _check_init
from sklearn.utils import Memory
from sklearn.exceptions import ConvergenceWarning
from sklearn.utils.extmath import safe_sparse_dot, squared_norm
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted, check_non_negative

mem = Memory(cachedir='.', verbose=0)

###################
# Start of _PGNMF #
###################
# This class implements a projected gradient solver for the NMF.
# The projected gradient solver was removed from scikit-learn in version 0.19,
# and a simplified copy is used here for comparison purpose only.
# It is not tested, and it may change or disappear without notice.


def _norm(x):
    """Dot product-based Euclidean norm implementation
    See: http://fseoane.net/blog/2011/computing-the-vector-norm/
    """
    return np.sqrt(squared_norm(x))
Example #7
import os

import numpy as np

from sklearn.datasets import fetch_covtype, get_data_home
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import zero_one_loss
from sklearn.utils import Memory
from sklearn.utils import check_array

# Memoize the data extraction and memory map the resulting
# train / test splits in readonly mode
memory = Memory(os.path.join(get_data_home(), 'covertype_benchmark_data'),
                mmap_mode='r')


@memory.cache
def load_data(dtype=np.float32, order='C', random_state=13):
    """Load the data, then cache and memmap the train/test split"""
    ######################################################################
    # Load dataset
    print("Loading dataset...")
    data = fetch_covtype(download_if_missing=True,
                         shuffle=True,
                         random_state=random_state)
    X = check_array(data['data'], dtype=dtype, order=order)
    y = (data['target'] != 1).astype(np.int)

    # Create train-test split (as [Joachims, 2006])
Example #8
import numpy as np
from sklearn.utils import Memory
from sklearn.datasets import load_svmlight_file

mem = Memory("./mycache")

training_dataset = "./a9a/a9a"
testing_dataset  = "./a9a/a9a.t"

eps = 10e-3
lamb = 0.03

@mem.cache
def get_data(path):
    data = load_svmlight_file(path)
    X = data[0].todense()
    y = np.reshape(data[1], (-1, 1)) # make y a proper vector
    y[y == -1] = 0 # make sure labels are {0, 1}
    return X, y
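A brief usage note (illustrative variable names): the cached loader is then called once per split, and later runs are served from `./mycache`:

X_train, y_train = get_data(training_dataset)
X_test, y_test = get_data(testing_dataset)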

def w_init(d):
    return np.zeros((d, 1))

def p(y, X, w):
    N, d = X.shape
    assert w.shape == (d, 1)
    assert y == 0 or y == 1
    return (np.exp(y * np.dot(X, w))) / (1 + np.exp((np.dot(X, w))))

def pred(X, w):
    N, d = X.shape
        embeddings_k2[:, 0],
        'y_k2':
        embeddings_k2[:, 1],
        'z_k2':
        embeddings_k2[:, 2]
    })

print('clusters')
import hdbscan
std_args = {
    'algorithm': 'best',
    'alpha': 1.0,
    'approx_min_span_tree': True,
    'gen_min_span_tree': False,
    'leaf_size': 40,
    'memory': Memory(cachedir=None),
    'metric': 'euclidean',
    'min_cluster_size': 15,
    'min_samples': None,
    'p': None
}


def cluster(embeddings_array, **kwargs):
    print('dimensionality', embeddings_array.shape)
    clusterer = hdbscan.HDBSCAN(**kwargs)
    clusterer.fit(np.array(embeddings_array))
    print('number of clusters: ', max(clusterer.labels_))
    return clusterer.labels_
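With the defaults above, the clustering step would typically be invoked on the embedding matrix, for example (an illustrative call reusing the names from the truncated snippet above):

labels_k2 = cluster(embeddings_k2, **std_args)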

resources = [
    (redirects_url, redirects_filename),
    (page_links_url, page_links_filename),
]

for url, filename in resources:
    if not os.path.exists(filename):
        print("Downloading data from '%s', please wait..." % url)
        opener = urlopen(url)
        open(filename, 'wb').write(opener.read())
        print()

# #############################################################################
# Loading the redirect files

memory = Memory(cachedir=".")


def index(redirects, index_map, k):
    """Find the index of an article name after redirect resolution"""
    k = redirects.get(k, k)
    return index_map.setdefault(k, len(index_map))


DBPEDIA_RESOURCE_PREFIX_LEN = len("http://dbpedia.org/resource/")
SHORTNAME_SLICE = slice(DBPEDIA_RESOURCE_PREFIX_LEN + 1, -1)


def short_name(nt_uri):
    """Remove the < and > URI markers and the common URI prefix"""
    return nt_uri[SHORTNAME_SLICE]
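For reference, `short_name` drops the surrounding angle brackets and the common DBpedia prefix; for example (illustrative URI):

short_name("<http://dbpedia.org/resource/Scikit-learn>")  # -> 'Scikit-learn'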
Example #11
import os
import json
import argparse

import numpy as np

from sklearn.utils import Memory
from sklearn.datasets import fetch_mldata
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
from sklearn.utils import check_array
from sklearn.utils import shuffle as _shuffle

LOG_DIR = "mnist_tsne_output"
if not os.path.exists(LOG_DIR):
    os.mkdir(LOG_DIR)

memory = Memory(os.path.join(LOG_DIR, 'mnist_tsne_benchmark_data'),
                mmap_mode='r')


@memory.cache
def load_data(dtype=np.float32, order='C', shuffle=True, seed=0):
    """Load the data, then cache and memmap the train/test split"""
    print("Loading dataset...")
    data = fetch_mldata('MNIST original')

    X = check_array(data['data'], dtype=dtype, order=order)
    y = data["target"]

    if shuffle:
        X, y = _shuffle(X, y, random_state=seed)

    # Normalize features
for x in X:  # smooth data
    x[:] = ndimage.gaussian_filter(x.reshape(size, size), sigma=1.0).ravel()
X -= X.mean(axis=0)
X /= X.std(axis=0)

y = np.dot(X, coef.ravel())
noise = np.random.randn(y.shape[0])
noise_coef = (linalg.norm(y, 2) / np.exp(snr / 20.)) / linalg.norm(noise, 2)
y += noise_coef * noise  # add noise

# #############################################################################
# Compute the coefs of a Bayesian Ridge with GridSearch
cv = KFold(2)  # cross-validation generator for model selection
ridge = BayesianRidge()
cachedir = tempfile.mkdtemp()
mem = Memory(cachedir=cachedir, verbose=1)

# Ward agglomeration followed by BayesianRidge
connectivity = grid_to_graph(n_x=size, n_y=size)
ward = FeatureAgglomeration(n_clusters=10, connectivity=connectivity,
                            memory=mem)
clf = Pipeline([('ward', ward), ('ridge', ridge)])
# Select the optimal number of parcels with grid search
clf = GridSearchCV(clf, {'ward__n_clusters': [10, 20, 30]}, n_jobs=1, cv=cv)
clf.fit(X, y)  # set the best parameters
coef_ = clf.best_estimator_.steps[-1][1].coef_
coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_)
coef_agglomeration_ = coef_.reshape(size, size)

# Anova univariate feature selection followed by BayesianRidge
f_regression = mem.cache(feature_selection.f_regression)  # caching function
Example #13
import os

import numpy as np

from sklearn.datasets import fetch_mldata, get_data_home
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.utils import Memory
from sklearn.kernel_approximation import Nystroem
from sklearn.kernel_approximation import RBFSampler
from sklearn.metrics import zero_one_loss
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import check_array
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# Memoize the data extraction and memory map the resulting
# train / test splits in readonly mode
memory = Memory(os.path.join(get_data_home(), 'mnist_benchmark_data'),
                mmap_mode='r')


@memory.cache
def load_data(dtype=np.float32, order='F'):
    """Load the data, then cache and memmap the train/test split"""
    ######################################################################
    # Load dataset
    print("Loading dataset...")
    data = fetch_mldata('MNIST original')
    X = check_array(data['data'], dtype=dtype, order=order)
    y = data["target"]

    # Normalize features
    X = X / 255
Example #14
def build_model(cv=StratifiedKFold(3)):
    """Build a full nlp classification pipeline with GridSearchCV
       for best parameters.
    
    ARGUMENTS:
        X_train: training features (df or array)
        y_train: training labels (df or array)
        cv: type of CV, default is StratifiedKFold(3)
        
    RETURNS:
        cv: grid search object that can be fitted to the data to 
        transform it and find the best parameters.
    """

    # define classifier and scoring function
    clf = SGDClassifier(loss='log',
                        fit_intercept=False,
                        class_weight='balanced',
                        max_iter=5,
                        tol=None,
                        random_state=1,
                        n_jobs=-1)
    scorer = make_scorer(f1_score, average='weighted')

    # create temporary folder to store pipeline transformers (cache)
    cachedir = mkdtemp()
    memory = Memory(location=cachedir, verbose=1)

    full_pipe = Pipeline(
        [('features',
          FeatureUnion([
              ('text',
               Pipeline([
                   ('text_select', MessageSelector('message')),
                   ('vect', CountVectorizer(tokenizer=tokenize_text)),
                   ('tfidf', TfidfTransformer()),
               ])),
              ('genre',
               Pipeline([
                   ('cat_select', CategoricalsSelector('genre')),
                   ('ohe', OneHotEncoder()),
               ])),
          ])), ('clf', MultiOutputClassifier(clf))],
        memory=memory)

    parameters = {
        'features__text__vect__max_df': [0.8, 0.9],
    }

    # create grid search object
    cv = GridSearchCV(full_pipe,
                      param_grid=parameters,
                      scoring=scorer,
                      cv=cv,
                      error_score='raise',
                      n_jobs=1,
                      verbose=1)

    # the temporary cache in ``cachedir`` is not cleaned up here; remove it
    # with ``rmtree(cachedir)`` once the returned grid search has been fitted
    return cv
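A hedged usage sketch: the returned grid-search object is fitted by the caller. `X_train` is assumed to be a DataFrame with 'message' and 'genre' columns and `y_train` a multi-output label frame, matching the selectors above:

# Illustrative only: X_train / y_train come from the surrounding project.
model = build_model()
model.fit(X_train, y_train)
print(model.best_params_)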
Example #15
from IPM import IPM
from sklearn.utils import Memory
from sklearn.datasets import load_svmlight_file
import os
import numpy as np
from scipy.sparse import coo_matrix, block_diag, eye, dia_matrix, csc_matrix, hstack, vstack
from scipy.sparse.linalg import spsolve
from numpy.linalg import norm

mem = Memory('./cache')


class SVM(IPM):
    def __init__(self, tao):
        os.chdir('./IPM-SVM')
        self.tao = tao
        pass

    def load_data(self, filename):

        data = load_svmlight_file(filename)
# since it could be used again. Using a pipeline in ``GridSearchCV`` triggers
# such situations. Therefore, we use the argument ``memory`` to enable caching.
#
# .. warning::
#     Note that this example is, however, only an illustration since for this
#     specific case fitting PCA is not necessarily slower than loading the
#     cache. Hence, use the ``memory`` constructor parameter when the fitting
#     of a transformer is costly.

from tempfile import mkdtemp
from shutil import rmtree
from sklearn.utils import Memory

# Create a temporary folder to store the transformers of the pipeline
cachedir = mkdtemp()
memory = Memory(cachedir=cachedir, verbose=10)
cached_pipe = Pipeline([('reduce_dim', PCA()),
                        ('classify', LinearSVC())],
                       memory=memory)

# This time, a cached pipeline will be used within the grid search
grid = GridSearchCV(cached_pipe, cv=3, n_jobs=1, param_grid=param_grid)
digits = load_digits()
grid.fit(digits.data, digits.target)

# Delete the temporary cache before exiting
rmtree(cachedir)

###############################################################################
# The ``PCA`` fitting is only computed at the evaluation of the first
# configuration of the ``C`` parameter of the ``LinearSVC`` classifier. The
Example #17
def exp(solvers, penalties, single_target, n_samples=30000, max_iter=20,
        dataset='rcv1', n_jobs=1, skip_slow=False):
    mem = Memory(cachedir=expanduser('~/cache'), verbose=0)

    if dataset == 'rcv1':
        rcv1 = fetch_rcv1()

        lbin = LabelBinarizer()
        lbin.fit(rcv1.target_names)

        X = rcv1.data
        y = rcv1.target
        y = lbin.inverse_transform(y)
        le = LabelEncoder()
        y = le.fit_transform(y)
        if single_target:
            y_n = y.copy()
            y_n[y > 16] = 1
            y_n[y <= 16] = 0
            y = y_n

    elif dataset == 'digits':
        digits = load_digits()
        X, y = digits.data, digits.target
        if single_target:
            y_n = y.copy()
            y_n[y < 5] = 1
            y_n[y >= 5] = 0
            y = y_n
    elif dataset == 'iris':
        iris = load_iris()
        X, y = iris.data, iris.target
    elif dataset == '20newspaper':
        ng = fetch_20newsgroups_vectorized()
        X = ng.data
        y = ng.target
        if single_target:
            y_n = y.copy()
            y_n[y > 4] = 1
            y_n[y <= 16] = 0
            y = y_n

    X = X[:n_samples]
    y = y[:n_samples]

    cached_fit = mem.cache(fit_single)
    out = Parallel(n_jobs=n_jobs, mmap_mode=None)(
        delayed(cached_fit)(solver, X, y,
                            penalty=penalty, single_target=single_target,
                            C=1, max_iter=max_iter, skip_slow=skip_slow)
        for solver in solvers
        for penalty in penalties)

    res = []
    idx = 0
    for solver in solvers:
        for penalty in penalties:
            if not (skip_slow and solver == 'lightning' and penalty == 'l1'):
                lr, times, train_scores, test_scores, accuracies = out[idx]
                this_res = dict(solver=solver, penalty=penalty,
                                single_target=single_target,
                                times=times, train_scores=train_scores,
                                test_scores=test_scores,
                                accuracies=accuracies)
                res.append(this_res)
            idx += 1

    with open('bench_saga.json', 'w+') as f:
        json.dump(res, f)
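An illustrative invocation of the benchmark above (assumes the rest of the script, in particular `fit_single` and the dataset fetchers, is importable):

if __name__ == '__main__':
    exp(solvers=['saga', 'liblinear'], penalties=['l2'],
        single_target=True, dataset='digits', n_jobs=2, skip_slow=True)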