def setupJoblib(ipp_profile='default'):
    """Register an ipyparallel joblib backend for `ipp_profile` and make it the default."""
    from sklearn.externals.joblib import Parallel, parallel_backend, register_parallel_backend

    import ipyparallel as ipp
    from ipyparallel.joblib import IPythonParallelBackend
    global joblib_rc, joblib_view, joblib_be
    # connect to the running ipcluster and build a load-balanced view of its engines
    joblib_rc = ipp.Client(profile=ipp_profile)
    joblib_view = joblib_rc.load_balanced_view()
    joblib_be = IPythonParallelBackend(view=joblib_view)

    register_parallel_backend('ipyparallel', lambda: joblib_be, make_default=True)
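# A minimal usage sketch (not part of the original snippet): it assumes an
# ipcluster for the given profile is already running; `joblib_view` is the
# global set by setupJoblib above, everything else is standard joblib API.
def _example_ipyparallel_run():
    from sklearn.externals.joblib import Parallel, delayed, parallel_backend
    setupJoblib(ipp_profile='default')
    with parallel_backend('ipyparallel'):
        # work items are shipped to the ipyparallel engines through the
        # backend registered above
        return Parallel(n_jobs=len(joblib_view))(
            delayed(pow)(i, 2) for i in range(10))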
def tune_parameters_RL(X, estimator, non_negative=0,  distributed=0,
                       scheduler_host="", coeff_penalty_range=(0.0001, 1, 10),
                       fit_params={}, scoring_function=None,
                       random_state=None):
    """
    Parameters tuner.

    It tunes the parameters of a representation learning estimator using
    3-split Monte Carlo sampling cross-validation.

    Parameters
    ----------
    X: array-like, shape=(n_samples, n_features)
        The matrix to decompose and analyse.

    estimator: RepresentationLearning class
        The estimator to use to analyse the matrix. It cannot be None.

    non_negative: boolean, optional
        If set, a non-negativity constraint is imposed on the decomposition
        (the value is assigned to the estimator's non_negativity attribute).
    distributed: int, optional
        If 0, the parameter search runs in parallel on the machine where the
        script is launched.
        If 1, the parameter search runs sequentially.
        If 2, the parameter search is distributed over multiple machines
        connected through dask. In this case scheduler_host must also be
        specified.

    scheduler_host: string, optional
        If distributed=2, the address of the dask scheduler must be specified
        as "ip_address:port", for example: "10.251.61.226:8786".

    coeff_penalty_range: float tuple, optional (low, high, number)
        The interval in which to tune the coefficient penalty and the number
        of values to try.

    fit_params: dictionary, optional
        The parameters to pass to the fitting procedure during GridSearch.

    scoring_function: callable or None, default=None
        A scorer callable object / function with signature
        scorer(estimator, X, y=None). If None, the score method of the
        estimator is used.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    GridSearchCV
        The resulting grid search, fitted on X.

    """

    # ------------------parameters control ---------------------------------- #
    X = check_array(X)
    random_state = check_random_state(random_state)
    _check_range(coeff_penalty_range)
    if estimator is None:
        logging.error("passed estimator was None")
        raise ValueError("passed estimator was None")
    _check_estimator(estimator)

    estimator.non_negativity = non_negative

    if distributed == 2:
        if scheduler_host is None:
            logging.error("Distributed execution requires a scheduler "
                          "specification. Changing the type to parallel.")
            distributed = 1
        else:
            distributed = _check_scheduler(scheduler_host)

    ss = MonteCarloBootstrap(n_splits=3, test_size=0.1,
                             random_state=random_state)

    params = _get_params_coeff(estimator, coeff_penalty_range,
                               representation_learning=1)

    jobs = 1 if distributed == 1 else cpu_count()
    gscv = GridSearchCV(estimator, params, cv=ss, n_jobs=jobs,
                        fit_params=fit_params, iid=True, refit=True,
                        scoring=scoring_function, verbose=1)
    if distributed == 2:
        register_parallel_backend('distributed', DistributedBackend)
        with parallel_backend('distributed',
                              scheduler_host=scheduler_host):
            gscv.fit(X)
    else:
        gscv.fit(X)

    return gscv
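# Illustrative call (a sketch, not from the original source): `estimator` stands
# for any RepresentationLearning-compatible object accepted by _check_estimator;
# only the keyword arguments are taken from the signature above.
def _example_tune_rl(X, estimator):
    gscv = tune_parameters_RL(X, estimator,
                              coeff_penalty_range=(0.0001, 1, 10),
                              random_state=42)
    return gscv.best_params_, gscv.best_estimator_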
def tune_parameters_DL(X, estimator=None, analysis=3, non_negative="none",
                       distributed=0, scheduler_host="", range_k=None,
                       dict_penalty_range=(0.0001, 1, 10),
                       coeff_penalty_range=(0.0001, 1, 10),
                       fit_params = {},
                       scoring_function=None,
                       random_state=None):
    """
    Parameters tuner.

    It tunes the parameters of a dictionary learning estimator using 3-split
    Monte Carlo sampling cross-validation.

    Parameters
    ----------
    X: array-like, shape=(n_samples, n_features)
        The matrix to decompose and analyse.

    estimator: DictionaryLearning class, optional
        The estimator to use to analyse the matrix. If None, only the search
        for the best number of atoms is performed.

    analysis: int, optional
        The type of tuning to perform.
        - 0: tune the number of atoms and the dictionary penalty together,
             then the coefficient penalty
        - 1: tune only the penalties, taking the number of atoms as specified
             in the estimator
        - 2: tune only the number of atoms
        - 3: tune everything together, number of atoms and penalties

    non_negative: string, optional
        If "none", no non-negativity constraint is imposed on the
        decomposition; if "coeff", non-negativity is imposed only on the
        coefficients; if "both", non-negativity is imposed on both
        decomposition matrices.

    distributed: int, optional
        If 0, the parameter search runs in parallel on the machine where the
        script is launched.
        If 1, the parameter search runs sequentially.
        If 2, the parameter search is distributed over multiple machines
        connected through dask. In this case scheduler_host must also be
        specified.

    scheduler_host: string, optional
        If distributed=2, the address of the dask scheduler must be specified
        as "ip_address:port", for example: "10.251.61.226:8786".

    range_k: int or list, optional
        The maximum number of atoms to try when searching for the best k, or
        the list of values to try.
        If None, range_k is computed as int(min(p, 0.75 * n) / 2).

    dict_penalty_range: float tuple, optional (low, high, number)
        The interval in which to tune the dictionary penalty and the number
        of values to try.

    coeff_penalty_range: float tuple, optional (low, high, number)
        The interval in which to tune the coefficient penalty and the number
        of values to try.

    fit_params: dictionary, optional
        The parameters to pass to the fitting procedure during GridSearch.

    scoring_function: callable or None, default=None
        A scorer callable object / function with signature
        scorer(estimator, X, y=None). If None, the score method of the
        estimator is used.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    GridSearchCV
        The resulting grid search, fitted on X.

    """

    # ------------------parameters control ---------------------------------- #
    X = check_array(X)
    random_state = check_random_state(random_state)
    _check_range(dict_penalty_range)
    _check_range(coeff_penalty_range)
    _check_non_negativity(non_negative, X)

    if estimator is None:
        analysis = 2
    else:
        _check_estimator(estimator)
        if estimator.non_negativity == "none":
            estimator.non_negativity = non_negative

    n, p = X.shape
    if range_k is None:
        # generally the optimal number of atoms is low
        range_k = int(min(p, 0.75 * n) / 2)

    if (analysis in [0, 1, 3] and
       (dict_penalty_range is None or coeff_penalty_range is None)):
        logging.error("The penalty ranges cannot be None")
        raise ValueError("The penalty ranges cannot be None")

    if distributed == 2:
        if scheduler_host is None:
            logging.error("Distributed execution requires a scheduler "
                          "specification. Changing the type to parallel.")
            distributed = 1
        else:
            distributed = _check_scheduler(scheduler_host)

    ss = MonteCarloBootstrap(n_splits=3, test_size=0.1,
                             random_state=random_state)

    # find first the parameters on the dictionary and then the coefficients
    if analysis == 0:
        params = _get_params_dict(estimator,
                                  dict_penalty_range=dict_penalty_range)
        if type(range_k) is int:
            params['k'] = list(range(2, range_k))
        else:
            params['k'] = range_k

        jobs = 1 if distributed == 1 else cpu_count()
        gscv = GridSearchCV(estimator, params, cv=ss, n_jobs=jobs,
                            scoring=scoring_function,
                            iid=True, refit=True, verbose=1)
        if distributed == 2:
            register_parallel_backend('distributed', DistributedBackend)
            with parallel_backend('distributed', scheduler_host=scheduler_host):
                gscv.fit(X)
        else:
            gscv.fit(X)
        estimator = gscv.best_estimator_
        params = _get_params_coeff(estimator, coeff_penalty_range)
    # find only the penalties together
    elif analysis == 1:
        params = _get_params(estimator, dict_penalty_range,
                             coeff_penalty_range)
    # find only the number of atoms
    elif analysis == 2:
        if type(range_k) is int:
            params = {'k': list(range(2, range_k))}
        else:
            params = {'k': range_k}
    # find everything together
    elif analysis == 3:
        params = _get_params(estimator, dict_penalty_range,
                             coeff_penalty_range)

        if type(range_k) is int:
            params['k'] = list(range(2, range_k))
        else:
            params['k'] = range_k
    else:
        logging.error("Unknown type of analysis, please try with another "
                      "setting")
        raise ValueError("Unknown type of analysis, please try with another "
                         "setting")

    jobs = 1 if distributed == 1 else cpu_count()
    gscv = GridSearchCV(estimator, params, cv=ss, fit_params=fit_params,
                        n_jobs=jobs, iid=True, scoring=scoring_function,
                        refit=True, verbose=1)
    if distributed == 2:
        register_parallel_backend('distributed', DistributedBackend)
        with parallel_backend('distributed',
                              scheduler_host=scheduler_host):
            gscv.fit(X)
    else:
        gscv.fit(X)
    return gscv
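# Illustrative call (a sketch, not from the original source): `estimator` stands
# for any DictionaryLearning-compatible object accepted by _check_estimator; the
# scheduler address is the placeholder already used in the docstring above.
def _example_tune_dl_distributed(X, estimator):
    gscv = tune_parameters_DL(X, estimator, analysis=3,
                              distributed=2,
                              scheduler_host="10.251.61.226:8786",
                              range_k=20,
                              dict_penalty_range=(0.0001, 1, 10),
                              coeff_penalty_range=(0.0001, 1, 10),
                              random_state=42)
    return gscv.best_estimator_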
Example #4
        self.futures.add(future)

        @gen.coroutine
        def callback_wrapper():
            result = yield _wait([future])
            self.futures.remove(future)
            callback(result)  # gets called in separate thread

        self.client.loop.add_callback(callback_wrapper)

        future.get = future.result  # monkey patch to achieve AsyncResult API
        return future

    def abort_everything(self, ensure_ready=True):
        # Tell the client to cancel any task submitted via this instance
        # as joblib.Parallel will never access those results.
        self.client.cancel(self.futures)
        self.futures.clear()


DistributedBackend = DaskDistributedBackend


# Register the backend with any available versions of joblib
if joblib:
    joblib.register_parallel_backend('distributed', DaskDistributedBackend)
    joblib.register_parallel_backend('dask.distributed', DaskDistributedBackend)
if sk_joblib:
    sk_joblib.register_parallel_backend('distributed', DaskDistributedBackend)
    sk_joblib.register_parallel_backend('dask.distributed', DaskDistributedBackend)
Example #5
args = parser.parse_args()
profile = args.profile
logging.basicConfig(filename=os.path.join(FILE_DIR,profile+'.log'),
                    filemode='w',
                    level=logging.DEBUG)
logging.info("number of CPUs found: {0}".format(cpu_count()))
logging.info("args.profile: {0}".format(profile))

#prepare the engines
c = Client(profile=profile)
#The following command will make sure that each engine is running in
# the right working directory to access the custom function(s).
c[:].map(os.chdir, [FILE_DIR]*len(c))
logging.info("c.ids :{0}".format(str(c.ids)))
bview = c.load_balanced_view()
register_parallel_backend('ipyparallel',
                          lambda : IPythonParallelBackend(view=bview))

#Get data
digits = load_digits()
#prepare it for the custom function
#it would be better to use cross-validation
#outside the scope of this tutorial
X_train, X_test, y_train, y_test = train_test_split(digits.data,
                                                    digits.target,
                                                    test_size=0.3)
#some parameters to test in parallel
param_space = {
    'C': np.logspace(-6, 6, 20),
    'gamma': np.logspace(-6,1,20)
}
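# A plausible continuation of this snippet (a sketch, not the original author's
# code): run a grid search over param_space through the ipyparallel backend
# registered above. SVC and GridSearchCV are standard scikit-learn; the joblib
# import path below is assumed to match the one used for register_parallel_backend.
from sklearn.externals.joblib import parallel_backend
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

search = GridSearchCV(SVC(), param_space, cv=3, n_jobs=len(c.ids), verbose=1)
with parallel_backend('ipyparallel'):
    # fits for each (C, gamma, fold) combination are dispatched to the engines
    search.fit(X_train, y_train)
logging.info("best params: {0}".format(search.best_params_))
logging.info("held-out score: {0}".format(search.score(X_test, y_test)))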
Example #6
    from sklearn.externals.joblib import Parallel, parallel_backend, register_parallel_backend

    import ipyparallel as ipp
    from ipyparallel import Client
    from ipyparallel.joblib import IPythonParallelBackend
    global joblib_rc, joblib_view, joblib_be
    joblib_rc = ipp.Client(profile=options.cluster)
    targets = None
    if options.cluster_nodes is not None:
        targets = [int(x) for x in options.cluster_nodes.split(",") if x != ""]
    joblib_view = joblib_rc.load_balanced_view(targets=targets)
    njobs = len(joblib_view)
    joblib_be = IPythonParallelBackend(view=joblib_view)

    register_parallel_backend('ipyparallel',
                              lambda: joblib_be,
                              make_default=True)
    print('will run %d jobs on %s (targets %s)' %
          (njobs, options.cluster, targets))
    print('\n')

# get features and target
X = df[features]  #.values
y = df['label']  #.values
w = df['wgt']  #.values

# instantiate classifier
from xgboost import XGBClassifier

clf = XGBClassifier(**options.clf_params)
Example #7
        self.futures.add(future)

        @gen.coroutine
        def callback_wrapper():
            result = yield _wait([future])
            self.futures.remove(future)
            callback(result)  # gets called in separate thread

        self.client.loop.add_callback(callback_wrapper)

        future.get = future.result  # monkey patch to achieve AsyncResult API
        return future

    def abort_everything(self, ensure_ready=True):
        # Tell the client to cancel any task submitted via this instance
        # as joblib.Parallel will never access those results.
        self.client.cancel(self.futures)
        self.futures.clear()


DistributedBackend = DaskDistributedBackend


# Register the backend with any available versions of joblib
if joblib:
    joblib.register_parallel_backend("distributed", DaskDistributedBackend)
    joblib.register_parallel_backend("dask.distributed", DaskDistributedBackend)
if sk_joblib:
    sk_joblib.register_parallel_backend("distributed", DaskDistributedBackend)
    sk_joblib.register_parallel_backend("dask.distributed", DaskDistributedBackend)
Example #8
    def apply_async(self, func, *args, **kwargs):
        callback = kwargs.pop('callback', None)
        kwargs['pure'] = False
        future = self.executor.submit(func, *args, **kwargs)
        self.futures.add(future)

        @gen.coroutine
        def callback_wrapper():
            result = yield _wait([future])
            self.futures.remove(future)
            callback(result)  # gets called in separate thread

        self.executor.loop.add_callback(callback_wrapper)

        future.get = future.result  # monkey patch to achieve AsyncResult API
        return future

    def abort_everything(self, ensure_ready=True):
        # Tell the executor to cancel any task submitted via this instance
        # as joblib.Parallel will never access those results.
        self.executor.cancel(self.futures)
        self.futures.clear()


# Register the backend with any available versions of joblib
if joblib:
    joblib.register_parallel_backend('distributed', DistributedBackend)
if sk_joblib:
    sk_joblib.register_parallel_backend('distributed', DistributedBackend)
Example #9
def run_task(seed, task_id, estimator_name, n_iter, n_jobs, n_folds_inner_cv,
             profile, joblib_tmp_dir, run_tmp_dir):

    # retrieve dataset / task
    task = openml.tasks.get_task(task_id)
    num_features = task.get_X_and_y()[0].shape[1]
    indices = task.get_dataset().get_features_by_type('nominal',
                                                      [task.target_name])

    # retrieve classifier
    classifierfactory = openmlstudy14.pipeline.EstimatorFactory(
        n_folds_inner_cv, n_iter, n_jobs)
    estimator = classifierfactory.get_flow_mapping()[estimator_name](
        indices, num_features=num_features)

    print('Running task with ID %d.' % task_id)
    print('Arguments: random search iterations: %d, inner CV folds %d, '
          'n parallel jobs: %d, seed %d' %
          (n_iter, n_folds_inner_cv, n_jobs, seed))
    print('Model: %s' % str(estimator))
    flow = openml.flows.sklearn_to_flow(estimator)
    flow.tags.append('study_14')

    import time
    start_time = time.time()

    # TODO generate a flow first
    if profile is None:
        import warnings
        with warnings.catch_warnings():
            warnings.filterwarnings(
                'ignore', module=r'sklearn\.externals\.joblib\.parallel')
            run = openml.runs.run_flow_on_task(task, flow, seed=seed)
    else:
        print('Using ipython parallel with scheduler file %s' % profile)

        for i in range(1000):
            profile_file = os.path.join(os.path.expanduser('~'), '.ipython',
                                        'profile_%s' % profile, 'security',
                                        'ipcontroller-engine.json')
            try:
                with open(profile_file) as fh:
                    scheduler_information = yaml.safe_load(fh)
                break
            except FileNotFoundError:
                print('scheduler file %s not found. sleeping ... zzz' %
                      profile_file)
                time.sleep(1)
                continue

        c = Client(profile=profile)
        bview = c.load_balanced_view()
        register_parallel_backend(
            'ipyparallel',
            lambda: NPCachingIpyParallelBackend(view=bview,
                                                tmp_dir=joblib_tmp_dir))

        with parallel_backend('ipyparallel'):
            run = openml.runs.run_flow_on_task(task, flow, seed=seed)

    end_time = time.time()
    run.tags.append('study_14')

    tmp_dir = os.path.join(run_tmp_dir,
                           '%s_%s' % (str(task_id), estimator_name))
    print(tmp_dir)
    try:
        os.makedirs(tmp_dir)
    except Exception as e:
        print(e)
    run_xml = run._create_description_xml()
    predictions_arff = arff.dumps(run._generate_arff_dict())

    with open(tmp_dir + '/run.xml', 'w') as f:
        f.write(run_xml)
    with open(tmp_dir + '/predictions.arff', 'w') as f:
        f.write(predictions_arff)

    run_prime = run.publish()
    print('READTHIS', estimator_name, task_id, run_prime.run_id,
          end_time - start_time)

    return run
Example #10
from distributed.joblib import DistributedBackend

# it is important to import joblib from sklearn if we want the distributed features to work with sklearn!
from sklearn.externals.joblib import Parallel, parallel_backend, register_parallel_backend

...

search = RandomizedSearchCV(model, param_space, cv=10, n_iter=1000, verbose=1)

register_parallel_backend('distributed', DistributedBackend)

with parallel_backend('distributed',
                      scheduler_host='your_scheduler_host:your_port'):
    search.fit(digits.data, digits.target)
Example #11
def use_dill_mp_backend():
    register_parallel_backend('multiprocessing',
                              MultiprocessingBackendDill,
                              make_default=True)
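# Illustrative usage (a sketch, not from the original source): it assumes
# MultiprocessingBackendDill is defined elsewhere in the same module, and that
# the joblib import below matches the one register_parallel_backend came from.
# The presumable point of a dill-based backend is to serialise callables the
# stock pickler rejects, such as lambdas.
def _example_dill_run():
    from joblib import Parallel, delayed
    use_dill_mp_backend()
    # make_default=True above lets Parallel pick the dill-enabled backend
    # without an explicit backend= argument
    return Parallel(n_jobs=2)(delayed(lambda x: x * x)(i) for i in range(8))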
Example #12
        def apply_async(self, batch, callback=None):
            """Schedule a func to be run"""
            sig = joblib_hash(batch)
            result = self.result_dict.get(sig)
            if result is None:
                self.job_list.append((sig, batch))
                return JoblibDispatch(self)
            return JoblibResult(result, callback)

        def configure(self, n_jobs=1, parallel=None, **backend_args):
            """Reconfigure the backend and return the number of workers.

            This makes it possible to reuse an existing backend instance for
            successive independent calls to Parallel with different parameters.
            """

            if n_jobs == 1:
                raise FallbackToBackend(SequentialBackend())

            self.parallel = parallel
            return self.effective_n_jobs(n_jobs)

        def abort_everything(self, ensure_ready=True):
            # All jobs will be aborted here while they are still processing our backend
            if ensure_ready:
                self.configure(n_jobs=self.parallel.n_jobs,
                               parallel=self.parallel,
                               **self.parallel._backend_args)
            return

    register_parallel_backend('CMFActivity', CMFActivityBackend)