def setupJoblib(ipp_profile='default'):
    """Make an ipyparallel cluster the default joblib backend.

    Connects to the ipyparallel cluster selected by ``ipp_profile``, creates
    a load-balanced view on it, wraps that view in an
    ``IPythonParallelBackend`` and registers the backend under the name
    ``'ipyparallel'`` with ``make_default=True``.

    The client, the view and the backend are kept in the module-level
    globals ``joblib_rc``, ``joblib_view`` and ``joblib_be``.

    Parameters
    ----------
    ipp_profile : str, optional
        Profile name handed to ``ipyparallel.Client``.
    """
    from sklearn.externals.joblib import (
        Parallel,
        parallel_backend,
        register_parallel_backend,
    )
    import ipyparallel as ipp
    from ipyparallel.joblib import IPythonParallelBackend

    global joblib_rc, joblib_view, joblib_be

    def _shared_backend():
        # Resolved at call time, so the factory always hands out whatever
        # backend instance the module-level global currently holds.
        return joblib_be

    joblib_rc = ipp.Client(profile=ipp_profile)
    joblib_view = joblib_rc.load_balanced_view()
    joblib_be = IPythonParallelBackend(view=joblib_view)

    register_parallel_backend(
        'ipyparallel',
        _shared_backend,
        make_default=True,
    )
def tune_parameters_RL(X, estimator, non_negative=0, distributed=0,
                       scheduler_host="", coeff_penalty_range=(0.0001, 1, 10),
                       fit_params={}, scoring_function=None,
                       random_state=None):
    """
    Parameters tuner.

    It tunes the parameters of a representations learning estimator using
    3-splits monte carlo sampling cross validation.

    Parameters
    ----------
    X: array-like, shape=(n_samples, n_features)
        The matrix to decompose and analyse.

    estimator: RepresentationLearning class
        The estimator you want to use to analyse the matrix. It must not be
        None. Its ``non_negativity`` attribute is overwritten with
        ``non_negative``.

    non_negative: boolean, optional
        Value assigned to ``estimator.non_negativity``.

    distributed: int, optional
        If 0 the parameters research will be executed in parallel on the
        computer the script is launched.
        If 1 the parameters research will be executed sequentially.
        If 2 the parameters research will be distributed on multiple machines
        connected by dask. In this case also scheduler_host must be
        specified.

    scheduler_host: string, optional
        If distributed=2 it is necessary to specify the scheduler of the dask
        network. The string must be "ip_address:port", for example:
        "10.251.61.226:8786"

    coeff_penalty_range: float tuple, optional (low, high, number)
        It gives the interval in which tune the coefficient penalty and the
        number of values to try.

    fit_params: dictionary, optional
        The parameters to pass to the fitting procedure during GridSearch.
        The dictionary is only forwarded, never modified here.

    scoring_function: callable or None, default=None
        A scorer callable object / function with signature
        scorer(estimator, X, y=None).
        If None, the score method of the estimator is used.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    GridSearchCV
        The resulting (already fitted) GridSearch.

    Raises
    ------
    ValueError
        If ``estimator`` is None.
    """

    # ------------------parameters control ---------------------------------- #
    X = check_array(X)
    random_state = check_random_state(random_state)
    _check_range(coeff_penalty_range)

    if estimator is None:
        logging.error("passed estimator was None")
        raise ValueError("passed estimator was None")
    _check_estimator(estimator)
    estimator.non_negativity = non_negative

    if distributed == 2:
        if scheduler_host is None:
            # logging.ERROR is the integer level constant and is not
            # callable; the logging function is logging.error.
            logging.error("Distributed execution requires a scheduler "
                          "specification. Changing the type to parallel.")
            # NOTE(review): the message says "parallel" but 1 is documented
            # as sequential execution -- confirm which one is intended.
            distributed = 1
        else:
            # The helper's return value becomes the effective execution mode.
            distributed = _check_scheduler(scheduler_host)

    ss = MonteCarloBootstrap(n_splits=3, test_size=0.1,
                             random_state=random_state)
    params = _get_params_coeff(estimator, coeff_penalty_range,
                               representation_learning=1)

    # Sequential mode uses a single job; every other mode uses all CPUs.
    # The same rule is applied by tune_parameters_DL.
    jobs = 1 if distributed == 1 else cpu_count()
    gscv = GridSearchCV(estimator, params, cv=ss, n_jobs=jobs,
                        fit_params=fit_params, iid=True, refit=True,
                        scoring=scoring_function, verbose=1)

    if distributed == 2:
        register_parallel_backend('distributed', DistributedBackend)
        with parallel_backend('distributed', scheduler_host=scheduler_host):
            gscv.fit(X)
    else:
        gscv.fit(X)
    return gscv
def tune_parameters_DL(X, estimator=None, analysis=3, non_negative="none",
                       distributed=0, scheduler_host="", range_k=None,
                       dict_penalty_range=(0.0001, 1, 10),
                       coeff_penalty_range=(0.0001, 1, 10),
                       fit_params={}, scoring_function=None,
                       random_state=None):
    """
    Parameters tuner.

    It tunes the parameters of a dictionary learning estimator using 3-splits
    monte carlo sampling cross validation.

    Parameters
    ----------
    X: array-like, shape=(n_samples, n_features)
        The matrix to decompose and analyse.

    estimator: DictionaryLearning class, optional
        The estimator you want to use to analyse the matrix. If None only the
        research on the best number of atoms will be done (``analysis`` is
        forced to 2).

    analysis: int, optional
        The type of tuning you want to perform.
        - 0: tune together number of atoms and dictionary penalty and then
             the coefficients penalty
        - 1: tune only the penalties and take the number of atoms as specified
             in the estimator
        - 2: tune only the number of atoms
        - 3: tune all together, number of atoms and penalties

    non_negative: string, optional
        If "none" no negativity is imposed on the decomposition, if "coeff"
        only negativity on the coefficient is imposed. If "both" negativity is
        on both decomposition matrices. It is applied to the estimator only
        when the estimator's own ``non_negativity`` is "none".

    distributed: int, optional
        If 0 the parameters research will be executed in parallel on the
        computer the script is launched.
        If 1 the parameters research will be executed sequentially.
        If 2 the parameters research will be distributed on multiple machines
        connected by dask. In this case also scheduler_host must be
        specified.

    scheduler_host: string, optional
        If distributed=2 it is necessary to specify the scheduler of the dask
        network. The string must be "ip_address:port", for example:
        "10.251.61.226:8786"

    range_k: int or list, optional
        The maximum number of atoms to try when you search for the right k
        (values 2 .. range_k - 1 are tried) or the list of possible values to
        try. If None range_k will be computed as int(min(p, 0.75 * n) / 2)

    dict_penalty_range: float tuple, optional (low, high, number)
        It gives the interval in which tune the dictionary penalty and the
        number of values to try.

    coeff_penalty_range: float tuple, optional (low, high, number)
        It gives the interval in which tune the coefficient penalty and the
        number of values to try.

    fit_params: dictionary, optional
        The parameters to pass to the fitting procedure during GridSearch.
        The dictionary is only forwarded, never modified here.

    scoring_function: callable or None, default=None
        A scorer callable object / function with signature
        scorer(estimator, X, y=None).
        If None, the score method of the estimator is used.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    GridSearchCV
        The resulting (already fitted) GridSearch.

    Raises
    ------
    ValueError
        If a penalty range is None while penalties have to be tuned, or if
        ``analysis`` is not one of 0, 1, 2, 3.
    """

    # ------------------parameters control ---------------------------------- #
    X = check_array(X)
    random_state = check_random_state(random_state)
    _check_range(dict_penalty_range)
    _check_range(coeff_penalty_range)
    _check_non_negativity(non_negative, X)

    if estimator is None:
        # NOTE(review): the estimator is still handed to GridSearchCV below
        # as None in this case -- confirm a default estimator is not needed.
        analysis = 2
    else:
        _check_estimator(estimator)
        if estimator.non_negativity == "none":
            estimator.non_negativity = non_negative

    n, p = X.shape
    if range_k is None:
        # generally the optimal number of k is low
        range_k = int(min(p, 0.75 * n) / 2)

    if (analysis in [0, 1, 3] and
            (dict_penalty_range is None or coeff_penalty_range is None)):
        # Raise instead of exiting the interpreter with a success code.
        logging.error("The range cannot be None")
        raise ValueError("The range cannot be None")

    if distributed == 2:
        if scheduler_host is None:
            # logging.ERROR is the integer level constant and is not
            # callable; the logging function is logging.error.
            logging.error("Distributed execution requires a scheduler "
                          "specification. Changing the type to parallel.")
            # NOTE(review): the message says "parallel" but 1 is documented
            # as sequential execution -- confirm which one is intended.
            distributed = 1
        else:
            # The helper's return value becomes the effective execution mode.
            distributed = _check_scheduler(scheduler_host)

    # Built before the branches because analysis == 0 already needs the
    # splitter for its first search.
    ss = MonteCarloBootstrap(n_splits=3, test_size=0.1,
                             random_state=random_state)
    # Sequential mode uses a single job; every other mode uses all CPUs.
    jobs = 1 if distributed == 1 else cpu_count()

    # An integer is an exclusive upper bound; anything else is used verbatim
    # as the candidate values for k.
    if isinstance(range_k, int):
        k_values = list(range(2, range_k))
    else:
        k_values = range_k

    def _fit(search):
        # Fit the search on X, through the dask backend in distributed mode.
        if distributed == 2:
            register_parallel_backend('distributed', DistributedBackend)
            with parallel_backend('distributed',
                                  scheduler_host=scheduler_host):
                search.fit(X)
        else:
            search.fit(X)
        return search

    if analysis == 0:
        # find first the parameters on the dictionary and after the
        # coefficients
        params = _get_params_dict(estimator,
                                  dict_penalty_range=dict_penalty_range)
        params['k'] = k_values
        first_search = GridSearchCV(estimator, params, cv=ss, n_jobs=jobs,
                                    scoring=scoring_function, iid=True,
                                    refit=True, verbose=1)
        _fit(first_search)
        # The coefficient penalty is tuned on the best estimator found so far.
        estimator = first_search.best_estimator_
        params = _get_params_coeff(estimator, coeff_penalty_range)
    elif analysis == 1:
        # find only the penalties together
        params = _get_params(estimator, dict_penalty_range,
                             coeff_penalty_range)
    elif analysis == 2:
        # find only the number of atoms
        params = {'k': k_values}
    elif analysis == 3:
        # find everything together
        params = _get_params(estimator, dict_penalty_range,
                             coeff_penalty_range)
        params['k'] = k_values
    else:
        logging.error("Unknown type of research, please try with another "
                      "setting")
        raise ValueError("Unknown type of research, please try with another "
                         "setting")

    gscv = GridSearchCV(estimator, params, cv=ss, fit_params=fit_params,
                        n_jobs=jobs, iid=True, scoring=scoring_function,
                        refit=True, verbose=1)
    return _fit(gscv)
self.futures.add(future) @gen.coroutine def callback_wrapper(): result = yield _wait([future]) self.futures.remove(future) callback(result) # gets called in separate thread self.client.loop.add_callback(callback_wrapper) future.get = future.result # monkey patch to achieve AsyncResult API return future def abort_everything(self, ensure_ready=True): # Tell the client to cancel any task submitted via this instance # as joblib.Parallel will never access those results. self.client.cancel(self.futures) self.futures.clear() DistributedBackend = DaskDistributedBackend # Register the backend with any available versions of joblib if joblib: joblib.register_parallel_backend('distributed', DaskDistributedBackend) joblib.register_parallel_backend('dask.distributed', DaskDistributedBackend) if sk_joblib: sk_joblib.register_parallel_backend('distributed', DaskDistributedBackend) sk_joblib.register_parallel_backend('dask.distributed', DaskDistributedBackend)
# Read the command line; only the ipyparallel profile name is used below.
args = parser.parse_args()
profile = args.profile

# Log to "<profile>.log" inside FILE_DIR; filemode='w' truncates the file on
# every run.
logging.basicConfig(filename=os.path.join(FILE_DIR,profile+'.log'),
                    filemode='w',
                    level=logging.DEBUG)
logging.info("number of CPUs found: {0}".format(cpu_count()))
logging.info("args.profile: {0}".format(profile))

# Prepare the engines: connect a client to the cluster of this profile.
c = Client(profile=profile)
# The following command will make sure that each engine is running in
# the right working directory to access the custom function(s).
c[:].map(os.chdir, [FILE_DIR]*len(c))
logging.info("c.ids :{0}".format(str(c.ids)))

# Register a joblib backend named 'ipyparallel'; the factory builds a new
# backend around the same load-balanced view each time it is called.
bview = c.load_balanced_view()
register_parallel_backend('ipyparallel',
                          lambda : IPythonParallelBackend(view=bview))

# Get data
digits = load_digits()
# Prepare it for the custom function.
# It would be better to use cross-validation, but that is
# outside the scope of this tutorial.
X_train, X_test, y_train, y_test = train_test_split(digits.data,
                                                    digits.target,
                                                    test_size=0.3)
# Some parameters to test in parallel: 20 log-spaced values for each of
# 'C' and 'gamma'.
param_space = {
    'C': np.logspace(-6, 6, 20),
    'gamma': np.logspace(-6,1,20)
}
from sklearn.externals.joblib import Parallel, parallel_backend, register_parallel_backend import ipyparallel as ipp from ipyparallel import Client from ipyparallel.joblib import IPythonParallelBackend global joblib_rc, joblib_view, joblib_be joblib_rc = ipp.Client(profile=options.cluster) targets = None if options.cluster_nodes is not None: targets = [int(x) for x in options.cluster_nodes.split(",") if x != ""] joblib_view = joblib_rc.load_balanced_view(targets=targets) njobs = len(joblib_view) joblib_be = IPythonParallelBackend(view=joblib_view) register_parallel_backend('ipyparallel', lambda: joblib_be, make_default=True) print('will run %d jobs on %s (targets %s)' % (njobs, options.cluster, targets)) print('\n') # get features and target X = df[features] #.values y = df['label'] #.values w = df['wgt'] #.values # instantiate classifier from xgboost import XGBClassifier clf = XGBClassifier(**options.clf_params)
self.futures.add(future) @gen.coroutine def callback_wrapper(): result = yield _wait([future]) self.futures.remove(future) callback(result) # gets called in separate thread self.client.loop.add_callback(callback_wrapper) future.get = future.result # monkey patch to achieve AsyncResult API return future def abort_everything(self, ensure_ready=True): # Tell the client to cancel any task submitted via this instance # as joblib.Parallel will never access those results. self.client.cancel(self.futures) self.futures.clear() DistributedBackend = DaskDistributedBackend # Register the backend with any available versions of joblib if joblib: joblib.register_parallel_backend("distributed", DaskDistributedBackend) joblib.register_parallel_backend("dask.distributed", DaskDistributedBackend) if sk_joblib: sk_joblib.register_parallel_backend("distributed", DaskDistributedBackend) sk_joblib.register_parallel_backend("dask.distributed", DaskDistributedBackend)
# NOTE: apply_async and abort_everything are methods of the backend class
# whose header lies outside this excerpt.
def apply_async(self, func, *args, **kwargs):
    """Submit ``func`` to the executor and return its future.

    Parameters
    ----------
    func : callable
        The callable to submit.
    *args, **kwargs
        Forwarded to ``self.executor.submit``. A ``callback`` keyword, if
        present, is removed first and invoked with the result once the
        future has been waited on; ``pure=False`` is always added.

    Returns
    -------
    The submitted future, with a ``get`` attribute aliasing ``result``.
    """
    callback = kwargs.pop('callback', None)
    kwargs['pure'] = False
    future = self.executor.submit(func, *args, **kwargs)
    self.futures.add(future)

    @gen.coroutine
    def callback_wrapper():
        result = yield _wait([future])
        self.futures.remove(future)
        # The callback is optional (it defaults to None above), so only
        # invoke it when one was actually supplied.
        if callback is not None:
            callback(result)  # gets called in separate thread

    self.executor.loop.add_callback(callback_wrapper)

    future.get = future.result  # monkey patch to achieve AsyncResult API
    return future


def abort_everything(self, ensure_ready=True):
    """Cancel every future still tracked by this instance and forget them.

    ``ensure_ready`` is accepted but not used here.
    """
    # Tell the executor to cancel any task submitted via this instance
    # as joblib.Parallel will never access those results.
    self.executor.cancel(self.futures)
    self.futures.clear()


# Register the backend with any available versions of joblib
if joblib:
    joblib.register_parallel_backend('distributed', DistributedBackend)
if sk_joblib:
    sk_joblib.register_parallel_backend('distributed', DistributedBackend)
def run_task(seed, task_id, estimator_name, n_iter, n_jobs, n_folds_inner_cv,
             profile, joblib_tmp_dir, run_tmp_dir):
    """Run one estimator on one OpenML task, store the results and publish.

    Parameters
    ----------
    seed : int
        Seed forwarded to ``openml.runs.run_flow_on_task``.
    task_id : int
        ID of the OpenML task to fetch and run.
    estimator_name : str
        Key into the flow mapping of the ``EstimatorFactory``.
    n_iter : int
        Number of random search iterations, passed to the factory.
    n_jobs : int
        Number of parallel jobs, passed to the factory.
    n_folds_inner_cv : int
        Number of inner CV folds, passed to the factory.
    profile : str or None
        ipyparallel profile name. If None the flow is run without registering
        an ipyparallel backend; otherwise the function waits for the
        profile's ``ipcontroller-engine.json`` file and runs the flow inside
        the 'ipyparallel' joblib backend.
    joblib_tmp_dir : str
        Passed as ``tmp_dir`` to ``NPCachingIpyParallelBackend``.
    run_tmp_dir : str
        Directory under which ``<task_id>_<estimator_name>/run.xml`` and
        ``predictions.arff`` are written.

    Returns
    -------
    The run object returned by ``openml.runs.run_flow_on_task`` (not the
    object returned by ``publish``).
    """
    # retrieve dataset / task
    task = openml.tasks.get_task(task_id)
    num_features = task.get_X_and_y()[0].shape[1]
    indices = task.get_dataset().get_features_by_type('nominal',
                                                      [task.target_name])

    # retrieve classifier
    classifierfactory = openmlstudy14.pipeline.EstimatorFactory(
        n_folds_inner_cv, n_iter, n_jobs)
    estimator = classifierfactory.get_flow_mapping()[estimator_name](
        indices, num_features=num_features)

    print('Running task with ID %d.' % task_id)
    print('Arguments: random search iterations: %d, inner CV folds %d, '
          'n parallel jobs: %d, seed %d' %
          (n_iter, n_folds_inner_cv, n_jobs, seed))
    print('Model: %s' % str(estimator))
    flow = openml.flows.sklearn_to_flow(estimator)
    flow.tags.append('study_14')

    import time
    start_time = time.time()

    # TODO generate a flow first
    if profile is None:
        import warnings
        with warnings.catch_warnings():
            # Raw string: the pattern is a regular expression and "\." is
            # not a valid escape sequence in a normal string literal.
            warnings.filterwarnings(
                'ignore', module=r'sklearn\.externals\.joblib\.parallel')
            run = openml.runs.run_flow_on_task(task, flow, seed=seed)
    else:
        print('Using ipython parallel with scheduler file %s' % profile)

        # The path does not change between attempts, so build it once.
        profile_file = os.path.join(os.path.expanduser('~'), '.ipython',
                                    'profile_%s' % profile, 'security',
                                    'ipcontroller-engine.json')
        # Poll up to 1000 times, one second apart, for the controller's
        # connection file to appear.
        for _ in range(1000):
            try:
                with open(profile_file) as fh:
                    # The parsed content is not used afterwards. safe_load
                    # avoids constructing arbitrary objects and works
                    # without an explicit Loader argument.
                    scheduler_information = yaml.safe_load(fh)
                break
            except FileNotFoundError:
                print('scheduler file %s not found. sleeping ... zzz' %
                      profile_file)
                time.sleep(1)
                continue

        c = Client(profile=profile)
        bview = c.load_balanced_view()
        register_parallel_backend(
            'ipyparallel',
            lambda: NPCachingIpyParallelBackend(view=bview,
                                                tmp_dir=joblib_tmp_dir))

        with parallel_backend('ipyparallel'):
            run = openml.runs.run_flow_on_task(task, flow, seed=seed)

    end_time = time.time()
    run.tags.append('study_14')

    tmp_dir = os.path.join(run_tmp_dir,
                           '%s_%s' % (str(task_id), estimator_name))
    print(tmp_dir)
    try:
        os.makedirs(tmp_dir)
    except OSError as e:
        # Best effort: e.g. the directory may already exist. Report and
        # carry on; a real problem surfaces when the files are opened below.
        print(e)

    run_xml = run._create_description_xml()
    predictions_arff = arff.dumps(run._generate_arff_dict())

    with open(os.path.join(tmp_dir, 'run.xml'), 'w') as f:
        f.write(run_xml)
    with open(os.path.join(tmp_dir, 'predictions.arff'), 'w') as f:
        f.write(predictions_arff)

    run_prime = run.publish()
    print('READTHIS', estimator_name, task_id, run_prime.run_id,
          end_time - start_time)

    return run
from distributed.joblib import DistributedBackend # it is important to import joblib from sklearn if we want the distributed features to work with sklearn! from sklearn.externals.joblib import Parallel, parallel_backend, register_parallel_backend ... search = RandomizedSearchCV(model, param_space, cv=10, n_iter=1000, verbose=1) register_parallel_backend('distributed', DistributedBackend) with parallel_backend('distributed', scheduler_host='your_scheduler_host:your_port'): search.fit(digits.data, digits.target)
def use_dill_mp_backend():
    """Register ``MultiprocessingBackendDill`` as the default backend.

    The class is registered through ``register_parallel_backend`` under the
    name ``'multiprocessing'`` with ``make_default=True``.
    """
    register_parallel_backend(
        'multiprocessing',
        MultiprocessingBackendDill,
        make_default=True,
    )
# NOTE: apply_async, configure and abort_everything are methods of the
# backend class whose header lies outside this excerpt.
def apply_async(self, batch, callback=None):
    """Schedule a func to be run

    The batch is identified by its ``joblib_hash``. If ``self.result_dict``
    holds no result for that hash, the ``(hash, batch)`` pair is appended to
    ``self.job_list`` and a ``JoblibDispatch`` is returned (``callback`` is
    not used on this path). Otherwise the stored result is wrapped, together
    with ``callback``, in a ``JoblibResult``.
    """
    sig = joblib_hash(batch)
    result = self.result_dict.get(sig)
    if result is None:
        # Nothing stored for this batch yet: queue it.
        self.job_list.append((sig, batch))
        return JoblibDispatch(self)
    return JoblibResult(result, callback)

def configure(self, n_jobs=1, parallel=None, **backend_args):
    """Reconfigure the backend and return the number of workers.

    This makes it possible to reuse an existing backend instance for
    successive independent calls to Parallel with different parameters.

    With ``n_jobs == 1`` a ``FallbackToBackend`` wrapping a
    ``SequentialBackend`` is raised instead. ``backend_args`` is accepted
    but not used here.
    """
    if n_jobs == 1:
        raise FallbackToBackend(SequentialBackend())

    # Remembered so that abort_everything can re-run configure later.
    self.parallel = parallel
    return self.effective_n_jobs(n_jobs)

def abort_everything(self, ensure_ready=True):
    """Abort outstanding work; when ``ensure_ready`` is true, reconfigure
    the backend with the settings of the stored ``Parallel`` object."""
    # All jobs will be aborted here while they are still processing our backend
    if ensure_ready:
        self.configure(n_jobs=self.parallel.n_jobs, parallel=self.parallel,
                       **self.parallel._backend_args)
    return

# Make the backend class available under the name 'CMFActivity'.
register_parallel_backend('CMFActivity', CMFActivityBackend)