def sample_parallel_proc():
    """Run a randomized LinearSVC grid search on memory-mapped digits CV splits.

    Persists 10 CV splits of the scaled digits dataset to disk, pre-warms the
    OS cache for them on every engine, then launches the search through
    pyrallel's model_selection helper.

    NOTE(review): `client` and `lb_view` are not defined in this function or
    imported here — presumably they are notebook/module globals (an
    IPython.parallel Client and its load-balanced view); verify before reuse.
    """
    from pyrallel import mmap_utils, model_selection
    # reload() picks up in-development edits to the pyrallel modules
    # (Python 2 builtin; the tuple result is deliberately discarded).
    _ = reload(mmap_utils), reload(model_selection)
    from sklearn.datasets import load_digits
    from sklearn.preprocessing import MinMaxScaler
    digits = load_digits()
    # Scale features to [0, 1] before fitting the linear SVM.
    X = MinMaxScaler().fit_transform(digits.data)
    y = digits.target
    # Persist 10 CV splits to disk under the 'digits_10' prefix so engines
    # can memory-map them instead of receiving the data over the wire.
    digits_cv_split_filenames = mmap_utils.persist_cv_splits('digits_10', X,
                                                            y, 10)
    mmap_utils.warm_mmap_on_cv_splits(client, digits_cv_split_filenames)
    from sklearn.svm import LinearSVC
    from collections import OrderedDict
    import numpy as np
    # Single-axis grid: 5 log-spaced values of C in [1e-2, 1e2].
    linear_svc_params = OrderedDict((
        ('C', np.logspace(-2, 2, 5)),
    ))
    linear_svc = LinearSVC()
    # NOTE(review): 'RandomizedGridSeach' looks misspelled but appears to be
    # the actual class name exported by pyrallel.model_selection — confirm
    # against the installed pyrallel version before "fixing" it.
    linear_svc_search = model_selection.RandomizedGridSeach(lb_view)
    linear_svc_search.launch_for_splits(linear_svc, linear_svc_params,
                                        digits_cv_split_filenames)
def main():
    """Hyperparameter-search the digits dataset with hyperopt over IPython engines.

    Builds a hyperopt search space (PCA-or-GMM preprocessing feeding a
    decision tree or RBF SVC), persists/pre-warms memory-mapped CV splits,
    and runs 30 TPE evaluations through an IPythonTrials object.

    NOTE(review): this function relies on names not imported in this view —
    `Client`, `load_digits`, `MinMaxScaler`, `hp`, `scope`, `np`,
    `mmap_utils`, `hyperselect`, `hyperopt`, `partial`, and
    `compute_evaluation` — presumably module-level imports elsewhere in the
    file; verify. The `print` statements are Python 2 syntax.
    """
    client = Client()
    # len(client) is the number of connected parallel engines.
    print 'n. clients: ', len(client)
    digits = load_digits()
    # Scale features to [0, 1] before any estimator sees them.
    X = MinMaxScaler().fit_transform(digits.data)
    y = digits.target
    # Preprocessing branch: PCA (with optional whitening) or a GMM; component
    # counts are drawn from quantized log-normals, offset by 1 so they are
    # always >= 1.
    pre_processing = hp.choice('preproc_algo', [
        scope.PCA(
            n_components=1 + hp.qlognormal(
                'pca_n_comp', np.log(10), np.log(10), 1),
            whiten=hp.choice(
                'pca_whiten', [False, True])),
        scope.GMM(
            n_components=1 + hp.qlognormal(
                'gmm_n_comp', np.log(100), np.log(10), 1),
            covariance_type=hp.choice(
                'gmm_covtype', ['spherical', 'tied', 'diag', 'full'])),
    ])
    # Classifier branch: decision tree vs. RBF-kernel SVC, each with its own
    # hyperparameter distributions.
    classifier = hp.choice('classifier', [
        scope.DecisionTreeClassifier(
            criterion=hp.choice('dtree_criterion', ['gini', 'entropy']),
            max_features=hp.uniform('dtree_max_features', 0, 1),
            max_depth=hp.quniform('dtree_max_depth', 1, 25, 1)),
        scope.SVC(
            C=hp.lognormal('svc_rbf_C', 0, 3),
            kernel='rbf',
            gamma=hp.lognormal('svc_rbf_gamma', 0, 2),
            tol=hp.lognormal('svc_rbf_tol', np.log(1e-3), 1)),
    ])
    sklearn_space = {'pre_processing': pre_processing,
                     'classifier': classifier}
    # Persist 10 CV splits to disk and warm each engine's OS cache so the
    # evaluation tasks read from memory-mapped files.
    digits_cv_split_filenames = mmap_utils.persist_cv_splits(
        X, y, name='digits_10', n_cv_iter=10)
    mmap_utils.warm_mmap_on_cv_splits(client, digits_cv_split_filenames)
    trials = hyperselect.IPythonTrials(client)
    # NOTE(review): only the first CV split filename is evaluated here —
    # confirm that is intentional rather than a loop left out.
    trials.fmin(
        partial(compute_evaluation,
                cv_split_filename=digits_cv_split_filenames[0],
                ),
        sklearn_space,
        algo=hyperopt.tpe.suggest,
        max_evals=30,
        verbose=1,
    )
    # Block until all outstanding engine tasks have finished.
    trials.wait()
    print trials.best_trial
def launch_for_splits(self, model, parameter_grid, cv_split_filenames,
                      pre_warm=True, collect_files_on_reset=False):
    """Schedule evaluation tasks for every (parameters, CV split) pair.

    Any previously running search is aborted first. The parameter grid is
    expanded, shuffled with this object's random state, and one task per
    (parameter set, split file) combination is submitted to the
    load-balanced view.

    Parameters
    ----------
    model : estimator to evaluate.
    parameter_grid : grid specification accepted by ParameterGrid.
    cv_split_filenames : filenames of the precomputed, memory-mapped splits.
    pre_warm : when True, sequentially read the split files on each host
        first so later tasks hit the OS cache instead of fighting for disk.
    collect_files_on_reset : when True, mark the split files for deletion
        on the next reset.

    Returns
    -------
    self, to allow method chaining.
    """
    # Drop any in-flight tasks and prior results before launching anew.
    self.reset()
    self.parameter_grid = parameter_grid

    if collect_files_on_reset:
        # Register the split files for garbage collection.
        self._temp_files.extend(cv_split_filenames)

    if pre_warm:
        # One sequential pass per host primes the page cache up front.
        warm_mmap_on_cv_splits(self.lb_view.client, cv_split_filenames)

    # Expand the grid and visit parameter combinations in random order.
    rng = check_random_state(self.random_state)
    self.all_parameters = list(ParameterGrid(parameter_grid))
    rng.shuffle(self.all_parameters)

    for params in self.all_parameters:
        # One task group per parameter set: one task per CV split file.
        group = [
            self.lb_view.apply(compute_evaluation, model, split_filename,
                               params=params)
            for split_filename in cv_split_filenames
        ]
        self.task_groups.append(group)

    # Enable call chaining on the search object.
    return self
def launch_for_splits(self, model, parameter_grid, cv_split_filenames,
                      pre_warm=True, collect_files_on_reset=False):
    """Launch a Grid Search on precomputed CV splits.

    Aborts any existing search, expands and shuffles the parameter grid,
    and submits one evaluation task per (parameter set, CV split file)
    pair to the load-balanced view.

    Parameters
    ----------
    model : estimator to evaluate.
    parameter_grid : grid specification accepted by ParameterGrid.
    cv_split_filenames : filenames of the precomputed, memory-mapped splits.
    pre_warm : when True, sequentially read the split files on each host
        first so concurrent tasks do not compete for disk reads later.
    collect_files_on_reset : when True, mark the split files for deletion
        on the next reset.

    Returns
    -------
    self, to allow method chaining.
    """
    # Abort any existing processing and erase previous state
    self.reset()
    self.parameter_grid = parameter_grid

    # Mark the files for garbage collection
    if collect_files_on_reset:
        self._temp_files.extend(cv_split_filenames)

    # Warm the OS disk cache on each host with sequential reads instead
    # of having concurrent evaluation tasks compete for the same host
    # disk resources later.
    if pre_warm:
        warm_mmap_on_cv_splits(self.lb_view.client, cv_split_filenames)

    # Randomize the grid order
    random_state = check_random_state(self.random_state)
    self.all_parameters = list(ParameterGrid(parameter_grid))
    random_state.shuffle(self.all_parameters)

    for params in self.all_parameters:
        task_group = []
        for cv_split_filename in cv_split_filenames:
            # BUGFIX: the previous version passed `gini=gini`, but `gini`
            # is bound nowhere in scope, raising NameError on the first
            # submission. Dropped to match the working sibling
            # implementation of this method.
            task = self.lb_view.apply(compute_evaluation,
                                      model, cv_split_filename,
                                      params=params)
            task_group.append(task)
        self.task_groups.append(task_group)

    # Make it possible to chain method calls
    return self