def sample_parallel_proc():
    # reload() is a builtin on Python 2 only; use the importlib version on Python 3
    try:
        from importlib import reload
    except ImportError:
        pass
    from pyrallel import mmap_utils, model_selection
    _ = reload(mmap_utils), reload(model_selection)


    from sklearn.datasets import load_digits
    from sklearn.preprocessing import MinMaxScaler

    digits = load_digits()

    X = MinMaxScaler().fit_transform(digits.data)
    y = digits.target

    # Persist the CV folds to memory-mappable files shared by all workers.
    # Keyword arguments follow the persist_cv_splits signature used in the
    # other examples (X, y first, then name and n_cv_iter).
    digits_cv_split_filenames = mmap_utils.persist_cv_splits(
        X, y, name='digits_10', n_cv_iter=10)

    # Connect to the IPython.parallel cluster (assumes a running ipcluster)
    # and pre-warm the OS disk cache for the splits on every host.
    from IPython.parallel import Client
    client = Client()
    lb_view = client.load_balanced_view()

    mmap_utils.warm_mmap_on_cv_splits(client, digits_cv_split_filenames)
    from sklearn.svm import LinearSVC
    from collections import OrderedDict
    import numpy as np

    linear_svc_params = OrderedDict((
        ('C', np.logspace(-2, 2, 5)),
    ))
    linear_svc = LinearSVC()

    # "RandomizedGridSeach" is the class name as spelled in the pyrallel source
    linear_svc_search = model_selection.RandomizedGridSeach(lb_view)

    linear_svc_search.launch_for_splits(linear_svc, linear_svc_params, digits_cv_split_filenames)
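
# A minimal sketch (an assumption, not pyrallel's actual implementation) of
# what a persist_cv_splits-style helper does: serialize each (train, test)
# fold to a joblib pickle on a shared filesystem so that worker processes can
# memory-map the arrays instead of receiving them over the network.
import os
from joblib import dump
from sklearn.model_selection import ShuffleSplit

def persist_cv_splits_sketch(X, y, name='data', n_cv_iter=5, folder='.'):
    cv = ShuffleSplit(n_splits=n_cv_iter, test_size=0.25, random_state=0)
    filenames = []
    for i, (train, test) in enumerate(cv.split(X)):
        fold = (X[train], y[train], X[test], y[test])
        filename = os.path.join(folder, '%s_cv_%03d.pkl' % (name, i))
        dump(fold, filename)  # arrays stored so they can be loaded with mmap_mode='r'
        filenames.append(filename)
    return filenames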
Example #2
def main():
    # Imports used by this snippet; the surrounding module is additionally
    # assumed to provide `scope` with the sklearn estimators registered via
    # scope.define, `mmap_utils` from pyrallel, a `hyperselect` module with
    # IPythonTrials, and a `compute_evaluation` helper.
    from functools import partial
    import numpy as np
    import hyperopt
    from hyperopt import hp
    from sklearn.datasets import load_digits
    from sklearn.preprocessing import MinMaxScaler
    from IPython.parallel import Client  # ipyparallel.Client on newer IPython

    client = Client()
    print('n. clients: %d' % len(client))

    digits = load_digits()

    X = MinMaxScaler().fit_transform(digits.data)
    y = digits.target

    pre_processing = hp.choice('preproc_algo', [
        scope.PCA(
            n_components=1 + hp.qlognormal(
                'pca_n_comp', np.log(10), np.log(10), 1),
            whiten=hp.choice(
                'pca_whiten', [False, True])),
        scope.GMM(
            n_components=1 + hp.qlognormal(
                'gmm_n_comp', np.log(100), np.log(10), 1),
            covariance_type=hp.choice(
                'gmm_covtype', ['spherical', 'tied', 'diag', 'full'])),
        ])

    classifier = hp.choice('classifier', [
        scope.DecisionTreeClassifier(
            criterion=hp.choice('dtree_criterion', ['gini', 'entropy']),
            max_features=hp.uniform('dtree_max_features', 0, 1),
            max_depth=hp.quniform('dtree_max_depth', 1, 25, 1)),
        scope.SVC(
            C=hp.lognormal('svc_rbf_C', 0, 3),
            kernel='rbf',
            gamma=hp.lognormal('svc_rbf_gamma', 0, 2),
            tol=hp.lognormal('svc_rbf_tol', np.log(1e-3), 1)),
        ])

    sklearn_space = {'pre_processing': pre_processing,
                     'classifier': classifier}

    digits_cv_split_filenames = mmap_utils.persist_cv_splits(
        X, y, name='digits_10', n_cv_iter=10)

    mmap_utils.warm_mmap_on_cv_splits(client, digits_cv_split_filenames)

    trials = hyperselect.IPythonTrials(client)
    trials.fmin(
        partial(compute_evaluation,
            cv_split_filename=digits_cv_split_filenames[0],
            ),
        sklearn_space,
        algo=hyperopt.tpe.suggest,
        max_evals=30,
        verbose=1,
        )
    trials.wait()
    print(trials.best_trial)
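
# A hedged sketch (not part of the original examples) of the compute_evaluation
# helper the snippets assume: load one memory-mapped CV fold, fit a clone of
# the model with the given parameters and return the validation score. The
# signature mirrors the launch_for_splits call below; the hyperopt example
# above would wrap a variant that builds the model from the sampled space dict.
from joblib import load
from sklearn.base import clone

def compute_evaluation(model, cv_split_filename, params=None):
    # mmap_mode='r' lets concurrent workers on the same host share the arrays
    X_train, y_train, X_test, y_test = load(cv_split_filename, mmap_mode='r')
    model = clone(model)
    if params is not None:
        model.set_params(**params)
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)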
    def launch_for_splits(self,
                          model,
                          parameter_grid,
                          cv_split_filenames,
                          pre_warm=True,
                          collect_files_on_reset=False):
        """Launch a Grid Search on precomputed CV splits."""

        # Abort any existing processing and erase previous state
        self.reset()
        self.parameter_grid = parameter_grid

        # Mark the files for garbage collection
        if collect_files_on_reset:
            self._temp_files.extend(cv_split_filenames)

        # Warm the OS disk cache on each host with sequential reads instead
        # of having concurrent evaluation tasks compete for the same host
        # disk resources later.
        if pre_warm:
            warm_mmap_on_cv_splits(self.lb_view.client, cv_split_filenames)

        # Randomize the grid order
        random_state = check_random_state(self.random_state)
        self.all_parameters = list(ParameterGrid(parameter_grid))
        random_state.shuffle(self.all_parameters)

        for params in self.all_parameters:
            task_group = []

            for cv_split_filename in cv_split_filenames:
                task = self.lb_view.apply(compute_evaluation,
                                          model,
                                          cv_split_filename,
                                          params=params)
                task_group.append(task)

            self.task_groups.append(task_group)

        # Make it possible to chain method calls
        return self
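
# Hedged sketch of how the scores scheduled above could be collected once the
# cluster has finished: each entry in task_groups is a list of IPython
# AsyncResult objects, one per CV split, whose .get() returns the score
# computed by compute_evaluation. Attribute names match the method above, but
# this aggregation helper is illustrative, not pyrallel's actual API.
import numpy as np

def collect_mean_scores(search):
    mean_scores = []
    for params, task_group in zip(search.all_parameters, search.task_groups):
        if all(task.ready() for task in task_group):
            scores = [task.get() for task in task_group]
            mean_scores.append((np.mean(scores), params))
    # Best-scoring parameter settings first
    return sorted(mean_scores, key=lambda t: t[0], reverse=True)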
Example #4
    # This variant also forwards a `gini` flag to compute_evaluation; the
    # keyword argument is added to the signature so the reference below is
    # defined -- its default value here is an assumption.
    def launch_for_splits(self, model, parameter_grid, cv_split_filenames,
                          pre_warm=True, collect_files_on_reset=False,
                          gini=False):
        """Launch a Grid Search on precomputed CV splits."""

        # Abort any existing processing and erase previous state
        self.reset()
        self.parameter_grid = parameter_grid

        # Mark the files for garbage collection
        if collect_files_on_reset:
            self._temp_files.extend(cv_split_filenames)

        # Warm the OS disk cache on each host with sequential reads instead
        # of having concurrent evaluation tasks compete for the same host
        # disk resources later.
        if pre_warm:
            warm_mmap_on_cv_splits(self.lb_view.client, cv_split_filenames)

        # Randomize the grid order
        random_state = check_random_state(self.random_state)
        self.all_parameters = list(ParameterGrid(parameter_grid))
        random_state.shuffle(self.all_parameters)

        for params in self.all_parameters:
            task_group = []

            for cv_split_filename in cv_split_filenames:
                task = self.lb_view.apply(
                    compute_evaluation,
                    model, cv_split_filename, gini=gini, params=params)
                task_group.append(task)

            self.task_groups.append(task_group)

        # Make it possible to chain method calls
        return self