def gridSearch(options, use_datasets, numExamples, compute_mistakes=False, verbose=False, parallelize=False):
    if MODEL_KEYWORD not in options:
        print('ERROR: must specify models for grid search under "%s" key.' % MODEL_KEYWORD)
        return
    paramCombos = myProduct(options)
    partialTestCombo = partial(testCombo, use_datasets=use_datasets, numExamples=numExamples, compute_mistakes=compute_mistakes, verbose=verbose)
    if parallelize:
        from pathos.multiprocessing import Pool
        p = Pool(5)
        try:
            result = p.map_async(partialTestCombo, paramCombos)
            result = result.get(999999999)
            bestScore, bestParamsStr, bestCombo = max(result, key=lambda x:x[0])
            sys.stdout = open("best.out", "w")  # redirect the summary below into best.out
            print('Best score of %s was achieved by parameters:\n%s' % (bestScore, bestParamsStr))
        except KeyboardInterrupt:
            p.terminate()
            print "You cancelled the program!"
            sys.exit(1)
    else:
        bestScore, bestCombo, bestComboStr = float('-inf'), None, ''
        for paramCombo in paramCombos:
            score, paramsStr, _ = testCombo(paramCombo, use_datasets=use_datasets, numExamples=numExamples, compute_mistakes=compute_mistakes, verbose=verbose, parallelize=False)
            if score > bestScore:
                bestScore, bestCombo, bestComboStr = score, paramCombo, paramsStr
        print('Best score of %s was achieved by parameters:\n%s' % (bestScore, bestComboStr))
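`myProduct` is not defined in this snippet; it presumably expands the `options` dict into every combination of candidate parameter values before each combination is scored by `testCombo`. A minimal sketch of such a helper, assuming `options` maps parameter names to lists of values to try:

from itertools import product


def myProduct(options):
    """Yield one parameter dict per combination of candidate values.

    Hypothetical helper: assumes `options` maps each parameter name to a list
    of values, e.g. {'lr': [0.1, 0.01], 'depth': [3, 5]} -> 4 combinations.
    """
    keys = sorted(options)
    for values in product(*(options[k] for k in keys)):
        yield dict(zip(keys, values))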
Example #2
    def create_initial_population(self):
        """Create members of the first population randomly."""

        for _ in range(self.pop_size):
            individual = Particle(self.chromosome_size, self.fitness_function)
            if not self.pool:
                individual.calculate_fitness()
            self.add_individual_to_pop(individual)

        if self.pool:
            p = Pool(self.pool_size)
            manager = Manager()
            lock = manager.Lock()
            counter = manager.Value('i', 0)

            def pool_function(inside_lock, inside_counter, inside_member):
                inside_lock.acquire()
                inside_counter.value += 1
                inside_lock.release()

                fitness_value = inside_member.calculate_fitness(
                    gpu=inside_counter.value % 4)

                return fitness_value

            func = partial(pool_function, lock, counter)
            fitness_values = p.map(func, self.current_population[:])

            for value, member in zip(fitness_values,
                                     self.current_population[:]):
                member.fitness = value

            p.terminate()
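The Manager-backed lock and counter above implement a simple round-robin GPU assignment: each worker atomically increments the shared counter and uses `counter % 4` as its GPU index (the code assumes four GPUs). A stripped-down sketch of the same pattern, with a hypothetical `work` function standing in for `Particle.calculate_fitness`:

from functools import partial
from multiprocessing import Manager, Pool


def work(item, gpu=0):
    # Hypothetical stand-in for the real fitness evaluation.
    return item * item


def pool_function(lock, counter, item):
    # Atomically claim the next slot in a 4-way round-robin.
    with lock:
        counter.value += 1
    return work(item, gpu=counter.value % 4)


if __name__ == "__main__":
    manager = Manager()
    func = partial(pool_function, manager.Lock(), manager.Value('i', 0))
    with Pool(4) as p:
        print(p.map(func, range(8)))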
Example #3
    def run_mcmc(self):
        complete = self.chains.check_completness()
        if not complete:
            for i in range(self.nchains):
                self.chains.chains.append(
                    Chain(self.chains.chains_filename, self.covfile, nchain=i, nsteps=self.nsteps))
            pool = Pool(processes=self.nchains)
            try:
                # Without the .get(999999), you can't interrupt this with Ctrl+C.
                pool.map_async(self.mcmc, self.chains.chains).get(999999)
                pool.close()
                pool.join()
                # to skip lines after the progress bars
                print('\n' * self.nchains)
            except KeyboardInterrupt:
                pool.terminate()
        self.likelihood = self.chains.chains_to_likelihood()
        self.likelihood.stats(self.covfile)
        # [self.results[i].append(self.likelihood.pdfs[i].mean) for i in range(self.chains.dim)]
        # self.p = [self.likelihood.pdfs[i].mean for i in range(self.chains.dim)]
        self.p = self.chains.best_row_params
        self.simulation(self.spectrum.lambdas, *self.p)
        # [self.results_err[i].append([self.likelihood.pdfs[i].error_high,self.likelihood.pdfs[i].error_low]) for i in range(self.chains.dim)]
        # if(self.plot):
        self.likelihood.triangle_plots()
        self.plot_fit()
        # if convergence_test :
        self.chains.convergence_tests()
        return self.likelihood
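The `.get()` with an enormous timeout is the usual workaround for the fact that a bare `map_async().get()` (or `Pool.map`) can swallow `KeyboardInterrupt` in the parent process, as the snippet's own comment notes; asking for the result with an explicit timeout keeps Ctrl+C working so the pool can be terminated. A minimal sketch of the idiom, using a hypothetical `simulate` task in place of `self.mcmc`:

from multiprocessing import Pool


def simulate(seed):
    # Hypothetical stand-in for one MCMC chain.
    return sum(i * seed for i in range(10 ** 6))


if __name__ == "__main__":
    pool = Pool(processes=4)
    try:
        # A huge timeout on .get() keeps the parent interruptible with Ctrl+C.
        results = pool.map_async(simulate, range(4)).get(999999)
        pool.close()
        pool.join()
        print(results)
    except KeyboardInterrupt:
        pool.terminate()
        raise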
    def vw_train_and_test(self, options_list, data_file_paths):
        def init_worker():
            signal.signal(signal.SIGINT, signal.SIG_IGN)

        def run_learner(options):
            train_options = {
                'final_regressor': os.path.join(self.work_dir, id_generator()),
                'data': data_file_paths['train'],
                'cache_file': data_file_paths['train'] + '.cache'
            }
            train_options.update(options)
            test_options = {
                'data': data_file_paths['test'],
                'predictions': os.path.join(self.work_dir, id_generator()),
                'cache_file': data_file_paths['test'] + '.cache'
            }
            test_options.update(options)

            # TODO: remove the if block below.
            if 'kill_cache' in options:
                del train_options['kill_cache']
                del test_options['kill_cache']

            vw_wrapper = VW_Wrapper(verbose=False)
            vw_wrapper.train(train_options)
            predictions = vw_wrapper.test(test_options)
            os.remove(train_options['final_regressor'])
            os.remove(test_options['predictions'])
            return options, predictions

        if len(options_list) > 1:
            try:
                if not os.path.isfile(data_file_paths['test'] +
                                      '.cache') or not os.path.isfile(
                                          data_file_paths['train'] + '.cache'):
                    run_learner(options_list[0])
                pool = Pool(len(options_list), init_worker)
                result_list = pool.map_async(run_learner,
                                             options_list).get(99999999)
                pool.close()
                pool.join()
                return result_list
            except KeyboardInterrupt:
                print('  Keyboard Interrupt, exiting...')
                pool.terminate()
                pool.join()
                sys.exit(0)

        elif len(options_list) == 1:
            return [run_learner(options_list[0])]
        else:
            return []
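The `init_worker` initializer makes each worker ignore SIGINT, so a Ctrl+C is handled only by the parent, which then terminates the pool; combined with `map_async(...).get(large_timeout)` this gives a cleanly interruptible pool. A minimal sketch of that pattern in isolation, with a hypothetical `crunch` task in place of `run_learner`:

import signal
import sys
from multiprocessing import Pool


def init_worker():
    # Workers ignore SIGINT; only the parent reacts to Ctrl+C.
    signal.signal(signal.SIGINT, signal.SIG_IGN)


def crunch(x):
    # Hypothetical stand-in for training and testing one configuration.
    return x ** 2


if __name__ == "__main__":
    pool = Pool(4, init_worker)
    try:
        print(pool.map_async(crunch, range(8)).get(99999999))
        pool.close()
        pool.join()
    except KeyboardInterrupt:
        pool.terminate()
        pool.join()
        sys.exit(0)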
Example #6
    def partial_dependence(self, feature_ids, modelinstance, filter_classes=None, grid=None,
                           grid_resolution=30, n_jobs=-1, grid_range=None, sample=True,
                           sampling_strategy='random-choice', n_samples=1000,
                           bin_count=50, samples_per_bin=10, return_metadata=False):

        """
        Approximates the partial dependence of the predict_fn with respect to the
        variables passed.

        Parameters:
        -----------
        feature_ids: list
            the names/ids of the features for which partial dependence is to be computed.
            Note that the algorithm's complexity scales exponentially with additional
            features, so generally one should only look at one or two features at a
            time. These feature ids must be available in the class's associated DataSet.
            As of now, we only support looking at 1 or 2 features at a time.
        modelinstance: skater.model.model.Model subtype
            an estimator function of a fitted model used to derive predictions. Supports
            classification (binary, multi-class) and regression.
            predictions = predict_fn(data)

            Can either be a skater.model.remote.DeployedModel or a
            skater.model.local.InMemoryModel
        filter_classes: array type
            The classes to run partial dependence on. Default None invokes all classes.
            Only used in classification models.
        grid: numpy.ndarray
            2 dimensional array on which we fix values of features. Note this is
            determined automatically if not given based on the percentiles of the
            dataset.
        grid_resolution: int
            how many unique values to include in the grid. If the percentile range
            is 5% to 95%, then that range will be cut into <grid_resolution>
            equally sized bins. Defaults to 30.
        n_jobs: int
            The number of CPUs to use to compute the PDs. -1 means 'all CPUs'.
            Defaults to using all cores(-1).
        grid_range: tuple
            the percentile extrema to consider: a 2-element, increasing tuple bounded
            between 0 and 1.
        sample: boolean
            Whether to sample from the original dataset.
        sampling_strategy: string
            If sampling, which approach to take. See DataSet.generate_sample for
            details.
        n_samples: int
            The number of samples to use from the original dataset. Note this is
            only active if sample = True and sampling strategy = 'uniform'. If
            using 'uniform-over-similarity-ranks', use samples per bin
        bin_count: int
            The number of bins to use when using the similarity based sampler. Note
            this is only active if sample = True and
            sampling_strategy = 'uniform-over-similarity-ranks'.
            total samples = bin_count * samples per bin.
        samples_per_bin: int
            The number of samples to collect for each bin within the sampler. Note
            this is only active if sample = True and
            sampling_strategy = 'uniform-over-similarity-ranks'. If using
            sampling_strategy = 'uniform', use n_samples.
            total samples = bin_count * samples per bin.
        return_metadata: boolean
            whether to also return a metadata dict describing the partial dependence computation.

        :Example:
        >>> from skater.model import InMemoryModel
        >>> from skater.core.explanations import Interpretation
        >>> from sklearn.ensemble import RandomForestClassifier
        >>> from sklearn.datasets import load_boston
        >>> boston = load_boston()
        >>> X = boston.data
        >>> y = boston.target
        >>> features = boston.feature_names

        >>> rf = RandomForestClassifier()
        >>> rf.fit(X,y)


        >>> model = InMemoryModel(rf, examples = X)
        >>> interpreter = Interpretation()
        >>> interpreter.load_data(X)
        >>> feature_ids = ['ZN','CRIM']
        >>> interpreter.partial_dependence.partial_dependence(feature_ids, model)
        """

        if self.data_set is None:
            load_data_not_called_err_msg = "self.interpreter.data_set not found. " \
                                           "Please call Interpretation.load_data " \
                                           "before running this method."
            raise(exceptions.DataSetNotLoadedError(load_data_not_called_err_msg))

        feature_ids = self._check_features(feature_ids)

        if filter_classes:
            err_msg = "members of filter classes must be" \
                      "members of modelinstance.classes." \
                      "Expected members of: " \
                      "{0}\n" \
                      "got: " \
                      "{1}".format(modelinstance.target_names,
                                   filter_classes)
            assert all([i in modelinstance.target_names for i in filter_classes]), err_msg

        # TODO: There might be a better place to do this check
        if not isinstance(modelinstance, ModelType):
            raise(exceptions.ModelError("Incorrect estimator function used for computing partial dependence; try "
                                        "creating one with skater.model.local.InMemoryModel or "
                                        "skater.model.remote.DeployedModel"))

        if modelinstance.model_type == 'classifier' and modelinstance.probability is False:

            if modelinstance.unique_values is None:
                raise(exceptions.ModelError('If using classifier without probability scores, unique_values cannot '
                                            'be None'))
            self.interpreter.logger.warn("Classifiers with probability scores can be explained "
                                         "more granularly than those without scores. If a prediction method with "
                                         "scores is available, use that instead.")

        # TODO: This we can change easily to functional style
        missing_feature_ids = []
        for feature_id in feature_ids:
            if feature_id not in self.data_set.feature_ids:
                missing_feature_ids.append(feature_id)

        if missing_feature_ids:
            missing_feature_id_err_msg = "Features {0} not found in " \
                                         "Interpretation.data_set.feature_ids " \
                                         "{1}".format(missing_feature_ids, self.data_set.feature_ids)
            raise(KeyError(missing_feature_id_err_msg))

        if grid_range is None:
            grid_range = (.05, 0.95)
        else:
            if not hasattr(grid_range, "__iter__"):
                err_msg = "Grid range {} needs to be an iterable".format(grid_range)
                raise(exceptions.MalformedGridRangeError(err_msg))

        self._check_grid_range(grid_range)

        if not modelinstance.has_metadata:
            examples = self.data_set.generate_sample(strategy='random-choice',
                                                     sample=True,
                                                     n_samples_from_dataset=10)
            examples = DataManager(examples, feature_names=self.data_set.feature_ids)
            modelinstance._build_model_metadata(examples)

        # if you don't pass a grid, build one.
        grid = np.array(grid)
        if not grid.any():
            # Currently, if a given feature has fewer unique values than the value
            # of grid resolution, then the grid will be set to those unique values.
            # Otherwise it will take the percentile
            # range according with grid_resolution bins.
            grid = self.data_set.generate_grid(feature_ids,
                                               grid_resolution=grid_resolution,
                                               grid_range=grid_range)
        else:
            # want to ensure all grids have 2 axes
            if len(grid.shape) == 1 and \
                    (StaticTypes.data_types.is_string(grid[0]) or StaticTypes.data_types.is_numeric(grid[0])):
                grid = grid[:, np.newaxis].T
                grid_resolution = grid.shape[1]

        self.interpreter.logger.debug("Grid shape used for pdp: {}".format(grid.shape))
        self.interpreter.logger.debug("Grid resolution for pdp: {}".format(grid_resolution))

        # make sure data_set module is giving us correct data structure
        self._check_grid(grid, feature_ids)

        # generate data
        data_sample = self.data_set.generate_sample(strategy=sampling_strategy,
                                                    sample=sample,
                                                    n_samples_from_dataset=n_samples,
                                                    samples_per_bin=samples_per_bin,
                                                    bin_count=bin_count)

        _pdp_metadata = self._build_metadata_dict(modelinstance, feature_ids, self.data_set.feature_ids, filter_classes)

        self.interpreter.logger.debug("Shape of sampled data: {}".format(data_sample.shape))
        self.interpreter.logger.debug("Feature Ids: {}".format(feature_ids))
        self.interpreter.logger.debug("PD metadata: {}".format(_pdp_metadata))

        # cartesian product of grid
        grid_expanded = pd.DataFrame(list(product(*grid))).values

        if grid_expanded.shape[0] <= 0:
            empty_grid_expanded_err_msg = "Must have at least 1 pdp value; " \
                                          "grid shape: {}".format(grid_expanded.shape)
            raise(exceptions.MalformedGridError(empty_grid_expanded_err_msg))

        predict_fn = modelinstance._get_static_predictor()

        n_jobs = None if n_jobs < 0 else n_jobs
        pd_func = functools.partial(_compute_pd,
                                    estimator_fn=predict_fn,
                                    grid_expanded=grid_expanded,
                                    pd_metadata=_pdp_metadata,
                                    input_data=data_sample,
                                    filter_classes=filter_classes)
        arg_list = [i for i in range(grid_expanded.shape[0])]
        executor_instance = Pool(n_jobs)
        try:
            pd_list = executor_instance.map(pd_func, arg_list)
        except Exception:
            self.interpreter.logger.debug("Multiprocessing failed, going single process")
            pd_list = map(pd_func, arg_list)
        finally:
            executor_instance.close()
            executor_instance.join()
            executor_instance.terminate()
        if return_metadata:
            return pd.DataFrame(list(pd_list)), _pdp_metadata
        else:
            return pd.DataFrame(list(pd_list))
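The parallel section of `partial_dependence` follows a common pattern: freeze the shared arguments into the worker with `functools.partial`, attempt `Pool.map`, and fall back to a serial `map` if multiprocessing fails (for example because the predictor does not pickle), closing the pool either way. A minimal sketch of that pattern, with a hypothetical `evaluate_point` standing in for `_compute_pd`:

import functools
from multiprocessing import Pool


def evaluate_point(index, grid_expanded=None, estimator_fn=None):
    # Hypothetical stand-in for _compute_pd: score one row of the expanded grid.
    return estimator_fn(grid_expanded[index])


def run(grid_expanded, estimator_fn, n_jobs=None):
    func = functools.partial(evaluate_point,
                             grid_expanded=grid_expanded,
                             estimator_fn=estimator_fn)
    indices = range(len(grid_expanded))
    pool = Pool(n_jobs)
    try:
        results = pool.map(func, indices)
    except Exception:
        # e.g. estimator_fn is not picklable; fall back to a single process.
        results = list(map(func, indices))
    finally:
        pool.close()
        pool.join()
    return results


if __name__ == "__main__":
    print(run([[1, 2], [3, 4]], estimator_fn=sum, n_jobs=2))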
def bayesian_optimisation(patience=-1,
                          n_iters=-1,
                          sample_loss=None,
                          bounds=None,
                          x0=None,
                          n_pre_samples=5,
                          gp_params=None,
                          random_search=False,
                          alpha=1e-5,
                          epsilon=1e-7,
                          pool_size=1):
    """ bayesian_optimisation
    Uses Gaussian Processes to optimise the loss function `sample_loss`.
    Arguments:
    ----------
        patience: integer
            Number of iterations without improvement before stopping early.
        n_iters: integer.
            Number of iterations to run the search algorithm.
        sample_loss: function.
            Function to be optimised.
        bounds: array-like, shape = [n_params, 2].
            Lower and upper bounds on the parameters of the function `sample_loss`.
        x0: array-like, shape = [n_pre_samples, n_params].
            Array of initial points to sample the loss function for. If None, randomly
            samples from the loss function.
        n_pre_samples: integer.
            If x0 is None, samples `n_pre_samples` initial points from the loss function.
        gp_params: dictionary.
            Dictionary of parameters to pass on to the underlying Gaussian Process.
        random_search: integer.
            If truthy, the acquisition function is optimised by random search over this
            many candidate points instead of L-BFGS-B.
        alpha: double.
            Variance of the error term of the GP.
        epsilon: double.
            Precision tolerance for floats.
        pool_size: integer.
            Number of worker processes used to evaluate the initial samples in parallel.
    """

    x_list = []
    y_list = []

    iteration = 0
    no_improvement = 0
    best_fitness = 100  # initial best loss; only scores below this count as improvements

    n_params = bounds.shape[0]
    print("Create {} initial points".format(n_pre_samples))

    if x0 is None:
        for params in np.random.uniform(bounds[:, 0], bounds[:, 1],
                                        (n_pre_samples, bounds.shape[0])):
            x_list.append(params)

        if pool_size > 1:
            p = Pool(pool_size)
            losses = p.map(sample_loss, x_list)
            y_list = losses
            p.terminate()
        else:
            for params in x_list:
                y_list.append(sample_loss(params))
    else:
        for params in x0:
            x_list.append(params)

        if pool_size > 1:
            p = Pool(pool_size)
            losses = p.map(sample_loss, x_list)
            y_list = losses
            p.terminate()
        else:
            for params in x_list:
                y_list.append(sample_loss(params))

    xp = np.array(x_list)
    yp = np.array(y_list)

    # Create the GP
    if gp_params is not None:
        model = gp.GaussianProcessRegressor(**gp_params)
    else:
        kernel = gp.kernels.Matern()
        model = gp.GaussianProcessRegressor(kernel=kernel,
                                            alpha=alpha,
                                            n_restarts_optimizer=10,
                                            normalize_y=True)

    print("Use bayesian optimization to sample points.")
    while True:
        print("Iteration: {}".format(iteration + 1))
        model.fit(xp, yp)

        # Sample next hyperparameter
        if random_search:
            x_random = np.random.uniform(bounds[:, 0],
                                         bounds[:, 1],
                                         size=(random_search, n_params))
            ei = -1 * expected_improvement(x_random,
                                           model,
                                           yp,
                                           greater_is_better=False,
                                           n_params=n_params)
            next_sample = x_random[np.argmax(ei), :]
        else:
            next_sample = sample_next_hyperparameter(expected_improvement,
                                                     model,
                                                     yp,
                                                     greater_is_better=False,
                                                     bounds=bounds,
                                                     n_restarts=100)

        # Duplicates will break the GP. In case of a duplicate, we will randomly sample a next query point.
        if np.any(np.abs(next_sample - xp) <= epsilon):
            next_sample = np.random.uniform(bounds[:, 0], bounds[:, 1],
                                            bounds.shape[0])

        # Sample loss for new set of parameters
        cv_score = sample_loss(next_sample)

        # Update lists
        x_list.append(next_sample)
        y_list.append(cv_score)

        # Update xp and yp
        xp = np.array(x_list)
        yp = np.array(y_list)

        if cv_score < best_fitness:
            best_fitness = cv_score
            no_improvement = 0
        else:
            no_improvement += 1

        iteration += 1

        if iteration == n_iters or patience == no_improvement:
            break

    return xp, yp
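`expected_improvement` and `sample_next_hyperparameter` are assumed to come from the surrounding module and are not shown here. For context, a common formulation of the expected-improvement acquisition function looks roughly like the sketch below (a minimal version under standard assumptions, not necessarily the exact one used above); it scores candidate points by how much they are expected to improve on the best observed loss, and returns the negative value so the caller can either minimise it or negate it and take the argmax, as the random-search branch above does:

import numpy as np
from scipy.stats import norm


def expected_improvement(x, gaussian_process, evaluated_loss,
                         greater_is_better=False, n_params=1):
    # Minimal sketch: assumes `x` reshapes to (n_candidates, n_params) and the
    # GP is a fitted sklearn GaussianProcessRegressor.
    x_to_predict = x.reshape(-1, n_params)
    mu, sigma = gaussian_process.predict(x_to_predict, return_std=True)

    # Best observed value: max when maximising, min when minimising a loss.
    loss_optimum = np.max(evaluated_loss) if greater_is_better else np.min(evaluated_loss)
    scaling_factor = (-1) ** (not greater_is_better)

    with np.errstate(divide='ignore', invalid='ignore'):
        Z = scaling_factor * (mu - loss_optimum) / sigma
        ei = scaling_factor * (mu - loss_optimum) * norm.cdf(Z) + sigma * norm.pdf(Z)
        ei[sigma == 0.0] = 0.0

    # Negative EI, so that an optimiser can minimise it directly.
    return -1 * ei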
Example #8
    def add_new_individuals_function(self, name):
        """
        Add new individuals to the population with a given method
        (crossover, differential evolution, invasive weed, add pure individual).
        """
        start = time.time()

        if name == "Crossover":
            current_function = partial(self.population.crossover, self.selection_function)
            name = name[:9] + ' ' + self.config["selection_type"]
            iterator = range(self.num_of_crossover)
        elif name == "Differential evolution":
            current_function = partial(self.population.differential_evolution, self.config["CR"], self.config["F"])
            iterator = range(len(self.population))
        elif name == "Invasive weed":
            current_function = partial(self.population.invasive_weed, self.iteration, self.config["iter_max"],
                                       self.config["e"], self.config["sigma_init"], self.config["sigma_fin"],
                                       self.config["N_min"], self.config["N_max"])
            iterator = self.population[:]
        elif name == "Add pure new":
            current_function = self.population.add_new_individual
            iterator = range(self.num_of_new_individual)
        else:
            raise NameError("Bad type of function.")
        if self.pool:
            p = Pool(self.pool_size)
            manager = Manager()
            lock = manager.Lock()
            counter = manager.Value('i', 0)

            def pool_function(inside_lock, inside_counter, inside_member):
                inside_lock.acquire()
                inside_counter.value += 1
                inside_lock.release()

                inside_members = current_function(inside_member, gpu=inside_counter.value % 4)
                return inside_members

            func = partial(pool_function, lock, counter)

            members = p.map(func, iterator)

            if name == "Differential evolution":
                self.population.current_population = members
            else:
                try:
                    members = sum(members, [])
                except TypeError:
                    pass

                for member in members:
                    self.population.add_individual_to_pop(member)
            p.terminate()
        else:
            members = []
            for argument in iterator:
                member = current_function(argument, gpu=0)
                members.append(member)

            if name == "Differential evolution":
                self.population.current_population = members
            else:
                try:
                    members = sum(members, [])
                except TypeError:
                    pass

                for member in members:
                    self.population.add_individual_to_pop(member)

        step_time = time.time() - start

        if step_time < 120:
            print('{0} time: {1:.2f}s\n'.format(name, step_time))
        else:
            print('{0} time: {1:.2f}min\n'.format(name, step_time / 60))

        return step_time, name
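Note that `pool_function` here is a nested function that closes over `current_function`; the standard-library pickle cannot serialise such closures, so mapping it over a process pool presumably relies on a dill-based pool such as pathos (imported as `from pathos.multiprocessing import Pool` in the first example), though that is an assumption about the surrounding module. A minimal sketch showing a closure being mapped with a pathos pool:

# Assumes the pathos package is installed; its Pool serialises with dill,
# so nested functions and closures can be sent to workers.
from functools import partial
from pathos.multiprocessing import Pool


def make_worker(offset):
    def worker(x, gpu=0):
        # A closure over `offset`; plain pickle (stdlib Pool) would refuse this.
        return x + offset
    return worker


if __name__ == "__main__":
    current_function = make_worker(10)
    func = partial(current_function, gpu=0)
    p = Pool(2)
    print(p.map(func, range(5)))
    p.terminate()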
Example #9
    def modify_one_by_one_function(self, name):
        """Apply a function (local search, mutation) to all chromosomes."""
        start = time.time()
        if self.progress_bar:
            print("{}:".format(name))

        if name == "Local search":
            current_function = self.memetic_function
        elif name == "Mutation":
            current_function = self.mutation_function
            name = name[:8] + ' ' + self.config["mutation_type"]
        else:
            raise NameError("Bad type of function.")

        if self.iteration > 1:
            if name in self.logs[-2].keys():
                if self.logs[-2][name]["step_time"] < 4:
                    self.progress_bar = False
                else:
                    self.progress_bar = True

        if self.fitness_function.name in ["fully connected", "convnet"]:
            self.progress_bar = False

        if self.pool:
            p = Pool(self.pool_size)
            manager = Manager()
            lock = manager.Lock()
            counter = manager.Value('i', 0)
            if self.progress_bar:
                pbar = ProgressBar(widgets=[Percentage(), Bar(), ETA()], term_width=60, maxval=len(self.population)).start()
            else:
                pbar = None

            def pool_function(inside_lock, inside_counter, inside_member):
                inside_lock.acquire()
                inside_counter.value += 1
                inside_lock.release()

                inside_member.apply_on_chromosome(current_function, gpu=inside_counter.value % 4)

                inside_lock.acquire()
                if pbar:
                    pbar.update(inside_counter.value)
                inside_lock.release()

                return inside_member

            func = partial(pool_function, lock, counter)
            first = 1 if self.elitism and name == "Mutation" else 0

            members = p.map(func, self.population[first:])

            if self.elitism and name == "Mutation":
                members.append(self.population[0])

            self.population.current_population = members
            p.terminate()
        else:
            if self.progress_bar:
                pbar = ProgressBar(widgets=[Percentage(), Bar(), ETA()], term_width=60, maxval=len(self.population)).start()
            ignor_first = self.elitism and name == "Mutation"

            for i, member in enumerate(self.population):
                if self.progress_bar:
                    pbar.update(i + 1)
                if not ignor_first:
                    member.apply_on_chromosome(current_function)
                ignor_first = False

        if self.progress_bar:
            pbar.finish()

        step_time = time.time() - start

        if step_time < 120:
            print('{0} time: {1:.2f}s\n'.format(name, step_time))
        else:
            print('{0} time: {1:.2f}min\n'.format(name, step_time / 60))

        return step_time, name
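Updating the ProgressBar from inside the workers (as above) relies on the Manager lock serialising both the counter increment and the terminal update. An alternative that keeps all display logic in the parent is to consume `imap_unordered` results as they complete; a minimal sketch of that alternative (not what the code above does), with a hypothetical `evaluate` task:

from multiprocessing import Pool


def evaluate(member):
    # Hypothetical per-member work (e.g. mutation or local search).
    return member * member


if __name__ == "__main__":
    population = list(range(20))
    results = []
    with Pool(4) as p:
        # Results stream back as they finish, so the parent owns the progress display.
        for done, value in enumerate(p.imap_unordered(evaluate, population), start=1):
            results.append(value)
            print("\rprogress: {}/{}".format(done, len(population)), end="")
    print()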