Exemple #1
0
class PBIL(object):
    """
    PBIL ensemble learning classifier.

    Every generation, individuals are sampled from the probability tables stored in ``self.variables``,
    scored by an ``EDAEvaluator``, and the tables are then updated from the fittest share of the population.
    """

    # Scalars handed to PBILLogger. Each entry maps (population, best of last generation, best overall)
    # to a single number.
    train_scalars = {
        'fitness_mean':
        lambda population, last, overall: np.mean(
            [x.fitness for x in population]),
        'fitness_last':
        lambda population, last, overall: last.fitness,
        'fitness_overall':
        lambda population, last, overall: overall.fitness
    }

    def __init__(self,
                 resources_path,
                 train_data,
                 lr=0.7,
                 selection_share=0.5,
                 n_generations=200,
                 n_individuals=75,
                 log_path=None):
        """
        Initializes a new instance of PBIL ensemble learning classifier.
        All PBIL hyper-parameters default to the values presented in the paper

        Cagnini, Henry E.L., Freitas, Alex A., Barros, Rodrigo C.
        An Evolutionary Algorithm for Learning Interpretable Ensembles of Classifiers.
        Brazilian Conference on Intelligent Systems. 2020.

        :param resources_path: Path to folder where at least two files must exist: classifiers.json and variables.json
        :type resources_path: str
        :param train_data: Training data as an object from the python weka wrapper library
        :type train_data: weka.core.dataset.Instances
        :param lr: Learning rate
        :type lr: float
        :param selection_share: How many individuals from general population with best fitness will update
            graphical models' probabilities.
        :type selection_share: float
        :param n_generations: Number of generations to run PBIL
        :type n_generations: int
        :param n_individuals: Number of individuals (solutions) to use
        :type n_individuals: int
        :param log_path: Optional: path to where metadata from this run should be stored.
        :type log_path: str
        """

        self.lr = lr  # type: float
        self.selection_share = selection_share  # type: float
        self.n_generations = n_generations  # type: int
        self.n_individuals = n_individuals  # type: int

        # Names of every class in the `generation` module that directly subclasses ClassifierWrapper.
        # inspect.getmembers already yields the class objects, so no eval() is needed to resolve them.
        self.classifier_names = [
            name
            for name, cls in inspect.getmembers(generation, inspect.isclass)
            if ClassifierWrapper in cls.__bases__
        ]  # type: list

        # Context managers guarantee both resource files are closed once parsed.
        with open(os.path.join(resources_path, 'variables.json'),
                  'r') as variables_file:
            self.variables = json.load(variables_file)  # type: dict
        with open(os.path.join(resources_path, 'classifiers.json'),
                  'r') as classifiers_file:
            self.classifier_data = json.load(classifiers_file)  # type: dict

        self.train_data = train_data  # type: Instances
        self.n_classes = len(self.train_data.class_attribute.values)

        self.evaluator = EDAEvaluator(n_folds=5, train_data=self.train_data)

        # Number of generations processed so far (incremented by update()).
        self.n_generation = 0

        # Hall of fame; only populated while run() is executing.
        self._hof = None

        if log_path is not None:
            # The logger receives its own copy so the class-level dict is never mutated.
            scalars = copy.deepcopy(self.train_scalars)
            self.logger = PBILLogger(logdir_path=log_path,
                                     histogram_names=['fitness'],
                                     scalars=scalars,
                                     text_names=['last', 'overall'])
            self.logger.log_probabilities(
                variables=self.variables)  # register first probabilities
        else:
            self.logger = None

    def sample_and_evaluate(self, seed, n_individuals):
        """
        Samples new individuals from graphical model and evaluates them.

        Members currently in the hall of fame are evaluated again together with the new samples and re-inserted
        into the hall of fame; only the newly sampled individuals are returned.

        :param seed: seed used to partition training set at every generation. The (sub)sets will be constant throughout
        all the evolutionary process, allowing a direct comparison between individuals from different generations.
        :type seed: int
        :param n_individuals: Number of individuals to sample.
        :type n_individuals: int
        :return: the recently sampled population
        :rtype: list
        """

        len_hall = len(self._hof)

        if len_hall == 0:
            parameters = {k: [] for k in self.classifier_names}
            parameters['Aggregator'] = []
            ilogs = []
        else:
            # hall of fame members occupy the first len_hall positions of every list
            parameters = {
                k: [x.options[k] for x in self._hof]
                for k in self.classifier_names
            }
            parameters['Aggregator'] = [
                x.options['Aggregator'] for x in self._hof
            ]
            ilogs = [x.log for x in self._hof]
            self._hof.clear()

        for _ in range(n_individuals):
            ilog = dict()

            for classifier_name in self.classifier_names:
                ilog[classifier_name] = np.random.choice(
                    a=self.variables[classifier_name]['params']['a'],
                    p=self.variables[classifier_name]['params']['p'])
                if ilog[classifier_name]:  # whether to include this classifier in the ensemble
                    # NOTE(review): eval() resolves the class by name in this module's scope;
                    # the names originate from the `generation` module.
                    options, cclog = eval(classifier_name).sample_options(
                        variables=self.variables,
                        classifiers=self.classifier_data)

                    ilog.update(cclog)
                    parameters[classifier_name].append(options)
                else:
                    parameters[classifier_name].append([])

            ilog['Aggregator'] = np.random.choice(
                a=self.variables['Aggregator']['params']['a'],
                p=self.variables['Aggregator']['params']['p'])
            # NOTE(review): the aggregator name comes from variables.json and is passed to eval();
            # variables.json must therefore be a trusted file.
            agg_options, alog = eval(
                ilog['Aggregator']).sample_options(variables=self.variables)
            ilog.update(alog)

            parameters['Aggregator'].append([ilog['Aggregator']] +
                                            agg_options)

            ilogs.append(ilog)

        train_aucs = self.evaluator.get_unweighted_aucs(seed=seed,
                                                        parameters=parameters)

        def skeleton_at(index):
            # Gathers options, log and fitness stored at the same position into one Skeleton.
            local_options = {
                k: parameters[k][index]
                for k in self.classifier_names
            }
            local_options['Aggregator'] = parameters['Aggregator'][index]
            return Skeleton(seed=seed,
                            log=ilogs[index],
                            options=local_options,
                            fitness=train_aucs[index])

        # hall of fame is put in the front
        for i in range(len_hall):
            self._hof.insert(skeleton_at(i))

        return [
            skeleton_at(i) for i in range(len_hall, n_individuals + len_hall)
        ]

    def update(self, population):
        """
        Updates graphical model probabilities based on the fittest population.

        The first ``int(len(population) * selection_share)`` individuals, ranked by decreasing fitness, provide
        the observations passed to ``process_update`` for every variable.

        :param population: All population from a given generation.
        :type population: list
        """

        if self.logger is not None:
            self.logger.log_probabilities(variables=self.variables)
            self.logger.log_population(population=population,
                                       halloffame=self._hof)

        # selects fittest individuals; sorting the individuals directly (stable, descending) yields the same
        # order as sorting (individual, fitness) pairs, and does not fail to unpack on an empty population
        ranked = sorted(population,
                        key=lambda ind: ind.fitness,
                        reverse=True)
        fittest = ranked[:int(len(ranked) * self.selection_share)]
        observations = pd.DataFrame([fit.log for fit in fittest])

        # update classifiers probabilities
        for variable_name, variable_data in self.variables.items():
            self.variables[variable_name] = process_update(
                ptype=variable_data['ptype'],
                variable_name=variable_name,
                variable_data=variable_data,
                observations=observations,
                lr=self.lr,
                n_generations=self.n_generations)

        self.n_generation += 1

    def run(self, seed):
        """
        Trains this classifier.

        :param seed: seed used to partition training set at every generation. The (sub)sets will be constant throughout
        all the evolutionary process, allowing a direct comparison between individuals from different generations.
        :type seed: int
        :rtype: tuple
        :return: a tuple containing two individuals.Individual objects, where the first individual is the best solution
        (according to fitness) found throughout all the evolutionary process, and the second individual the best solution
        from the last generation.
        """

        # Statistics computation
        stats = tools.Statistics(lambda ind: ind.fitness)
        for stat_name, stat_func in PBILLogger.population_operators:
            stats.register(stat_name, stat_func)

        # An object that keeps track of the best individual found so far.
        self._hof = HallOfFame(maxsize=1)  # type: HallOfFame

        best_last, _logbook = self.__run__(seed=seed,
                                           ngen=self.n_generations,
                                           stats=stats,
                                           verbose=True)

        best_overall = self._hof[0]  # type: Individual
        self._hof = None

        gc.collect()

        return best_overall, best_last

    def __run__(self, seed, ngen, stats=None, verbose=__debug__):
        """
        Do not use this method; it is the generation loop behind run().

        :param seed: seed forwarded to sampling/evaluation and to the resulting individuals.
        :type seed: int
        :param ngen: maximum number of generations to run.
        :type ngen: int
        :param stats: Optional: statistics object compiled over each generation's population.
        :param verbose: whether to print the logbook stream after every generation.
        :type verbose: bool
        :return: the best individual of the last generation and the logbook.
        :rtype: tuple
        :raises ValueError: if no generation was executed, so there is no population to pick from.
        """

        logbook = tools.Logbook()
        logbook.header = ['gen', 'nevals'] + (stats.fields if stats else [])

        early = EarlyStop()

        population = []
        for gen in range(ngen):
            # early stop
            if early.is_stopping():
                break

            # Generate a new population, already evaluated; re-evaluates halloffame with new seed
            population = self.sample_and_evaluate(
                seed=seed, n_individuals=self.n_individuals)

            self._hof.update(population)

            # Update the strategy with the evaluated individuals
            self.update(population=population)

            record = stats.compile(population) if stats is not None else {}
            logbook.record(gen=gen, nevals=len(population), **record)
            if verbose:
                print(logbook.stream)

            early.update(halloffame=self._hof, gen=gen)

        if not population:
            # Without this guard np.argmax fails on an empty list with a message unrelated to the real cause.
            raise ValueError(
                'No generation was executed; cannot select the best individual.'
            )

        fitnesses = [ind.fitness for ind in population]

        best_skeleton = population[int(np.argmax(fitnesses))]  # type: Skeleton
        best_last = Individual(seed=seed,
                               log=best_skeleton.log,
                               options=best_skeleton.options,
                               train_data=self.train_data)

        # Replace every skeleton kept in the hall of fame by an Individual built from its log and options.
        skeletons = list(self._hof)
        self._hof.clear()
        for skeleton in skeletons:
            self._hof.insert(
                Individual(seed=seed,
                           log=skeleton.log,
                           options=skeleton.options,
                           train_data=self.train_data))

        return best_last, logbook