Example #1
0
def test_incremental_validation(X=None, y=None, iterations=10, verbose=1):
    if not X:
        X, y = make_classification(n_samples=500,
                                   n_features=1000,
                                   n_informative=20,
                                   n_redundant=1,
                                   n_repeated=0,
                                   n_classes=2,
                                   n_clusters_per_class=2,
                                   weights=None,
                                   class_sep=1,
                                   hypercube=False,
                                   scale=1.0,
                                   shuffle=True,
                                   random_state=0)
    X //= 10  # --> To be able to evaluate categoricalNB

    # classifiers
    nb_classifier = NaiveBayes(encode_data=True)
    nb_classifier_no_encoding = NaiveBayes(encode_data=False)
    custom_encoder = CustomOrdinalFeatureEncoder()
    cnb = CategoricalNB()

    # accumulators
    categorical_nb = []
    custom_nb_val_1 = []
    custom_nb_val_2 = []
    custom_nb_val_3 = []
    custom_nb_val_4 = []
    for i in range(iterations):
        if verbose:
            print(f"Iteration {i}")
        ts = time()
        X2 = custom_encoder.fit_transform(X)

        ts = time()
        score_2 = nb_classifier.leave_one_out_cross_val(X, y)
        custom_nb_val_1.append(time() - ts)

        ts = time()
        score_4 = cross_leave_one_out(nb_classifier, X, y)
        custom_nb_val_3.append(time() - ts)

        ts = time()
        X2 = custom_encoder.fit_transform(X)
        score_5 = cross_leave_one_out(nb_classifier_no_encoding, X2, y)
        custom_nb_val_4.append(time() - ts)

        if i == 0:
            score_1 = score_2
            scores = [score_1, score_2, score_4, score_5]
            assert all(score == scores[0] for score in scores)
    print("Categorical with scikit loo: ", np.mean(categorical_nb[1:]))
    print("Custom with scikit loo: ", np.mean(custom_nb_val_3[1:]))
    print("Custom with scikit loo (pre-encoding): ",
          np.mean(custom_nb_val_4[1:]))
    print("Custom with first incremental: ", np.mean(custom_nb_val_1[1:]))
Example #2
0
class GeneticProgrammingFlexibleLogic(OptimizationMixin, TransformerMixin,
                                      ClassifierMixin, BaseEstimator):
    """GeneticProgramming for Feature Construction and Selection.


    Parameters
    ----------

    seed : int or None
        Seed to guarantee reproducibility

    individuals : int
        Number of individuals per population

    generations : int
        Number of generations 

    mutation_probability : float
        Probability for each individual of being mutated

    select : {rank,proportionate}
        Selection strategy

    mutation : {simple,complex}
        Mutation strategy

    combine : {truncation,elitism} 
        Population combination strategy

    n_intervals : int
        Number of intervals for the discretization of continous variables

    mixed : bool
        Mix heuristic and wrapper evaluation

    mixed_percentage : float
        Percentage of total iterations to do heuristic evaluation

    metric : {accuracy,f1-score}
        Target metric for the optimization process
    
    flexible_logic: bool
        Allow different individual sizes in the generation
    
    encode_data : bool, default=True
        Encode data when data is not encoded by default with an OrdinalEncoder
    
    verbose :int {0,1}, default = 1 
        Display process progress


    Attributes
    ----------
    classifier_ : NaiveBayes
        Base classifier used for prediction

    best_features_ : array-lik of Feature
        Array of selected Feature used for transforming new data
    """
    def simple_evaluate(self, individual, X, y):
        classifier_ = NaiveBayes(encode_data=False, metric=self.metric)
        return classifier_.leave_one_out_cross_val(transform_features(
            individual[0] + individual[1], X),
                                                   y,
                                                   fit=True)

    def simple_evaluate_heuristic(self, individual, X, y):
        return compute_sufs_non_incremental(
            features=[f.transform(X) for f in chain(*individual[:2])], y=y)

    def fitness(self, population, X, y):
        evaluation = []
        for individual in population:
            evaluation.append((individual, self.evaluate(individual, X, y)))
        return evaluation

    def generate_population(self):
        population = []
        for _ in range(self.individuals):
            individual = ([], [], set())
            if self.flexible_logic:
                n_chromosomes = range(random.randint(1, self.size))
            else:
                n_chromosomes = range(self.size)

            for _ in n_chromosomes:
                operand1_feature = random.randint(0, self.n_features - 1)
                operand2_feature = random.randint(0, self.n_features - 1)
                if operand1_feature == operand2_feature:
                    op = 'OR'
                    operand1_value = random.randint(
                        0, self.unique_values[operand1_feature] - 1)
                    operand2_value = random.randint(
                        0, self.unique_values[operand1_feature] - 1)
                else:
                    op = random.choice(('OR', 'XOR', 'AND'))
                    operand1_value = random.randint(
                        0, self.unique_values[operand1_feature] - 1)
                    operand2_value = random.randint(
                        0, self.unique_values[operand2_feature] - 1)
                operands = []
                operands.append((operand1_feature, operand1_value))
                operands.append((operand2_feature, operand2_value))
                individual[1].append(
                    create_feature(operator=op, operands=operands))
            n_og_features = random.randint(0, self.n_features - 1)
            features = list(range(self.n_features))
            for f in random.sample(features, n_og_features):
                individual[0].append(DummyFeatureConstructor(feature_index=f))
                individual[2].add(f)
            population.append(individual)
        return population

    def mutate_complex(self, population, **kwargs):
        new_population = []
        for individual in population:
            if random.random() < self.mutation_probability:
                chromosomes_index = []
                if self.flexible_logic:
                    if len(individual[1]) > 0:
                        chromosomes_index = random.sample(
                            list(range(len(individual[1]))),
                            random.randint(1, len(individual[1])))
                    else:
                        op = random.choice(('OR', 'XOR', 'AND'))
                        operands = []
                        for _ in range(2):
                            feature_index = random.randint(
                                0, self.n_features - 1)
                            value = random.randint(
                                0, self.unique_values[feature_index] - 1)
                            operands.append((feature_index, value))
                        individual[1].append(
                            create_feature(operator=op, operands=operands))
                        new_population.append(individual)
                        continue

                else:
                    chromosomes_index = random.sample(
                        list(range(len(individual[1]))),
                        random.randint(1, len(individual[1])))

                for i in range(len(chromosomes_index)):
                    index = chromosomes_index[i]
                    if not self.flexible_logic:
                        feature = individual[1][index]
                        feature.op = random.choice(('OR', 'XOR', 'AND'))
                        for operand in feature.operands:
                            operand.feature_index = random.randint(
                                0, self.n_features - 1)
                            operand.value = random.randint(
                                0,
                                self.unique_values[operand.feature_index] - 1)
                    else:
                        a = random.random()
                        if a < 0.33:
                            feature = individual[1][index]
                            feature.op = random.choice(('OR', 'XOR', 'AND'))
                            for operand in feature.operands:
                                operand.feature_index = random.randint(
                                    0, self.n_features - 1)
                                operand.value = random.randint(
                                    0,
                                    self.unique_values[operand.feature_index] -
                                    1)
                        elif a < 0.66:
                            op = random.choice(('OR', 'XOR', 'AND'))
                            operands = []
                            for _ in range(2):
                                feature_index = random.randint(
                                    0, self.n_features - 1)
                                value = random.randint(
                                    0, self.unique_values[feature_index] - 1)
                                operands.append((feature_index, value))
                            individual[1].append(
                                create_feature(operator=op, operands=operands))

                        else:
                            del individual[1][index]
                            chromosomes_index = [
                                j - 1 if j > index else j
                                for j in chromosomes_index
                            ]

            if random.random() < self.mutation_probability:
                a = random.random()
                og_features = individual[0]
                included_features = individual[2]
                if (a < 0.33 and len(og_features) < self.n_features
                    ) or len(og_features) == 0:
                    selected = random.choice(
                        tuple(
                            set(list(range(0, self.n_features))) -
                            included_features))
                    included_features.add(selected)
                    og_features.append(DummyFeatureConstructor(selected))
                elif a < 0.66 and len(og_features) < self.n_features and len(
                        og_features) > 0:
                    selected = random.choice(
                        tuple(
                            set(list(range(0, self.n_features))) -
                            included_features))
                    index = random.randint(0, len(og_features) - 1)
                    feature = og_features[index].feature_index
                    og_features[index] = DummyFeatureConstructor(selected)
                    included_features.remove(feature)
                    included_features.add(selected)
                else:
                    index = random.randint(0, len(og_features) - 1)
                    feature = og_features[index].feature_index
                    del og_features[index]
                    included_features.remove(feature)

            if len(individual[0]) == 0 and len(individual[1]) == 0:
                og_features = individual[0]
                included_features = individual[2]
                selected = random.choice(
                    tuple(
                        set(list(range(0, self.n_features))) -
                        included_features))
                included_features.add(selected)
                og_features.append(DummyFeatureConstructor(selected))
            new_population.append(individual)
        return new_population

    def mutate_simple(self, population, **kwargs):
        new_population = []
        for individual in population:
            if random.random() < self.mutation_probability:
                chromosomes_index = []
                if self.flexible_logic:
                    if len(individual[1]) > 0:
                        chromosomes_index = random.sample(
                            list(range(len(individual[1]))),
                            random.randint(1, len(individual[1])))
                    else:
                        op = random.choice(('OR', 'XOR', 'AND'))
                        operands = []
                        for _ in range(2):
                            feature_index = random.randint(
                                0, self.n_features - 1)
                            value = random.randint(
                                0, self.unique_values[feature_index] - 1)
                            operands.append((feature_index, value))
                        individual[1].append(
                            create_feature(operator=op, operands=operands))
                        new_population.append(individual)
                        continue

                else:
                    chromosomes_index = random.sample(
                        list(range(len(individual[1]))),
                        random.randint(1, len(individual[1])))

                for i in range(len(chromosomes_index)):
                    index = chromosomes_index[i]
                    feature = individual[1][index]
                    if not self.flexible_logic:
                        feature.op = random.choice(('OR', 'XOR', 'AND'))
                        for operand in feature.operands:
                            operand.feature_index = random.randint(
                                0, self.n_features - 1)
                            operand.value = random.randint(
                                0,
                                self.unique_values[operand.feature_index] - 1)
                    else:
                        a = random.random()
                        if a < 0.33:
                            b = random.random()
                            if b < 0.2:
                                # Change operatior
                                feature.op = random.choice(
                                    ('OR', 'XOR', 'AND'))
                            elif b < 0.4:
                                # Change full operand
                                operand = feature.operands[0]
                                operand.value = random.randint(
                                    0,
                                    self.unique_values[operand.feature_index] -
                                    1)
                            elif b < 0.6:
                                # Change full operand
                                operand = feature.operands[1]
                                operand.value = random.randint(
                                    0,
                                    self.unique_values[operand.feature_index] -
                                    1)
                            elif b < 0.8:
                                # Change value
                                operand = feature.operands[0]
                                operand.value = random.randint(
                                    0,
                                    self.unique_values[operand.feature_index] -
                                    1)
                            else:
                                # Change value
                                operand = feature.operands[1]
                                operand.value = random.randint(
                                    0,
                                    self.unique_values[operand.feature_index] -
                                    1)

                        elif a < 0.66:
                            # Add feature
                            op = random.choice(('OR', 'XOR', 'AND'))
                            operands = []
                            for _ in range(2):
                                feature_index = random.randint(
                                    0, self.n_features - 1)
                                value = random.randint(
                                    0, self.unique_values[feature_index] - 1)
                                operands.append((feature_index, value))
                            individual[1].append(
                                create_feature(operator=op, operands=operands))

                        else:
                            # Remove feature
                            del individual[1][index]
                            chromosomes_index = [
                                j - 1 if j > index else j
                                for j in chromosomes_index
                            ]

            if random.random() < self.mutation_probability:
                a = random.random()
                og_features = individual[0]
                included_features = individual[2]
                if (a < 0.33 and len(og_features) < self.n_features
                    ) or len(og_features) == 0:
                    selected = random.choice(
                        tuple(
                            set(list(range(0, self.n_features))) -
                            included_features))
                    included_features.add(selected)
                    og_features.append(DummyFeatureConstructor(selected))
                elif a < 0.66 and len(og_features) < self.n_features and len(
                        og_features) > 0:
                    selected = random.choice(
                        tuple(
                            set(list(range(0, self.n_features))) -
                            included_features))
                    index = random.randint(0, len(og_features) - 1)
                    feature = og_features[index].feature_index
                    og_features[index] = DummyFeatureConstructor(selected)
                    included_features.remove(feature)
                    included_features.add(selected)
                else:
                    index = random.randint(0, len(og_features) - 1)
                    feature = og_features[index].feature_index
                    del og_features[index]
                    included_features.remove(feature)

            if len(individual[0]) == 0 and len(individual[1]) == 0:
                og_features = individual[0]
                included_features = individual[2]
                selected = random.choice(
                    tuple(
                        set(list(range(0, self.n_features))) -
                        included_features))
                included_features.add(selected)
                og_features.append(DummyFeatureConstructor(selected))
            new_population.append(individual)
        return new_population

    def elitism(self, population1, population2):
        maximum = max(population1, key=lambda x: x[1])
        minimum_index = min(enumerate(population2), key=lambda x: x[1][1])[0]
        population2[minimum_index] = maximum
        return population2

    def truncation(self, population1, population2):
        return sorted(population1 + population2,
                      reverse=True,
                      key=lambda x: x[1])[:len(population1)]

    def select_population(self, population):
        selected_individuals = []
        num_selected = len(population)
        totalFitness = sum(fitness for _, fitness in population)
        for _ in range(num_selected):
            cumulative_prob = 0.0
            r = random.random()
            for individual_with_fitness in population:
                cumulative_prob += individual_with_fitness[1] / totalFitness
                if r <= cumulative_prob:
                    selected_individuals.append(
                        self.copy_individual(individual_with_fitness[0]))
                    break
        return selected_individuals

    def select_population_rank(self, population):
        selected_individuals = []
        num_selected = len(population)
        totalRank = (num_selected * (num_selected + 1)) / 2
        population.sort(reverse=True, key=lambda x: x[1])
        for _ in range(num_selected):
            cumulative_prob = 0.0
            r = random.random()
            for i, individual_with_fitness in enumerate(population, start=1):
                cumulative_prob += (num_selected - i + 1) / totalRank
                if r <= cumulative_prob:
                    selected_individuals.append(
                        self.copy_individual(individual_with_fitness[0]))
                    break
        return selected_individuals

    def copy_individual(self, individual):
        return ([chrms.copy() for chrms in individual[0]],
                [chrms.copy()
                 for chrms in individual[1]], individual[2].copy())

    def fit(self, X, y):
        self.feature_encoder_ = CustomOrdinalFeatureEncoder()
        self.class_encoder_ = CustomLabelEncoder()

        if isinstance(X, pd.DataFrame):
            self.categories_ = X.columns
        if self.encode_data:
            X = self.feature_encoder_.fit_transform(X)
            y = self.class_encoder_.fit_transform(y)

        classifier_ = NaiveBayes(encode_data=False,
                                 n_intervals=self.n_intervals,
                                 metric=self.metric)
        self.n_features = X.shape[1]
        if self.encode_data:
            self.unique_values = [
                values.shape[0] for values in self.feature_encoder_.categories_
            ]
        else:
            self.unique_values = [
                np.unique(X[:, j]).shape[0] for j in range(X.shape[1])
            ]
        random.seed(self.seed)
        np.random.seed(self.seed)
        self.size = np.ceil(np.sqrt(X.shape[1]))
        best_individual = self.execute_algorithm(X, y)
        self.best_features = best_individual
        self.classifier_ = NaiveBayes(encode_data=False, metric=self.metric)
        self.classifier_.fit(
            np.concatenate(
                [feature.transform(X) for feature in self.best_features],
                axis=1), y)
        return self

    def execute_algorithm(self, X, y):
        if self.mixed:
            self.evaluate = self.evaluate_heuristic
        else:
            self.evaluate = self.evaluate_wrapper
        population = self.generate_population()
        population_with_fitness = self.fitness(population, X, y)
        iterator = tqdm(range(self.generations),
                        leave=False) if self.verbose else range(
                            self.generations)
        for generation in iterator:
            if self.mixed and generation > int(
                    self.generations * self.mixed_percentage
            ) and self.evaluate == self.evaluate_heuristic:
                self.evaluate = self.evaluate_wrapper
                # Reevaluate for fair combination
                population_with_fitness = self.fitness([
                    individual_with_fitness[0]
                    for individual_with_fitness in population_with_fitness
                ], X, y)
            selected_individuals = self.selection(population_with_fitness)
            crossed_individuals = selected_individuals  # self.crossover(selected_individuals)
            mutated_individuals = self.mutation(crossed_individuals, X=X, y=y)
            new_population = self.fitness(mutated_individuals, X, y)
            population_with_fitness = self.combine(population_with_fitness,
                                                   new_population)

            # Obtaining population's statistics
            if self.verbose:
                best, mean = get_max_mean(population_with_fitness)
                iterator.set_postfix({
                    "Generation":
                    generation,
                    "hit_count":
                    self.evaluate.hit_count,
                    "populationLength":
                    len(population_with_fitness),
                    "best fitness":
                    best,
                    "mean fitness":
                    mean
                })

        best_individual = max(population_with_fitness, key=lambda x: x[1])[0]
        return best_individual[0] + best_individual[1]

    def reset_evaluation(self):
        self.evaluate_wrapper = memoize_genetic(self.simple_evaluate)
        self.evaluate_heuristic = memoize_genetic(
            self.simple_evaluate_heuristic)

    def set_params(self, **params):
        super().set_params(**params)
        if "selection" in params:
            if params["selection"] not in ("rank", "proportionate"):
                raise ValueError(
                    "Unknown selection parameter expected one of : " +
                    str(tuple(["rank", "proportionate"])))
            self.selection = self.select_population_rank if "rank" in params[
                "selection"] else self.select_population
        if "combine" in params:
            if params["combine"] not in ("elitism", "truncate"):
                raise ValueError(
                    "Unknown selection parameter expected one of : " +
                    str(tuple(["elitism", "truncate"])))
            self.combine = self.elitism if "elit" in params[
                "combine"] else self.truncation
        if "mutation" in params:
            if params["mutation"] not in ("complex", "simple"):
                raise ValueError(
                    "Unknown selection parameter expected one of : " +
                    str(tuple(["complex", "simple"])))
            self.mutation = self.mutate_simple if "simple" == params[
                "mutation"] else self.mutate_complex

    def __init__(self,
                 seed=None,
                 individuals=1,
                 generations=40,
                 mutation_probability=0.2,
                 selection="rank",
                 mutation="simple",
                 combine="elitism",
                 n_intervals=5,
                 metric="accuracy",
                 flexible_logic=True,
                 verbose=False,
                 encode_data=True,
                 mixed=True,
                 mixed_percentage=0.5):
        self.mixed_percentage = mixed_percentage
        self.mixed = mixed
        self.encode_data = encode_data
        self.flexible_logic = flexible_logic
        self.verbose = verbose
        self.n_intervals = n_intervals
        self.metric = metric
        self.seed = seed
        self.individuals = individuals
        self.generations = generations
        self.mutation_probability = mutation_probability

        self.selection = selection
        self.combine = combine
        self.mutation = mutation

        allowed_selection = ('rank', 'proportionate')
        allowed_combine = ('elitism', 'truncate')
        allowed_mutation = ('complex', 'simple')

        if self.selection not in allowed_selection:
            raise ValueError(
                "Unknown selection type: %s, expected one of %s." %
                (self.selection, selection))
        if self.combine not in allowed_combine:
            raise ValueError("Unknown combine type: %s, expected one of %s." %
                             (self.combine, combine))
        if self.mutation not in allowed_mutation:
            raise ValueError(
                "Unknown selection type: %s, expected one of %s." %
                (self.mutation, mutation))

        self.selection = self.select_population_rank if "rank" in selection else self.select_population
        self.combine = self.elitism if "elit" in combine else self.truncation
        self.mutation = self.mutate_simple if "simple" in mutation else self.mutate_complex
        self.reset_evaluation()
Example #3
0
class RankerLogicalFeatureConstructor(TransformerMixin, ClassifierMixin,
                                      BaseEstimator):
    """First proposal: Hybrid-Ranker Wrapper.

    Build a ranking based on Symmetrical Uncertainty (SU) of every possible logical feature of depth 1
    (1 operator, 2 operands), using XOR, AND and OR operator. The steps are:
        - Find out combinations of values in database of every pair of features Xi, Xj:
            - Example: 
                - Xi = [1,2,3,2]
                - Xj = ['a','b','c','a']
                Possible combinations:
                    [(1,'a'),(2,'b'),(3,'c'),(2,'a')]
        - Apply operator to every combination:
            - Example: 
                - Xi = [1,2,3,2]
                - Xj = ['a','b','c','a']
                Possible combinations:
                    [(1,'a','AND'),(2,'b','AND'),(3,'c','AND'),(2,'a','AND'),
                    (1,'a','OR'),(2,'b','OR'),(3,'c','OR'),(2,'a','OR'),
                    (1,'a','XOR'),(2,'b','XOR'),(3,'c','XOR'),(2,'a','XOR')]
        - Add original variables to the list
        - Evaluate SU for every value in the list, and rank them
        - Go over the list following one of the two strategies proposed and evaluate 
          the subset based on a leave-one-out cross-validation with the NaiveBayes classifier.

    Parameters
    ----------
    strategy : str {eager,skip}
        After the ranking is built if the eager strategy is chosen we stop considering attributes
        when there is no improvement from one iteration to the next

    block_size : int, default=1
        Number of features that are added in each iteration

    encode_data : boolean
        Whether or not to encode the received data. If set to false the classifier 
        expects data to be encoded with an ordinal encoder.

    verbose : {boolean,int}
        If set to true it displays information of the remaining time 
        and inside variables.

    operators : array-like, deafult = ("XOR","AND","OR")
        Operators used for the constructed features.

    max_features : int, deafult = inf
        Maximum number of features to include in the selected subset

    max_iterations : int, deafult = inf
        Maximum number of iterations in the wrapper step.

    use_graph : bool, default = False 
        Generate Ranking from features obtained from the pruned-graph of the ACO algorithm.
        (Experimentation not carried out)

    use_initials: bool, default = False
        Force the set of initial features in the final solution. The set if trimmed with a backward elimination before-hand.

    Attributes
    ----------
    feature_encoder_ : CustomOrdinalFeatureEncoder or None
        Encodes data in ordinal way with unseen values handling if encode_data is set to True.

    class_encoder_ : LabelEncoder or None
        Encodes Data in ordinal way for the class if encode_data is set to True.

    all_feature_constructors: array-like
        List of FeatureConstructor objects with all the possible logical 
        features

    symmetrical_uncertainty_rank: array-like
        SU for every feature in all_feature_constructors

    rank : array-like
        Array of indexes corresponding to the sorted SU rank (in descending order).

    final_feature_constructors:
        Selected feature subset (list of constructors)

    classifier: NaiveBayes
        Classifier used in the wrapper and to perform predictions after fitting.

    """
    def __init__(self,
                 strategy="eager",
                 block_size=10,
                 encode_data=True,
                 n_intervals=5,
                 verbose=0,
                 operators=("AND", "OR", "XOR"),
                 max_features=float("inf"),
                 max_iterations=float("inf"),
                 metric="accuracy",
                 use_initials=False,
                 max_err=0,
                 prune=None,
                 use_graph=False):
        self.strategy = strategy
        self.block_size = max(block_size, 1)
        self.encode_data = encode_data
        self.verbose = verbose
        self.operators = operators
        self.max_features = max_features
        self.max_iterations = max_iterations
        self.n_intervals = n_intervals
        self.metric = metric
        self.max_err = max_err
        self.use_initials = use_initials
        self.prune = prune
        self.use_graph = use_graph

        allowed_strategies = ("eager", "skip")
        if self.strategy not in allowed_strategies:
            raise ValueError("Unknown operator type: %s, expected one of %s." %
                             (self.strategy, allowed_strategies))

    def fit(self, X, y):
        # Parse input
        if isinstance(y, pd.DataFrame):
            y = y.to_numpy()
        if self.encode_data:
            self.feature_encoder_ = CustomOrdinalFeatureEncoder(
                n_intervals=self.n_intervals)
            self.class_encoder_ = CustomLabelEncoder()
            X = self.feature_encoder_.fit_transform(X)
            y = self.class_encoder_.fit_transform(y)

        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        check_X_y(X, y)

        # Reset the stored results for new fit
        self.reset_evaluation()

        # Generate rank
        if self.use_graph:
            # Construct the minimum graph and create rank
            graph = AntFeatureGraphMI(seed=None, connections=1).compute_graph(
                X, y, ("AND", "OR", "XOR"))
            self.all_feature_constructors = graph.get_rank()
        elif self.prune is not None:
            # Construct the rank with pruning by selecting pais that maximise SU(X_iX_j,Y)
            feature_combinations = list(
                combinations(list(range(X.shape[1])),
                             2)) + [(i, i) for i in range(X.shape[1])]
            rank_pairs = [
                symmetrical_uncertainty_two_variables(X[:, i], X[:, j], y)
                for i, j in feature_combinations
            ]
            rank_pairs_index = np.argsort(rank_pairs)[::-1]

            # Create the unsorted list
            self.all_feature_constructors = []
            for index in rank_pairs_index[:self.prune]:
                i, j = feature_combinations[index]
                if i == j:
                    from tfg.feature_construction import create_feature
                    self.all_feature_constructors.extend([
                        create_feature("OR", [(i, n), (i, m)])
                        for n, m in combinations(np.unique(X[:, i]), 2)
                    ])
                else:
                    self.all_feature_constructors.extend(
                        construct_features(X[:, [i, j]],
                                           operators=self.operators,
                                           same_feature=False))
        else:
            # Create the unsorted list of all features
            self.all_feature_constructors = construct_features(
                X, operators=self.operators)
        if self.verbose:
            print(
                f"Total number of constructed features: {len(self.all_feature_constructors)}"
            )
        self.all_feature_constructors.extend(
            [DummyFeatureConstructor(j) for j in range(X.shape[1])])
        self.symmetrical_uncertainty_rank = []

        # Sort the ranking
        for feature_constructor in self.all_feature_constructors:
            feature = feature_constructor.transform(X)
            su = symmetrical_uncertainty(f1=feature.flatten(), f2=y)
            self.symmetrical_uncertainty_rank.append(su)

        # Store the descending order index
        self.rank = np.argsort(self.symmetrical_uncertainty_rank)[::-1]

        # If the initial variables are
        if self.use_initials:
            classifier = NaiveBayes(encode_data=False,
                                    n_intervals=self.n_intervals,
                                    metric=self.metric)
            classifier.fit(X, y)
            current_features = [
                DummyFeatureConstructor(j) for j in range(X.shape[1])
            ]

            # Store the backward result to reuse it for other executions
            self.initial_backward_features = backward_search(
                X, y, current_features, classifier)

        # Feature Subset Selection (FSS) from the rank
        self.filter_features(X, y)
        return self

    def predict(self, X):
        X, _ = self.transform(X)
        if self.encode_data:
            return self.class_encoder_.inverse_transform(
                self.classifier.predict(X))
        return self.classifier.predict(X)

    def reset_evaluation(self):
        # Reset the memoize evaluations
        self.evaluate_leave_one_out_cross_val = memoize(evaluate_leave_one_out)

    def predict_proba(self, X):
        X, _ = self.transform(X)
        return self.classifier.predict_proba(X)

    def score(self, X, y):
        X, y = self.transform(X, y)
        return self.classifier.score(X, y)

    def filter_features(self, X, y):
        '''After the rank is built this perform the greedy wrapper search'''
        check_is_fitted(self)
        self.classifier = NaiveBayes(encode_data=False,
                                     n_intervals=self.n_intervals,
                                     metric=self.metric)
        current_score = np.NINF
        first_iteration = True
        current_features = []
        current_data = None
        if self.use_initials:
            # Original Features have already been taken into account
            rank_iter = filter(
                lambda x: not isinstance(self.all_feature_constructors[x],
                                         DummyFeatureConstructor),
                iter(self.rank))

            # Deep copy to avoid issues when modifying the list
            current_features = deepcopy(self.initial_backward_features)
            current_data = np.concatenate(
                [f.transform(X) for f in current_features], axis=1)

            # Get initial LOO score
            current_score = self.evaluate_leave_one_out_cross_val(
                self.classifier, current_features, current_data, y, fit=True)
        else:
            # Iterator over the sorted list of indexes
            rank_iter = iter(self.rank)

        if self.verbose:
            progress_bar = tqdm(total=len(self.rank),
                                bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}')

        iteration = 0
        iterations_without_improvements = 0

        # Loop for including {block size} elements at a time
        # Rank is an iterator, so the for loop is not sequential!
        for feature_constructor_index in rank_iter:
            iteration += 1
            if self.verbose:
                progress_bar.set_postfix({
                    "n_features": len(current_features),
                    "score": current_score
                })
                progress_bar.update(1)
                progress_bar.refresh()

            # Add block size features
            new_X = [
                self.all_feature_constructors[feature_constructor_index].
                transform(X)
            ]
            selected_features = [
                self.all_feature_constructors[feature_constructor_index]
            ]
            for _ in range(self.block_size - 1):
                try:
                    index = next(rank_iter)
                    selected_features.append(
                        self.all_feature_constructors[index])
                    new_X.append(
                        self.all_feature_constructors[index].transform(X))
                    if self.verbose:
                        progress_bar.update(1)
                        progress_bar.refresh()
                except:
                    # Block size does not divide the number of elements in the rank. The search is halted
                    break

            # Evaluate features
            new_X = np.concatenate(new_X, axis=1)
            if iteration == 1 and not self.use_initials:
                current_data = new_X
                current_score = self.evaluate_leave_one_out_cross_val(
                    self.classifier,
                    selected_features,
                    current_data,
                    y,
                    fit=True)
                current_features = selected_features
                first_iteration = False
                if self.max_iterations <= iteration or (
                        len(current_features) +
                        self.block_size) > self.max_features:
                    break
                continue
            data = np.concatenate([current_data, new_X], axis=1)
            self.classifier.add_features(new_X, y)
            # LOO evaluation
            score = self.evaluate_leave_one_out_cross_val(self.classifier,
                                                          current_features +
                                                          selected_features,
                                                          data,
                                                          y,
                                                          fit=False)
            if score > current_score:
                current_score = score
                current_data = data
                current_features.extend(selected_features)
                iterations_without_improvements = 0
            else:
                iterations_without_improvements += 1
                # Remove last added block
                for feature_index_to_remove in range(
                        data.shape[1], data.shape[1] - new_X.shape[1], -1):
                    self.classifier.remove_feature(feature_index_to_remove - 1)
                if self.strategy == "eager" and self.max_err < iterations_without_improvements:
                    # Stops as soon as no impovement
                    break

            if self.max_iterations <= iteration or (
                    len(current_features) +
                    self.block_size) > self.max_features:
                break
        if self.verbose:
            progress_bar.close()
            print(
                f"\nFinal number of included features: {len(current_features)} - Final Score: {current_score}"
            )
        self.final_feature_constructors = current_features
        return self

    def transform(self, X, y=None):
        check_is_fitted(self)
        if isinstance(y, pd.DataFrame):
            y = y.to_numpy()
        if self.encode_data:
            X = self.feature_encoder_.transform(X)
            if y is not None:
                y = self.class_encoder_.transform(y)
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        new_X = []
        for feature_constructor in self.final_feature_constructors:
            new_X.append(feature_constructor.transform(X))
        return np.concatenate(new_X, axis=1), y
Example #4
0
def acfs_score_comparison(datasets,
                          seed,
                          base_path,
                          params,
                          n_splits=3,
                          n_repeats=5,
                          n_intervals=5,
                          metric="accuracy",
                          send_email=False,
                          email_data=dict(),
                          verbose=True):
    # List to store results and column names for the csv
    result = []
    columns = [
        "Database", "Number of attributes", "NBScore", "NBScore STD",
        "ACFCS Score", "ACFCS Score STD", "Configuration", "Nodes",
        "Contruction Matrix", "Selection Matrix", "Selected_attributes",
        "Original"
    ]
    dataset_tqdm = tqdm(datasets)

    # Instantiate the classifier
    acfcs = ACFCS(verbose=0, metric=metric)
    nb = NaiveBayes(encode_data=False, n_intervals=n_intervals, metric=metric)

    # Execute algorithm on datasets
    for database in dataset_tqdm:
        name, label = database
        if not os.path.exists(base_path + name):
            print(f"{name} doesnt' exist")
            continue
        # Assume UCI REPO like data
        test = f"{name}.test.csv"
        data = f"{name}.data.csv"
        X, y = get_X_y_from_database(base_path, name, data, test, label)

        # Update progressbar
        dataset_tqdm.set_postfix({"DATABASE": name})

        # Set up data structures to store results
        nb_score = np.zeros(shape=(len(params), n_splits * n_repeats))
        acfcs_score = np.zeros(shape=(len(params), n_splits * n_repeats))
        acfcs_selection_matrix = np.zeros(shape=(len(params),
                                                 n_splits * n_repeats))
        acfcs_construction_matrix = np.zeros(shape=(len(params),
                                                    n_splits * n_repeats))
        acfcs_nodes = np.zeros(shape=(len(params), n_splits * n_repeats))
        acfcs_dummy = np.zeros(shape=(len(params), n_splits * n_repeats))
        acfcs_selected = np.zeros(shape=(len(params), n_splits * n_repeats))

        # Create splits for the experiments
        rskf = RepeatedStratifiedKFold(n_splits=n_splits,
                                       n_repeats=n_repeats,
                                       random_state=seed)
        seed_tqdm = tqdm(rskf.split(X, y),
                         leave=False,
                         total=n_splits * n_repeats,
                         bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}'
                         ) if verbose else rskf.split(X, y)

        # Execute experiments
        for i, data in enumerate(seed_tqdm):
            train_index, test_index = data
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Encode the data
            c = CustomOrdinalFeatureEncoder(n_intervals=n_intervals)
            X_train = c.fit_transform(X_train)
            X_test = c.transform(X_test)
            l = CustomLabelEncoder()
            y_train = l.fit_transform(y_train)
            y_test = l.transform(y_test)

            # Assess the classifiers reusing info to speed up evaluation
            nb.fit(X_train, y_train)
            naive_bayes_score = nb.score(X_test, y_test)
            acfcs.reset_cache()
            for conf_index, conf in enumerate(params):
                acfcs.set_params(**conf)
                acfcs.fit(X_train, y_train, init_graph=conf_index == 0)

                # score
                acfcs_score_conf = acfcs.score(X_test, y_test)
                if verbose:
                    seed_tqdm.set_postfix({
                        "config": conf_index,
                        "nb_score": naive_bayes_score,
                        "ant_score": acfcs_score_conf
                    })

                # Get data
                n_original_features = len(
                    list(
                        filter(
                            lambda x: isinstance(x, DummyFeatureConstructor),
                            acfcs.best_features)))
                n_selected = len(acfcs.best_features)
                selection_matrix = len(acfcs.afg.pheromone_selection)
                construction_matrix = len(acfcs.afg.pheromone_construction)
                nodes = len(acfcs.afg.nodes)

                # Update
                nb_score[conf_index, i] = naive_bayes_score
                acfcs_score[conf_index, i] = acfcs_score_conf
                acfcs_selection_matrix[conf_index, i] = selection_matrix
                acfcs_construction_matrix[conf_index, i] = construction_matrix
                acfcs_nodes[conf_index, i] = nodes
                acfcs_dummy[conf_index, i] = n_original_features
                acfcs_selected[conf_index, i] = n_selected

        # Insert the final result - averaged metrics for this database.
        for conf_index, conf in enumerate(params):
            row = [
                name, X.shape[1],
                np.mean(nb_score[conf_index]),
                np.std(nb_score[conf_index]),
                np.mean(acfcs_score[conf_index]),
                np.std(acfcs_score[conf_index]), conf,
                np.mean(acfcs_nodes[conf_index]),
                np.mean(acfcs_construction_matrix[conf_index]),
                np.mean(acfcs_selection_matrix[conf_index]),
                np.mean(acfcs_selected[conf_index]),
                np.mean(acfcs_dummy[conf_index])
            ]
            result.append(row)
    result = pd.DataFrame(result, columns=columns)

    if send_email:
        from tfg.utils import send_results
        send_results("ACFCS", email_data, result)
    return result
Example #5
0
def ranker_score_comparison(datasets,
                            seed,
                            base_path,
                            params,
                            n_splits=3,
                            n_repeats=5,
                            n_intervals=5,
                            metric="accuracy",
                            send_email=False,
                            email_data=dict(),
                            share_rank=True):
    result = []
    columns = ["Database",
               "Number of attributes",
               "NBScore",
               "NBScore STD",
               "Ranker Score",
               "Ranker Score STD",
               "Configuration",
               "Combinations",
               "Selected_attributes",
               "Original"]

    dataset_tqdm = tqdm(datasets)

    # Instantiate the classifier
    r = RankerLogicalFeatureConstructor(n_intervals=n_intervals, metric=metric)
    nb = NaiveBayes(encode_data=False, n_intervals=n_intervals, metric=metric)

    # Execute algorithm on datasets
    for database in dataset_tqdm:
        name, label = database
        if not os.path.exists(base_path + name):
            print(f"{name} doesnt' exist")
            continue
        # Assume UCI REPO like data
        test = f"{name}.test.csv"
        data = f"{name}.data.csv"
        X, y = get_X_y_from_database(base_path, name, data, test, label)

        dataset_tqdm.set_postfix({"DATABASE": name})

        # Set up data structures to store results
        nb_score = np.zeros(shape=(len(params), n_splits*n_repeats))
        r_score = np.zeros(shape=(len(params), n_splits*n_repeats))
        r_combinations = np.zeros(shape=(len(params), n_splits*n_repeats))
        r_selected = np.zeros(shape=(len(params), n_splits*n_repeats))
        r_dummy = np.zeros(shape=(len(params), n_splits*n_repeats))
        r_total_constructed = np.zeros(shape=(len(params), n_splits*n_repeats))
        r_total_selected = np.zeros(shape=(len(params), n_splits*n_repeats))
        r_original_selected = np.zeros(shape=(len(params), n_splits*n_repeats))

        rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=seed)
        seed_tqdm = tqdm(rskf.split(X, y),
                         leave=False,
                         total=n_splits*n_repeats,
                         bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}')

        for i, data in enumerate(seed_tqdm):
            train_index, test_index = data
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            c = CustomOrdinalFeatureEncoder(n_intervals=n_intervals)
            X_train = c.fit_transform(X_train)
            X_test = c.transform(X_test)
            l = CustomLabelEncoder()
            y_train = l.fit_transform(y_train)
            y_test = l.transform(y_test)

            # Assess the classifiers
            nb.fit(X=X_train, y=y_train)
            naive_bayes_score = nb.score(X_test, y_test)

            for conf_index, conf in enumerate(params):
                seed_tqdm.set_postfix({"config": conf_index})
                r.set_params(**conf)
                # Fit
                if conf_index == 0 or not share_rank:
                    # The rank is computed from scratch
                    r.fit(X_train, y_train)
                else:
                    r.filter_features(r.feature_encoder_.transform(
                        X_train), r.class_encoder_.transform(y_train))

                # score
                ranker_score = r.score(X_test, y_test)

                # Get data
                n_original_features = len(list(filter(lambda x: isinstance(
                    x, DummyFeatureConstructor), r.final_feature_constructors)))
                n_combinations = len(r.all_feature_constructors)
                n_selected = len(r.final_feature_constructors)

                # Update
                nb_score[conf_index, i] = naive_bayes_score
                r_score[conf_index, i] = ranker_score
                r_combinations[conf_index, i] = n_combinations
                r_selected[conf_index, i] = n_selected
                r_dummy[conf_index, i] = n_original_features

        # Insert to final result averaged metrics for this dataset
        for conf_index, conf in enumerate(params):
            row = [name,
                   X.shape[1],
                   np.mean(nb_score[conf_index]),
                   np.std(nb_score[conf_index]),
                   np.mean(r_score[conf_index]),
                   np.std(r_score[conf_index]),
                   conf,
                   np.mean(r_combinations[conf_index]),
                   np.mean(r_selected[conf_index]),
                   np.mean(r_dummy[conf_index])]
            result.append(row)
    result = pd.DataFrame(result, columns=columns)
    if send_email:
        from tfg.utils import send_results
        send_results("RANKER", email_data, result)
    return result
Example #6
0
def genetic_score_comparison(datasets,
                             seed,
                             base_path,
                             params,
                             n_splits=3,
                             n_repeats=5,
                             n_intervals=5,
                             metric="accuracy",
                             send_email=False,
                             email_data=dict(),
                             verbose=True,
                             version=1):
    result = []
    columns = [
        "Database", "Number of attributes", "NBScore", "NBScore STD",
        "Genetic Score", "Genetic Score STD", "Configuration",
        "Selected_attributes", "Original"
    ]

    dataset_tqdm = tqdm(datasets)

    # Instantiate the classifier
    if version == 1:
        # First Version - No flexibility in the number of attributes (bad performance)
        # clf = GeneticProgramming(seed=seed, metric=metric)
        clf = GeneticProgrammingFlexibleLogic(seed=seed, metric=metric)
    elif version == 2:
        # Version with flexibility
        clf = GeneticProgrammingFlexibleLogic(seed=seed, metric=metric)
    else:
        # Guided mutation based on SU
        clf = GeneticProgrammingRankMutation(seed=seed, metric=metric)
    nb = NaiveBayes(encode_data=False, n_intervals=n_intervals, metric=metric)

    # Execute algorithm on datasets
    for database in dataset_tqdm:
        name, label = database
        if not os.path.exists(base_path + name):
            print(f"{name} doesnt' exist")
            continue
        # Assume UCI REPO like data
        test = f"{name}.test.csv"
        data = f"{name}.data.csv"
        X, y = get_X_y_from_database(base_path, name, data, test, label)

        dataset_tqdm.set_postfix({"DATABASE": name})

        # Set up data structures to store results
        nb_score = np.zeros(shape=(len(params), n_splits * n_repeats))
        clf_score = np.zeros(shape=(len(params), n_splits * n_repeats))
        clf_selected = np.zeros(shape=(len(params), n_splits * n_repeats))
        clf_dummy = np.zeros(shape=(len(params), n_splits * n_repeats))

        # Create splits for the experiments
        rskf = RepeatedStratifiedKFold(n_splits=n_splits,
                                       n_repeats=n_repeats,
                                       random_state=seed)
        seed_tqdm = tqdm(rskf.split(X, y),
                         leave=False,
                         total=n_splits * n_repeats,
                         bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}'
                         ) if verbose else rskf.split(X, y)

        # Execute experiments
        for i, data in enumerate(seed_tqdm):
            train_index, test_index = data
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Encode the data
            c = CustomOrdinalFeatureEncoder(n_intervals=n_intervals)
            X_train = c.fit_transform(X_train)
            X_test = c.transform(X_test)
            l = CustomLabelEncoder()
            y_train = l.fit_transform(y_train)
            y_test = l.transform(y_test)

            # Assess the classifiers reusing info to speed up evaluation
            nb.fit(X_train, y_train)
            naive_bayes_score = nb.score(X_test, y_test)

            # Reset evaluation-cache for new split
            clf.reset_evaluation()
            for conf_index, conf in enumerate(params):
                if verbose:
                    seed_tqdm.set_postfix({"config": conf_index})
                clf.set_params(**conf)
                clf.fit(X_train, y_train)

                # score
                genetic_score = clf.score(X_test, y_test)

                # Get data
                n_original_features = len(
                    list(
                        filter(
                            lambda x: isinstance(x, DummyFeatureConstructor),
                            clf.best_features)))
                n_selected = len(clf.best_features)

                # Update
                nb_score[conf_index, i] = naive_bayes_score
                clf_score[conf_index, i] = genetic_score
                clf_selected[conf_index, i] = n_selected
                clf_dummy[conf_index, i] = n_original_features

        # Insert to final result averaged metrics for this database
        for conf_index, conf in enumerate(params):
            row = [
                name, X.shape[1],
                np.mean(nb_score[conf_index]),
                np.std(nb_score[conf_index]),
                np.mean(clf_score[conf_index]),
                np.std(clf_score[conf_index]), conf,
                np.mean(clf_selected[conf_index]),
                np.mean(clf_dummy[conf_index])
            ]
            result.append(row)

    result = pd.DataFrame(result, columns=columns)
    if send_email:
        from tfg.utils import send_results
        send_results(f"GENETIC_{version}", email_data, result)
    return result
Example #7
0
class ACFCS(OptimizationMixin, TransformerMixin, ClassifierMixin,
            BaseEstimator):
    def __init__(self,
                 ants=10,
                 evaporation_rate=0.05,
                 intensification_factor=0.05,
                 alpha=1.0,
                 beta=0.0,
                 beta_evaporation_rate=0.05,
                 step=1,
                 iterations=100,
                 early_stopping=20,
                 update_strategy="best",
                 seed=None,
                 parallel=False,
                 save_features=False,
                 path=None,
                 filename=None,
                 verbose=0,
                 graph_strategy="mutual_info",
                 connections=2,
                 max_errors=0,
                 metric="accuracy",
                 use_initials=False,
                 final_selection="ALL",
                 encode_data=True):
        self.step = step
        self.ants = ants
        self.evaporation_rate = evaporation_rate
        self.intensification_factor = intensification_factor
        self.alpha = alpha
        self.beta = beta
        self.beta_evaporation_rate = beta_evaporation_rate
        self.iterations = iterations
        self.early_stopping = early_stopping
        self.seed = seed
        self.parallel = parallel
        self.save_features = save_features
        self.path = path
        self.filename = filename
        self.verbose = verbose
        self.graph_strategy = graph_strategy
        self.connections = connections
        self.metric = metric
        self.update_strategy = update_strategy
        self.use_initials = use_initials
        self.final_selection = final_selection
        self.encode_data = encode_data
        self.max_errors = max_errors
        allowed_graph_strategy = ("full", "mutual_info")
        if self.graph_strategy not in allowed_graph_strategy:
            raise ValueError(
                "Unknown graph strategy type: %s, expected one of %s." %
                (self.graph_strategy, allowed_graph_strategy))

        allowed_update_strategy = ("all", "best")
        if self.update_strategy not in allowed_update_strategy:
            raise ValueError(
                "Unknown graph strategy type: %s, expected one of %s." %
                (self.update_strategy, allowed_update_strategy))

        self.reset_cache()

    def reset_cache(self):
        self.cache_loo = dict()
        self.cache_heuristic = dict()

    def fit(self, X, y, init_graph=True):
        self.feature_encoder_ = CustomOrdinalFeatureEncoder()
        self.class_encoder_ = CustomLabelEncoder()

        self.categories_ = None
        if isinstance(X, pd.DataFrame):
            self.categories_ = X.columns
        if self.encode_data:
            X = self.feature_encoder_.fit_transform(X)
            y = self.class_encoder_.fit_transform(y)
        if init_graph:
            if self.graph_strategy == "full":
                #Full graph
                self.afg = AntFeatureGraph(seed=self.seed).compute_graph(
                    X, y, ("XOR", "OR", "AND"))
            else:
                #Pruned graph
                self.afg = AntFeatureGraphMI(
                    seed=self.seed,
                    connections=self.connections).compute_graph(
                        X, y, ("XOR", "OR", "AND"))
        else:
            self.afg.reset_pheromones()
        if self.verbose:
            print(f"Number of nodes: {len(self.afg.nodes)}")

        random.seed(self.seed)
        best_score = 0
        self.best_features = []
        iterations_without_improvement = 0
        iterator = tqdm(range(self.iterations)) if self.verbose else range(
            self.iterations)
        beta = self.beta
        distance_from_best = -1
        for iteration in iterator:
            if self.verbose:
                iterator.set_postfix({
                    "best_score":
                    best_score,
                    "n_features":
                    len(self.best_features),
                    "p_matrix_c":
                    len(self.afg.pheromone_construction),
                    "p_matrix_s":
                    len(self.afg.pheromone_selection),
                    "distance_from_best":
                    distance_from_best
                })
            ants = [
                Ant(ant_id=i,
                    alpha=self.alpha,
                    beta=beta,
                    metric=self.metric,
                    use_initials=self.use_initials,
                    cache_loo=self.cache_loo,
                    cache_heuristic=self.cache_heuristic,
                    step=self.step) for i in range(self.ants)
            ]
            beta *= (1 - self.beta_evaporation_rate)
            results = []
            for ant in ants:
                results.append(
                    ant.run(X=X,
                            y=y,
                            graph=self.afg,
                            random_generator=random,
                            parallel=self.parallel,
                            max_errors=self.max_errors))
            results = np.array(results)

            self.afg.update_pheromone_matrix_evaporation(self.evaporation_rate)
            distance_from_best = np.mean(np.abs(results - best_score))
            best_ant = np.argmax(results)
            if self.update_strategy == "best":
                ant = ants[best_ant]
                self.afg.intensify(ant.current_features,
                                   self.intensification_factor, 1,
                                   self.use_initials)
            else:
                for ant_score, ant in zip(results, ants):
                    self.afg.intensify(ant.current_features,
                                       self.intensification_factor, ant_score,
                                       self.use_initials)

            if results[best_ant] >= best_score:
                iterations_without_improvement = 0
                ant = ants[best_ant]
                best_score = results[best_ant]
                self.best_features = ant.current_features
            else:
                iterations_without_improvement += 1
                if iterations_without_improvement > self.early_stopping:
                    break

        self.classifier_ = NaiveBayes(encode_data=False, metric=self.metric)
        if self.final_selection == "BEST":
            pass
        else:
            #An ant traverses the graph deterministically to obtain the features
            final_ant = FinalAnt(ant_id=0,
                                 alpha=self.alpha,
                                 beta=beta,
                                 metric=self.metric,
                                 use_initials=self.use_initials,
                                 cache_loo=self.cache_loo,
                                 cache_heuristic=self.cache_heuristic,
                                 step=self.step)
            final_ant.run(X=X,
                          y=y,
                          graph=self.afg,
                          random_generator=random,
                          parallel=self.parallel)
            self.best_features = final_ant.current_features
        #Train model with final features
        self.classifier_.fit(
            np.concatenate(
                [feature.transform(X) for feature in self.best_features],
                axis=1), y)

        if self.save_features:
            #Save to features to dict
            translate_features(features=self.best_features,
                               feature_encoder=self.feature_encoder_,
                               categories=self.categories_,
                               path=self.path,
                               filename=self.filename)
        return self
Example #8
0
def scoring_comparison(base_path,datasets,verbose=1,test_size=0.3,seed=None,n_iterations=30):
    column_names = ["dataset",
                    "custom_training_score",
                    "custom_test_score",
                    "categorical_training_score",
                    "categorical_test_score"]
    data =[]
    clf_no_encoding = NaiveBayes(encode_data=True)
    clf_categorical_sklearn = CategoricalNB()
    
    datasets_iter = tqdm(datasets, bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}')
    c = CustomOrdinalFeatureEncoder()
    l = CustomLabelEncoder()
    
    for dataset in datasets_iter:
        dataset_name, label = dataset
        data_filename = f"{dataset_name}.data.csv"
        test_filename = f"{dataset_name}.test.csv"
        X, y = get_X_y_from_database(base_path=base_path,
                                     name = dataset_name,
                                     data = data_filename, 
                                     test = test_filename, 
                                     label = label)
        custom_train = []
        custom_test = []

        sklearn_train = []
        sklearn_test = []


        X  = c.fit_transform(X)
        y  = l.fit_transform(y)
        for iteration in range(n_iterations):
            if verbose:
                datasets_iter.set_postfix({"Dataset": dataset_name, "seed":iteration})
                datasets_iter.refresh()
            try:
                X_train,X_test,y_train,y_test = train_test_split(X,y,
                                                             test_size=test_size,
                                                             random_state=seed+iteration,
                                                             shuffle=True,
                                                             stratify=y)
            except:
                #Not enough values to stratify y
                X_train,X_test,y_train,y_test = train_test_split(X,y,
                                                                test_size=test_size,
                                                                random_state=seed+iteration,
                                                                shuffle=True
                                                                )
            #Fit
            clf_no_encoding.fit(X_train,y_train)
            clf_categorical_sklearn.min_categories = [1+np.max(np.concatenate([X_train[:,j],X_test[:,j]])) for j in range(X_train.shape[1])]
            clf_categorical_sklearn.fit(X_train,y_train)
            
            
            #Predict
            custom_train.append(clf_no_encoding.score(X_train,y_train))
            custom_test.append(clf_no_encoding.score(X_test,y_test))
            sklearn_train.append(clf_categorical_sklearn.score(X_train,y_train))
            sklearn_test.append(clf_categorical_sklearn.score(X_test,y_test))
        data.append([dataset_name,np.mean(custom_train),np.mean(custom_test),np.mean(sklearn_train),np.mean(sklearn_test)])
    return pd.DataFrame(data,columns = column_names)
Example #9
0
class NaiveBayes(ClassifierMixin, BaseEstimator):
    """A Naive Bayes classifier.

    Simple NaiveBayes classifier accepting non-encoded input, enhanced with numba using MAP
    to predict most likely class.

    Parameters
    ----------
    alpha : {float, array-like}, default=1.0
        Additive (Laplace/Lidstone) smoothing parameter
        (0 for no smoothing). If it is an array it is 
        expected to have the same size as number of attributes

    encode_data : bool, default=True
        Encode data when data is not encoded by default with an OrdinalEncoder

    discretize : bool, default=True
        Discretize numerical data

    n_intervals : int or None, default=5
        Discretize numerical data using the specified number of intervals

    Attributes
    ----------
    feature_encoder_ : CustomOrdinalFeatureEncoder or None
        Encodes data in ordinal way with unseen values handling if encode_data is set to True.

    class_encoder_ : LabelEncoder or None
        Encodes Data in ordinal way for the class if encode_data is set to True.

    n_samples_ : int
        Number of samples  

    n_features_ : int
        Number of features

    n_classes_ : int
        Number of classes

    class_values_ : array-like of shape (n_classes_,)
        Array containing the values of the classes, as ordinal encoding is assumed it will be an array
        ranging from 0 to largest value for the class

    class_count_ : array-like of shape (n_classes_,)
        Array where `class_count_[i]` contains the count of the ith class value. 

    class_log_count_ : array-like of shape (n_classes_,)
        Array where `class_count_[i]` contains the log count of the ith class value. 

    feature_values_count_per_element_ : array-like of shape (column_count,~)
        Array where `feature_values_count_per_element_[i]` is an array where `feature_values_count_per_element_[i][j]`
        contains the count of the jth value for the ith feature. Assuming ordinal encoding, some values might be equal to 0

    feature_values_count_ : array-like of shape (column_count,)
        Array where `feature_values_count_per_element_[i]` is an integer with the number of possible values for the ith feature.

    feature_unique_values_count_ : array-like of shape (column_count,)
        Array where `feature_unique_values_count_[i]` is an integer with the number of unique seen values for the ith feature at
        fitting time. This is needed to compute the smoothing.

    total_probability_ : array-like of shape (n_classes,)
        Smoothing factor to be applied to the prediction. Array where `total_probability_[i]` if equal to
        class_count_[i] + alpha*feature_unique_values_count_

    self.indepent_term_ : array-like of shape (n_classes,)
        Independent term computed at fitting time. It includes the smoothing factor to be applied to the prediction and 
        the apriori probability.

    probabilities_ : array-like of shape (column_count,~)
        Array where `feature_values_count_per_element_[i]` is an array  of shape (where `feature_values_count_per_element_[i][j]`
        contains the count of the jth value for the ith feature. Assuming ordinal encoding, some values might be equal to 0
    """
    def __init__(self,
                 alpha=1.0,
                 encode_data=True,
                 n_intervals=5,
                 discretize=True,
                 metric="accuracy"):
        self.alpha = alpha
        self.encode_data = encode_data
        self.n_intervals = n_intervals
        self.discretize = discretize
        self.metric = metric
        self._get_scorer()
        super().__init__()

    def _get_scorer(self):
        self.scorer = get_scorer(self.metric)
        if self.metric == "f1_score":  # Unseen values for target class may cause errors
            self.scorer = lambda y_true, y_pred: get_scorer(self.metric)(
                y_true=y_true, y_pred=y_pred, average="macro", zero_division=0)

    def set_params(self, **params):
        super().set_params(**params)
        self._get_scorer()

    def _compute_independent_terms(self):
        """Computes the terms that are indepent of the prediction"""
        self.total_probability_ = compute_total_probability_(
            self.class_count_, self.feature_unique_values_count_, self.alpha)
        # self.total_probability_ = compute_total_probability_(self.class_count_,self.feature_values_count_,self.alpha) #-->scikit uses this
        self.indepent_term_ = self.class_log_count_smoothed_ - self.total_probability_

    def _compute_class_counts(self, X: np.ndarray, y: np.ndarray):
        """Computes the counts for the priors"""
        self.n_classes_ = 1 + np.max(y)
        self.class_count_ = np.bincount(y)
        self.class_log_count_ = np.log(self.class_count_)
        self.class_count_smoothed_ = (self.class_count_ + self.alpha)
        self.class_log_count_smoothed_ = np.log(self.class_count_smoothed_)

    def _compute_feature_counts(self, X: np.ndarray, y: np.ndarray):
        """Computes the conditional smoothed counts for each feature"""
        tables = _get_tables(X, y, self.n_classes_, self.alpha)
        self.smoothed_counts_ = tables[0]
        self.smoothed_log_counts_ = tables[1]
        self.feature_values_count_ = tables[2]
        self.feature_values_count_per_element_ = tables[3]
        self.feature_unique_values_count_ = tables[4]

    def fit(self, X: np.ndarray, y: np.ndarray):
        """ Fits the classifier with trainning data.

        Parameters
        ----------

        X : array-like of shape (n_samples, n_features_)
            Training array that must be encoded unless
            encode_data is set to True

        y : array-like of shape (n_samples,)
            Label of the class associated to each sample.

        Returns
        -------
        self : object
        """
        if isinstance(y, pd.DataFrame):
            y = y.to_numpy()
        if self.encode_data:
            self.feature_encoder_ = CustomOrdinalFeatureEncoder(
                n_intervals=self.n_intervals, discretize=self.discretize)
            self.class_encoder_ = CustomLabelEncoder()
            X = self.feature_encoder_.fit_transform(X)
            y = self.class_encoder_.fit_transform(y)
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        check_X_y(X, y)
        if X.dtype != int:
            X = X.astype(int)
        if y.dtype != int:
            y = y.astype(int)
        self.n_samples_, self.n_features_ = X.shape
        self._compute_class_counts(X, y)
        self._compute_feature_counts(X, y)
        self._compute_independent_terms()
        return self

    def predict(self, X: np.ndarray):
        """ Predicts the label of the samples based on the MAP.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features_)
           Training array that must be encoded unless
           encode_data is set to True

        Returns
        -------
        y : array-like of shape (n_samples)
            Predicted label for each sample.
        """
        check_is_fitted(self)
        if self.encode_data:
            X = self.feature_encoder_.transform(X)
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        if X.dtype != int:
            X = X.astype(int)
        check_array(X)
        log_probabilities = _predict(X, self.smoothed_log_counts_,
                                     self.feature_values_count_, self.alpha)
        log_probabilities += self.indepent_term_
        output = np.argmax(log_probabilities, axis=1)
        if self.encode_data:
            output = self.class_encoder_.inverse_transform(output)
        return output

    def predict_proba(self, X: np.ndarray):
        """ Predicts the probability for each label of the samples based on the MAP.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features_)
           Training array that must be encoded unless
           encode_data is set to True

        Returns
        -------
        y : array-like of shape (n_classes,n_samples)
            Array where `y[i][j]` contains the MAP of the jth class for ith
            sample
        """
        check_is_fitted(self)
        if X.dtype != int:
            X = X.astype(int)
        if self.encode_data:
            X = self.feature_encoder_.transform(X)
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        log_probabilities = _predict(X, self.smoothed_log_counts_,
                                     self.feature_values_count_, self.alpha)
        log_probabilities += self.indepent_term_
        log_prob_x = logsumexp(log_probabilities, axis=1)
        return np.exp(log_probabilities - np.atleast_2d(log_prob_x).T)

    def leave_one_out_cross_val(self, X, y, fit=True):
        """Efficient LOO computation"""
        if fit:
            self.fit(X, y)
        if self.encode_data:
            X = self.feature_encoder_.transform(X)
            y = self.class_encoder_.transform(y)
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()

        if X.dtype != int:
            X = X.astype(int)
        if y.dtype != int:
            X = X.astype(int)
        log_alpha = np.log(self.alpha)
        log_proba = np.zeros((X.shape[0], self.n_classes_))
        for i in range(X.shape[0]):
            example, label = X[i], y[i]
            class_count_ = self.class_count_.copy()
            class_count_[label] -= 1
            log_proba[i] = np.log(class_count_ + self.alpha)
            for j in range(X.shape[1]):
                p = self.smoothed_log_counts_[j][example[j]].copy()
                p[label] = np.log(
                    np.max([
                        self.smoothed_counts_[j][example[j]][label] - 1,
                        self.alpha
                    ]))
                log_proba[i] += p
                if self.feature_values_count_per_element_[j][example[j]] == 1:
                    update_value = np.log(class_count_ + (
                        self.feature_unique_values_count_[j] - 1) * self.alpha)
                else:
                    update_value = np.log(
                        class_count_ +
                        (self.feature_unique_values_count_[j]) * self.alpha)
                log_proba[i] -= np.where(update_value == np.NINF, 0,
                                         update_value)
        y_pred = np.argmax(log_proba, axis=1)
        return self.scorer(y, y_pred)

    def add_features(self, X, y, index=None):
        """Updates classifier with new features

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features_)
           Training array that must be encoded unless
           encode_data is set to True

        y : array-like of shape (n_samples,)
            Label of the class associated to each sample.

        index: {None,array-like of shape (X.shape[1])}
                Indicates where to insert each new feature, if it is None
                they are all appended at the very end.
        Returns
        -------
        self : object
        """
        check_is_fitted(self)
        if isinstance(y, pd.DataFrame):
            y = y.to_numpy()
        if self.encode_data:
            # y should be the same than the one that was first fitted for now  ----> FUTURE IMPLEMENTATION
            y = self.class_encoder_.transform(y)
            X = self.feature_encoder_.add_features(X,
                                                   transform=True,
                                                   index=index)
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        check_X_y(X, y)
        if X.dtype != int:
            X = X.astype(int)
        if y.dtype != int:
            X = X.astype(int)

        self.n_features_ += X.shape[1]
        tables = _get_tables(X, y, self.n_classes_, self.alpha)
        new_smoothed_counts = tables[0]
        new_smoothed_log_counts = tables[1]
        new_feature_value_counts = tables[2]
        new_feature_value_counts_per_element = tables[3]
        new_feature_unique_values_count_ = tables[4]
        new_feature_contribution = compute_total_probability_(
            self.class_count_, new_feature_unique_values_count_, self.alpha)
        if index:
            sort_index = np.argsort(index)
            index_with_column = list(enumerate(index))
            for i in sort_index:
                column, list_insert_index = index_with_column[i]
                self.feature_values_count_per_element_.insert(
                    list_insert_index,
                    new_feature_value_counts_per_element[column])
                self.feature_values_count_ = np.insert(
                    self.feature_values_count_, list_insert_index,
                    new_feature_value_counts[column])
                self.smoothed_counts_.insert(list_insert_index,
                                             new_smoothed_counts[column])
                self.smoothed_log_counts_.insert(
                    list_insert_index, new_smoothed_log_counts[column])
                self.feature_unique_values_count_ = np.insert(
                    self.feature_unique_values_count_, list_insert_index,
                    new_feature_unique_values_count_[column])
        else:
            self.feature_values_count_per_element_.extend(
                new_feature_value_counts_per_element)
            self.feature_values_count_ = np.concatenate(
                [self.feature_values_count_, new_feature_value_counts])
            self.smoothed_counts_.extend(new_smoothed_counts)
            self.smoothed_log_counts_.extend(new_smoothed_log_counts)
            self.feature_unique_values_count_ = np.concatenate([
                self.feature_unique_values_count_,
                new_feature_unique_values_count_
            ])

        self.total_probability_ += new_feature_contribution
        self.indepent_term_ -= new_feature_contribution

        return self

    def remove_feature(self, index):
        """Updates classifierby removing one feature (index)"""
        check_is_fitted(self)
        if self.n_features_ <= 1:
            raise Exception("Cannot remove only feature from classifier")
        if not 0 <= index < self.n_features_:
            raise Exception(
                f"Feature index not valid, expected index between 0 and {self.n_features_}"
            )
        self.n_features_ -= 1

        feature_contribution = self.class_count_ + self.alpha * self.feature_unique_values_count_[
            index]
        feature_contribution = np.log(feature_contribution)
        self.total_probability_ -= feature_contribution
        self.indepent_term_ += feature_contribution

        self.feature_unique_values_count_ = np.delete(
            self.feature_unique_values_count_, index)
        self.feature_values_count_ = np.delete(self.feature_values_count_,
                                               index)
        del self.feature_values_count_per_element_[index]
        del self.smoothed_counts_[index]
        del self.smoothed_log_counts_[index]

        if self.encode_data:
            self.feature_encoder_.remove_feature(index)
        return self

    def score(self, X: np.ndarray, y: np.ndarray):
        """Computes the accuracy

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features_)
           Training array that must be encoded unless
           encode_data is set to True

        y : array-like of shape (n_samples,)
            Label of the class associated to each sample.
        Returns
        -------
        score : float
                Percentage of correctly classified instances
        """
        y_pred = self.predict(X)
        return self.scorer(y, y_pred)