Python CustomLabelEncoder.transform Examples

Programming Language: Python

Namespace/Package Name: tfg.encoder

Method/Function: transform

Examples at hotexamples.com: 5

Python CustomLabelEncoder.transform - 5 examples found. These are the top rated real world Python examples of tfg.encoder.CustomLabelEncoder.transform extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

CustomLabelEncoder(8)

fit_transform(8)

transform(5)

inverse_transform(2)

Example #1

Show file

def acfs_score_comparison(datasets,
                          seed,
                          base_path,
                          params,
                          n_splits=3,
                          n_repeats=5,
                          n_intervals=5,
                          metric="accuracy",
                          send_email=False,
                          email_data=dict(),
                          verbose=True):
    # List to store results and column names for the csv
    result = []
    columns = [
        "Database", "Number of attributes", "NBScore", "NBScore STD",
        "ACFCS Score", "ACFCS Score STD", "Configuration", "Nodes",
        "Contruction Matrix", "Selection Matrix", "Selected_attributes",
        "Original"
    ]
    dataset_tqdm = tqdm(datasets)

    # Instantiate the classifier
    acfcs = ACFCS(verbose=0, metric=metric)
    nb = NaiveBayes(encode_data=False, n_intervals=n_intervals, metric=metric)

    # Execute algorithm on datasets
    for database in dataset_tqdm:
        name, label = database
        if not os.path.exists(base_path + name):
            print(f"{name} doesnt' exist")
            continue
        # Assume UCI REPO like data
        test = f"{name}.test.csv"
        data = f"{name}.data.csv"
        X, y = get_X_y_from_database(base_path, name, data, test, label)

        # Update progressbar
        dataset_tqdm.set_postfix({"DATABASE": name})

        # Set up data structures to store results
        nb_score = np.zeros(shape=(len(params), n_splits * n_repeats))
        acfcs_score = np.zeros(shape=(len(params), n_splits * n_repeats))
        acfcs_selection_matrix = np.zeros(shape=(len(params),
                                                 n_splits * n_repeats))
        acfcs_construction_matrix = np.zeros(shape=(len(params),
                                                    n_splits * n_repeats))
        acfcs_nodes = np.zeros(shape=(len(params), n_splits * n_repeats))
        acfcs_dummy = np.zeros(shape=(len(params), n_splits * n_repeats))
        acfcs_selected = np.zeros(shape=(len(params), n_splits * n_repeats))

        # Create splits for the experiments
        rskf = RepeatedStratifiedKFold(n_splits=n_splits,
                                       n_repeats=n_repeats,
                                       random_state=seed)
        seed_tqdm = tqdm(rskf.split(X, y),
                         leave=False,
                         total=n_splits * n_repeats,
                         bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}'
                         ) if verbose else rskf.split(X, y)

        # Execute experiments
        for i, data in enumerate(seed_tqdm):
            train_index, test_index = data
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Encode the data
            c = CustomOrdinalFeatureEncoder(n_intervals=n_intervals)
            X_train = c.fit_transform(X_train)
            X_test = c.transform(X_test)
            l = CustomLabelEncoder()
            y_train = l.fit_transform(y_train)
            y_test = l.transform(y_test)

            # Assess the classifiers reusing info to speed up evaluation
            nb.fit(X_train, y_train)
            naive_bayes_score = nb.score(X_test, y_test)
            acfcs.reset_cache()
            for conf_index, conf in enumerate(params):
                acfcs.set_params(**conf)
                acfcs.fit(X_train, y_train, init_graph=conf_index == 0)

                # score
                acfcs_score_conf = acfcs.score(X_test, y_test)
                if verbose:
                    seed_tqdm.set_postfix({
                        "config": conf_index,
                        "nb_score": naive_bayes_score,
                        "ant_score": acfcs_score_conf
                    })

                # Get data
                n_original_features = len(
                    list(
                        filter(
                            lambda x: isinstance(x, DummyFeatureConstructor),
                            acfcs.best_features)))
                n_selected = len(acfcs.best_features)
                selection_matrix = len(acfcs.afg.pheromone_selection)
                construction_matrix = len(acfcs.afg.pheromone_construction)
                nodes = len(acfcs.afg.nodes)

                # Update
                nb_score[conf_index, i] = naive_bayes_score
                acfcs_score[conf_index, i] = acfcs_score_conf
                acfcs_selection_matrix[conf_index, i] = selection_matrix
                acfcs_construction_matrix[conf_index, i] = construction_matrix
                acfcs_nodes[conf_index, i] = nodes
                acfcs_dummy[conf_index, i] = n_original_features
                acfcs_selected[conf_index, i] = n_selected

        # Insert the final result - averaged metrics for this database.
        for conf_index, conf in enumerate(params):
            row = [
                name, X.shape[1],
                np.mean(nb_score[conf_index]),
                np.std(nb_score[conf_index]),
                np.mean(acfcs_score[conf_index]),
                np.std(acfcs_score[conf_index]), conf,
                np.mean(acfcs_nodes[conf_index]),
                np.mean(acfcs_construction_matrix[conf_index]),
                np.mean(acfcs_selection_matrix[conf_index]),
                np.mean(acfcs_selected[conf_index]),
                np.mean(acfcs_dummy[conf_index])
            ]
            result.append(row)
    result = pd.DataFrame(result, columns=columns)

    if send_email:
        from tfg.utils import send_results
        send_results("ACFCS", email_data, result)
    return result

Example #2

Show file

class RankerLogicalFeatureConstructor(TransformerMixin, ClassifierMixin,
                                      BaseEstimator):
    """First proposal: Hybrid-Ranker Wrapper.

    Build a ranking based on Symmetrical Uncertainty (SU) of every possible logical feature of depth 1
    (1 operator, 2 operands), using XOR, AND and OR operator. The steps are:
        - Find out combinations of values in database of every pair of features Xi, Xj:
            - Example: 
                - Xi = [1,2,3,2]
                - Xj = ['a','b','c','a']
                Possible combinations:
                    [(1,'a'),(2,'b'),(3,'c'),(2,'a')]
        - Apply operator to every combination:
            - Example: 
                - Xi = [1,2,3,2]
                - Xj = ['a','b','c','a']
                Possible combinations:
                    [(1,'a','AND'),(2,'b','AND'),(3,'c','AND'),(2,'a','AND'),
                    (1,'a','OR'),(2,'b','OR'),(3,'c','OR'),(2,'a','OR'),
                    (1,'a','XOR'),(2,'b','XOR'),(3,'c','XOR'),(2,'a','XOR')]
        - Add original variables to the list
        - Evaluate SU for every value in the list, and rank them
        - Go over the list following one of the two strategies proposed and evaluate 
          the subset based on a leave-one-out cross-validation with the NaiveBayes classifier.

    Parameters
    ----------
    strategy : str {eager,skip}
        After the ranking is built if the eager strategy is chosen we stop considering attributes
        when there is no improvement from one iteration to the next

    block_size : int, default=1
        Number of features that are added in each iteration

    encode_data : boolean
        Whether or not to encode the received data. If set to false the classifier 
        expects data to be encoded with an ordinal encoder.

    verbose : {boolean,int}
        If set to true it displays information of the remaining time 
        and inside variables.

    operators : array-like, deafult = ("XOR","AND","OR")
        Operators used for the constructed features.

    max_features : int, deafult = inf
        Maximum number of features to include in the selected subset

    max_iterations : int, deafult = inf
        Maximum number of iterations in the wrapper step.

    use_graph : bool, default = False 
        Generate Ranking from features obtained from the pruned-graph of the ACO algorithm.
        (Experimentation not carried out)

    use_initials: bool, default = False
        Force the set of initial features in the final solution. The set if trimmed with a backward elimination before-hand.

    Attributes
    ----------
    feature_encoder_ : CustomOrdinalFeatureEncoder or None
        Encodes data in ordinal way with unseen values handling if encode_data is set to True.

    class_encoder_ : LabelEncoder or None
        Encodes Data in ordinal way for the class if encode_data is set to True.

    all_feature_constructors: array-like
        List of FeatureConstructor objects with all the possible logical 
        features

    symmetrical_uncertainty_rank: array-like
        SU for every feature in all_feature_constructors

    rank : array-like
        Array of indexes corresponding to the sorted SU rank (in descending order).

    final_feature_constructors:
        Selected feature subset (list of constructors)

    classifier: NaiveBayes
        Classifier used in the wrapper and to perform predictions after fitting.

    """
    def __init__(self,
                 strategy="eager",
                 block_size=10,
                 encode_data=True,
                 n_intervals=5,
                 verbose=0,
                 operators=("AND", "OR", "XOR"),
                 max_features=float("inf"),
                 max_iterations=float("inf"),
                 metric="accuracy",
                 use_initials=False,
                 max_err=0,
                 prune=None,
                 use_graph=False):
        self.strategy = strategy
        self.block_size = max(block_size, 1)
        self.encode_data = encode_data
        self.verbose = verbose
        self.operators = operators
        self.max_features = max_features
        self.max_iterations = max_iterations
        self.n_intervals = n_intervals
        self.metric = metric
        self.max_err = max_err
        self.use_initials = use_initials
        self.prune = prune
        self.use_graph = use_graph

        allowed_strategies = ("eager", "skip")
        if self.strategy not in allowed_strategies:
            raise ValueError("Unknown operator type: %s, expected one of %s." %
                             (self.strategy, allowed_strategies))

    def fit(self, X, y):
        # Parse input
        if isinstance(y, pd.DataFrame):
            y = y.to_numpy()
        if self.encode_data:
            self.feature_encoder_ = CustomOrdinalFeatureEncoder(
                n_intervals=self.n_intervals)
            self.class_encoder_ = CustomLabelEncoder()
            X = self.feature_encoder_.fit_transform(X)
            y = self.class_encoder_.fit_transform(y)

        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        check_X_y(X, y)

        # Reset the stored results for new fit
        self.reset_evaluation()

        # Generate rank
        if self.use_graph:
            # Construct the minimum graph and create rank
            graph = AntFeatureGraphMI(seed=None, connections=1).compute_graph(
                X, y, ("AND", "OR", "XOR"))
            self.all_feature_constructors = graph.get_rank()
        elif self.prune is not None:
            # Construct the rank with pruning by selecting pais that maximise SU(X_iX_j,Y)
            feature_combinations = list(
                combinations(list(range(X.shape[1])),
                             2)) + [(i, i) for i in range(X.shape[1])]
            rank_pairs = [
                symmetrical_uncertainty_two_variables(X[:, i], X[:, j], y)
                for i, j in feature_combinations
            ]
            rank_pairs_index = np.argsort(rank_pairs)[::-1]

            # Create the unsorted list
            self.all_feature_constructors = []
            for index in rank_pairs_index[:self.prune]:
                i, j = feature_combinations[index]
                if i == j:
                    from tfg.feature_construction import create_feature
                    self.all_feature_constructors.extend([
                        create_feature("OR", [(i, n), (i, m)])
                        for n, m in combinations(np.unique(X[:, i]), 2)
                    ])
                else:
                    self.all_feature_constructors.extend(
                        construct_features(X[:, [i, j]],
                                           operators=self.operators,
                                           same_feature=False))
        else:
            # Create the unsorted list of all features
            self.all_feature_constructors = construct_features(
                X, operators=self.operators)
        if self.verbose:
            print(
                f"Total number of constructed features: {len(self.all_feature_constructors)}"
            )
        self.all_feature_constructors.extend(
            [DummyFeatureConstructor(j) for j in range(X.shape[1])])
        self.symmetrical_uncertainty_rank = []

        # Sort the ranking
        for feature_constructor in self.all_feature_constructors:
            feature = feature_constructor.transform(X)
            su = symmetrical_uncertainty(f1=feature.flatten(), f2=y)
            self.symmetrical_uncertainty_rank.append(su)

        # Store the descending order index
        self.rank = np.argsort(self.symmetrical_uncertainty_rank)[::-1]

        # If the initial variables are
        if self.use_initials:
            classifier = NaiveBayes(encode_data=False,
                                    n_intervals=self.n_intervals,
                                    metric=self.metric)
            classifier.fit(X, y)
            current_features = [
                DummyFeatureConstructor(j) for j in range(X.shape[1])
            ]

            # Store the backward result to reuse it for other executions
            self.initial_backward_features = backward_search(
                X, y, current_features, classifier)

        # Feature Subset Selection (FSS) from the rank
        self.filter_features(X, y)
        return self

    def predict(self, X):
        X, _ = self.transform(X)
        if self.encode_data:
            return self.class_encoder_.inverse_transform(
                self.classifier.predict(X))
        return self.classifier.predict(X)

    def reset_evaluation(self):
        # Reset the memoize evaluations
        self.evaluate_leave_one_out_cross_val = memoize(evaluate_leave_one_out)

    def predict_proba(self, X):
        X, _ = self.transform(X)
        return self.classifier.predict_proba(X)

    def score(self, X, y):
        X, y = self.transform(X, y)
        return self.classifier.score(X, y)

    def filter_features(self, X, y):
        '''After the rank is built this perform the greedy wrapper search'''
        check_is_fitted(self)
        self.classifier = NaiveBayes(encode_data=False,
                                     n_intervals=self.n_intervals,
                                     metric=self.metric)
        current_score = np.NINF
        first_iteration = True
        current_features = []
        current_data = None
        if self.use_initials:
            # Original Features have already been taken into account
            rank_iter = filter(
                lambda x: not isinstance(self.all_feature_constructors[x],
                                         DummyFeatureConstructor),
                iter(self.rank))

            # Deep copy to avoid issues when modifying the list
            current_features = deepcopy(self.initial_backward_features)
            current_data = np.concatenate(
                [f.transform(X) for f in current_features], axis=1)

            # Get initial LOO score
            current_score = self.evaluate_leave_one_out_cross_val(
                self.classifier, current_features, current_data, y, fit=True)
        else:
            # Iterator over the sorted list of indexes
            rank_iter = iter(self.rank)

        if self.verbose:
            progress_bar = tqdm(total=len(self.rank),
                                bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}')

        iteration = 0
        iterations_without_improvements = 0

        # Loop for including {block size} elements at a time
        # Rank is an iterator, so the for loop is not sequential!
        for feature_constructor_index in rank_iter:
            iteration += 1
            if self.verbose:
                progress_bar.set_postfix({
                    "n_features": len(current_features),
                    "score": current_score
                })
                progress_bar.update(1)
                progress_bar.refresh()

            # Add block size features
            new_X = [
                self.all_feature_constructors[feature_constructor_index].
                transform(X)
            ]
            selected_features = [
                self.all_feature_constructors[feature_constructor_index]
            ]
            for _ in range(self.block_size - 1):
                try:
                    index = next(rank_iter)
                    selected_features.append(
                        self.all_feature_constructors[index])
                    new_X.append(
                        self.all_feature_constructors[index].transform(X))
                    if self.verbose:
                        progress_bar.update(1)
                        progress_bar.refresh()
                except:
                    # Block size does not divide the number of elements in the rank. The search is halted
                    break

            # Evaluate features
            new_X = np.concatenate(new_X, axis=1)
            if iteration == 1 and not self.use_initials:
                current_data = new_X
                current_score = self.evaluate_leave_one_out_cross_val(
                    self.classifier,
                    selected_features,
                    current_data,
                    y,
                    fit=True)
                current_features = selected_features
                first_iteration = False
                if self.max_iterations <= iteration or (
                        len(current_features) +
                        self.block_size) > self.max_features:
                    break
                continue
            data = np.concatenate([current_data, new_X], axis=1)
            self.classifier.add_features(new_X, y)
            # LOO evaluation
            score = self.evaluate_leave_one_out_cross_val(self.classifier,
                                                          current_features +
                                                          selected_features,
                                                          data,
                                                          y,
                                                          fit=False)
            if score > current_score:
                current_score = score
                current_data = data
                current_features.extend(selected_features)
                iterations_without_improvements = 0
            else:
                iterations_without_improvements += 1
                # Remove last added block
                for feature_index_to_remove in range(
                        data.shape[1], data.shape[1] - new_X.shape[1], -1):
                    self.classifier.remove_feature(feature_index_to_remove - 1)
                if self.strategy == "eager" and self.max_err < iterations_without_improvements:
                    # Stops as soon as no impovement
                    break

            if self.max_iterations <= iteration or (
                    len(current_features) +
                    self.block_size) > self.max_features:
                break
        if self.verbose:
            progress_bar.close()
            print(
                f"\nFinal number of included features: {len(current_features)} - Final Score: {current_score}"
            )
        self.final_feature_constructors = current_features
        return self

    def transform(self, X, y=None):
        check_is_fitted(self)
        if isinstance(y, pd.DataFrame):
            y = y.to_numpy()
        if self.encode_data:
            X = self.feature_encoder_.transform(X)
            if y is not None:
                y = self.class_encoder_.transform(y)
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        new_X = []
        for feature_constructor in self.final_feature_constructors:
            new_X.append(feature_constructor.transform(X))
        return np.concatenate(new_X, axis=1), y

Example #3

Show file

def genetic_score_comparison(datasets,
                             seed,
                             base_path,
                             params,
                             n_splits=3,
                             n_repeats=5,
                             n_intervals=5,
                             metric="accuracy",
                             send_email=False,
                             email_data=dict(),
                             verbose=True,
                             version=1):
    result = []
    columns = [
        "Database", "Number of attributes", "NBScore", "NBScore STD",
        "Genetic Score", "Genetic Score STD", "Configuration",
        "Selected_attributes", "Original"
    ]

    dataset_tqdm = tqdm(datasets)

    # Instantiate the classifier
    if version == 1:
        # First Version - No flexibility in the number of attributes (bad performance)
        # clf = GeneticProgramming(seed=seed, metric=metric)
        clf = GeneticProgrammingFlexibleLogic(seed=seed, metric=metric)
    elif version == 2:
        # Version with flexibility
        clf = GeneticProgrammingFlexibleLogic(seed=seed, metric=metric)
    else:
        # Guided mutation based on SU
        clf = GeneticProgrammingRankMutation(seed=seed, metric=metric)
    nb = NaiveBayes(encode_data=False, n_intervals=n_intervals, metric=metric)

    # Execute algorithm on datasets
    for database in dataset_tqdm:
        name, label = database
        if not os.path.exists(base_path + name):
            print(f"{name} doesnt' exist")
            continue
        # Assume UCI REPO like data
        test = f"{name}.test.csv"
        data = f"{name}.data.csv"
        X, y = get_X_y_from_database(base_path, name, data, test, label)

        dataset_tqdm.set_postfix({"DATABASE": name})

        # Set up data structures to store results
        nb_score = np.zeros(shape=(len(params), n_splits * n_repeats))
        clf_score = np.zeros(shape=(len(params), n_splits * n_repeats))
        clf_selected = np.zeros(shape=(len(params), n_splits * n_repeats))
        clf_dummy = np.zeros(shape=(len(params), n_splits * n_repeats))

        # Create splits for the experiments
        rskf = RepeatedStratifiedKFold(n_splits=n_splits,
                                       n_repeats=n_repeats,
                                       random_state=seed)
        seed_tqdm = tqdm(rskf.split(X, y),
                         leave=False,
                         total=n_splits * n_repeats,
                         bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}'
                         ) if verbose else rskf.split(X, y)

        # Execute experiments
        for i, data in enumerate(seed_tqdm):
            train_index, test_index = data
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Encode the data
            c = CustomOrdinalFeatureEncoder(n_intervals=n_intervals)
            X_train = c.fit_transform(X_train)
            X_test = c.transform(X_test)
            l = CustomLabelEncoder()
            y_train = l.fit_transform(y_train)
            y_test = l.transform(y_test)

            # Assess the classifiers reusing info to speed up evaluation
            nb.fit(X_train, y_train)
            naive_bayes_score = nb.score(X_test, y_test)

            # Reset evaluation-cache for new split
            clf.reset_evaluation()
            for conf_index, conf in enumerate(params):
                if verbose:
                    seed_tqdm.set_postfix({"config": conf_index})
                clf.set_params(**conf)
                clf.fit(X_train, y_train)

                # score
                genetic_score = clf.score(X_test, y_test)

                # Get data
                n_original_features = len(
                    list(
                        filter(
                            lambda x: isinstance(x, DummyFeatureConstructor),
                            clf.best_features)))
                n_selected = len(clf.best_features)

                # Update
                nb_score[conf_index, i] = naive_bayes_score
                clf_score[conf_index, i] = genetic_score
                clf_selected[conf_index, i] = n_selected
                clf_dummy[conf_index, i] = n_original_features

        # Insert to final result averaged metrics for this database
        for conf_index, conf in enumerate(params):
            row = [
                name, X.shape[1],
                np.mean(nb_score[conf_index]),
                np.std(nb_score[conf_index]),
                np.mean(clf_score[conf_index]),
                np.std(clf_score[conf_index]), conf,
                np.mean(clf_selected[conf_index]),
                np.mean(clf_dummy[conf_index])
            ]
            result.append(row)

    result = pd.DataFrame(result, columns=columns)
    if send_email:
        from tfg.utils import send_results
        send_results(f"GENETIC_{version}", email_data, result)
    return result

Example #4

Show file

def ranker_score_comparison(datasets,
                            seed,
                            base_path,
                            params,
                            n_splits=3,
                            n_repeats=5,
                            n_intervals=5,
                            metric="accuracy",
                            send_email=False,
                            email_data=dict(),
                            share_rank=True):
    result = []
    columns = ["Database",
               "Number of attributes",
               "NBScore",
               "NBScore STD",
               "Ranker Score",
               "Ranker Score STD",
               "Configuration",
               "Combinations",
               "Selected_attributes",
               "Original"]

    dataset_tqdm = tqdm(datasets)

    # Instantiate the classifier
    r = RankerLogicalFeatureConstructor(n_intervals=n_intervals, metric=metric)
    nb = NaiveBayes(encode_data=False, n_intervals=n_intervals, metric=metric)

    # Execute algorithm on datasets
    for database in dataset_tqdm:
        name, label = database
        if not os.path.exists(base_path + name):
            print(f"{name} doesnt' exist")
            continue
        # Assume UCI REPO like data
        test = f"{name}.test.csv"
        data = f"{name}.data.csv"
        X, y = get_X_y_from_database(base_path, name, data, test, label)

        dataset_tqdm.set_postfix({"DATABASE": name})

        # Set up data structures to store results
        nb_score = np.zeros(shape=(len(params), n_splits*n_repeats))
        r_score = np.zeros(shape=(len(params), n_splits*n_repeats))
        r_combinations = np.zeros(shape=(len(params), n_splits*n_repeats))
        r_selected = np.zeros(shape=(len(params), n_splits*n_repeats))
        r_dummy = np.zeros(shape=(len(params), n_splits*n_repeats))
        r_total_constructed = np.zeros(shape=(len(params), n_splits*n_repeats))
        r_total_selected = np.zeros(shape=(len(params), n_splits*n_repeats))
        r_original_selected = np.zeros(shape=(len(params), n_splits*n_repeats))

        rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=seed)
        seed_tqdm = tqdm(rskf.split(X, y),
                         leave=False,
                         total=n_splits*n_repeats,
                         bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}')

        for i, data in enumerate(seed_tqdm):
            train_index, test_index = data
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            c = CustomOrdinalFeatureEncoder(n_intervals=n_intervals)
            X_train = c.fit_transform(X_train)
            X_test = c.transform(X_test)
            l = CustomLabelEncoder()
            y_train = l.fit_transform(y_train)
            y_test = l.transform(y_test)

            # Assess the classifiers
            nb.fit(X=X_train, y=y_train)
            naive_bayes_score = nb.score(X_test, y_test)

            for conf_index, conf in enumerate(params):
                seed_tqdm.set_postfix({"config": conf_index})
                r.set_params(**conf)
                # Fit
                if conf_index == 0 or not share_rank:
                    # The rank is computed from scratch
                    r.fit(X_train, y_train)
                else:
                    r.filter_features(r.feature_encoder_.transform(
                        X_train), r.class_encoder_.transform(y_train))

                # score
                ranker_score = r.score(X_test, y_test)

                # Get data
                n_original_features = len(list(filter(lambda x: isinstance(
                    x, DummyFeatureConstructor), r.final_feature_constructors)))
                n_combinations = len(r.all_feature_constructors)
                n_selected = len(r.final_feature_constructors)

                # Update
                nb_score[conf_index, i] = naive_bayes_score
                r_score[conf_index, i] = ranker_score
                r_combinations[conf_index, i] = n_combinations
                r_selected[conf_index, i] = n_selected
                r_dummy[conf_index, i] = n_original_features

        # Insert to final result averaged metrics for this dataset
        for conf_index, conf in enumerate(params):
            row = [name,
                   X.shape[1],
                   np.mean(nb_score[conf_index]),
                   np.std(nb_score[conf_index]),
                   np.mean(r_score[conf_index]),
                   np.std(r_score[conf_index]),
                   conf,
                   np.mean(r_combinations[conf_index]),
                   np.mean(r_selected[conf_index]),
                   np.mean(r_dummy[conf_index])]
            result.append(row)
    result = pd.DataFrame(result, columns=columns)
    if send_email:
        from tfg.utils import send_results
        send_results("RANKER", email_data, result)
    return result

Example #5

Show file

class NaiveBayes(ClassifierMixin, BaseEstimator):
    """A Naive Bayes classifier.

    Simple NaiveBayes classifier accepting non-encoded input, enhanced with numba using MAP
    to predict most likely class.

    Parameters
    ----------
    alpha : {float, array-like}, default=1.0
        Additive (Laplace/Lidstone) smoothing parameter
        (0 for no smoothing). If it is an array it is 
        expected to have the same size as number of attributes

    encode_data : bool, default=True
        Encode data when data is not encoded by default with an OrdinalEncoder

    discretize : bool, default=True
        Discretize numerical data

    n_intervals : int or None, default=5
        Discretize numerical data using the specified number of intervals

    Attributes
    ----------
    feature_encoder_ : CustomOrdinalFeatureEncoder or None
        Encodes data in ordinal way with unseen values handling if encode_data is set to True.

    class_encoder_ : LabelEncoder or None
        Encodes Data in ordinal way for the class if encode_data is set to True.

    n_samples_ : int
        Number of samples  

    n_features_ : int
        Number of features

    n_classes_ : int
        Number of classes

    class_values_ : array-like of shape (n_classes_,)
        Array containing the values of the classes, as ordinal encoding is assumed it will be an array
        ranging from 0 to largest value for the class

    class_count_ : array-like of shape (n_classes_,)
        Array where `class_count_[i]` contains the count of the ith class value. 

    class_log_count_ : array-like of shape (n_classes_,)
        Array where `class_count_[i]` contains the log count of the ith class value. 

    feature_values_count_per_element_ : array-like of shape (column_count,~)
        Array where `feature_values_count_per_element_[i]` is an array where `feature_values_count_per_element_[i][j]`
        contains the count of the jth value for the ith feature. Assuming ordinal encoding, some values might be equal to 0

    feature_values_count_ : array-like of shape (column_count,)
        Array where `feature_values_count_per_element_[i]` is an integer with the number of possible values for the ith feature.

    feature_unique_values_count_ : array-like of shape (column_count,)
        Array where `feature_unique_values_count_[i]` is an integer with the number of unique seen values for the ith feature at
        fitting time. This is needed to compute the smoothing.

    total_probability_ : array-like of shape (n_classes,)
        Smoothing factor to be applied to the prediction. Array where `total_probability_[i]` if equal to
        class_count_[i] + alpha*feature_unique_values_count_

    self.indepent_term_ : array-like of shape (n_classes,)
        Independent term computed at fitting time. It includes the smoothing factor to be applied to the prediction and 
        the apriori probability.

    probabilities_ : array-like of shape (column_count,~)
        Array where `feature_values_count_per_element_[i]` is an array  of shape (where `feature_values_count_per_element_[i][j]`
        contains the count of the jth value for the ith feature. Assuming ordinal encoding, some values might be equal to 0
    """
    def __init__(self,
                 alpha=1.0,
                 encode_data=True,
                 n_intervals=5,
                 discretize=True,
                 metric="accuracy"):
        self.alpha = alpha
        self.encode_data = encode_data
        self.n_intervals = n_intervals
        self.discretize = discretize
        self.metric = metric
        self._get_scorer()
        super().__init__()

    def _get_scorer(self):
        self.scorer = get_scorer(self.metric)
        if self.metric == "f1_score":  # Unseen values for target class may cause errors
            self.scorer = lambda y_true, y_pred: get_scorer(self.metric)(
                y_true=y_true, y_pred=y_pred, average="macro", zero_division=0)

    def set_params(self, **params):
        super().set_params(**params)
        self._get_scorer()

    def _compute_independent_terms(self):
        """Computes the terms that are indepent of the prediction"""
        self.total_probability_ = compute_total_probability_(
            self.class_count_, self.feature_unique_values_count_, self.alpha)
        # self.total_probability_ = compute_total_probability_(self.class_count_,self.feature_values_count_,self.alpha) #-->scikit uses this
        self.indepent_term_ = self.class_log_count_smoothed_ - self.total_probability_

    def _compute_class_counts(self, X: np.ndarray, y: np.ndarray):
        """Computes the counts for the priors"""
        self.n_classes_ = 1 + np.max(y)
        self.class_count_ = np.bincount(y)
        self.class_log_count_ = np.log(self.class_count_)
        self.class_count_smoothed_ = (self.class_count_ + self.alpha)
        self.class_log_count_smoothed_ = np.log(self.class_count_smoothed_)

    def _compute_feature_counts(self, X: np.ndarray, y: np.ndarray):
        """Computes the conditional smoothed counts for each feature"""
        tables = _get_tables(X, y, self.n_classes_, self.alpha)
        self.smoothed_counts_ = tables[0]
        self.smoothed_log_counts_ = tables[1]
        self.feature_values_count_ = tables[2]
        self.feature_values_count_per_element_ = tables[3]
        self.feature_unique_values_count_ = tables[4]

    def fit(self, X: np.ndarray, y: np.ndarray):
        """ Fits the classifier with trainning data.

        Parameters
        ----------

        X : array-like of shape (n_samples, n_features_)
            Training array that must be encoded unless
            encode_data is set to True

        y : array-like of shape (n_samples,)
            Label of the class associated to each sample.

        Returns
        -------
        self : object
        """
        if isinstance(y, pd.DataFrame):
            y = y.to_numpy()
        if self.encode_data:
            self.feature_encoder_ = CustomOrdinalFeatureEncoder(
                n_intervals=self.n_intervals, discretize=self.discretize)
            self.class_encoder_ = CustomLabelEncoder()
            X = self.feature_encoder_.fit_transform(X)
            y = self.class_encoder_.fit_transform(y)
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        check_X_y(X, y)
        if X.dtype != int:
            X = X.astype(int)
        if y.dtype != int:
            y = y.astype(int)
        self.n_samples_, self.n_features_ = X.shape
        self._compute_class_counts(X, y)
        self._compute_feature_counts(X, y)
        self._compute_independent_terms()
        return self

    def predict(self, X: np.ndarray):
        """ Predicts the label of the samples based on the MAP.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features_)
           Training array that must be encoded unless
           encode_data is set to True

        Returns
        -------
        y : array-like of shape (n_samples)
            Predicted label for each sample.
        """
        check_is_fitted(self)
        if self.encode_data:
            X = self.feature_encoder_.transform(X)
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        if X.dtype != int:
            X = X.astype(int)
        check_array(X)
        log_probabilities = _predict(X, self.smoothed_log_counts_,
                                     self.feature_values_count_, self.alpha)
        log_probabilities += self.indepent_term_
        output = np.argmax(log_probabilities, axis=1)
        if self.encode_data:
            output = self.class_encoder_.inverse_transform(output)
        return output

    def predict_proba(self, X: np.ndarray):
        """ Predicts the probability for each label of the samples based on the MAP.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features_)
           Training array that must be encoded unless
           encode_data is set to True

        Returns
        -------
        y : array-like of shape (n_classes,n_samples)
            Array where `y[i][j]` contains the MAP of the jth class for ith
            sample
        """
        check_is_fitted(self)
        if X.dtype != int:
            X = X.astype(int)
        if self.encode_data:
            X = self.feature_encoder_.transform(X)
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        log_probabilities = _predict(X, self.smoothed_log_counts_,
                                     self.feature_values_count_, self.alpha)
        log_probabilities += self.indepent_term_
        log_prob_x = logsumexp(log_probabilities, axis=1)
        return np.exp(log_probabilities - np.atleast_2d(log_prob_x).T)

    def leave_one_out_cross_val(self, X, y, fit=True):
        """Efficient LOO computation"""
        if fit:
            self.fit(X, y)
        if self.encode_data:
            X = self.feature_encoder_.transform(X)
            y = self.class_encoder_.transform(y)
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()

        if X.dtype != int:
            X = X.astype(int)
        if y.dtype != int:
            X = X.astype(int)
        log_alpha = np.log(self.alpha)
        log_proba = np.zeros((X.shape[0], self.n_classes_))
        for i in range(X.shape[0]):
            example, label = X[i], y[i]
            class_count_ = self.class_count_.copy()
            class_count_[label] -= 1
            log_proba[i] = np.log(class_count_ + self.alpha)
            for j in range(X.shape[1]):
                p = self.smoothed_log_counts_[j][example[j]].copy()
                p[label] = np.log(
                    np.max([
                        self.smoothed_counts_[j][example[j]][label] - 1,
                        self.alpha
                    ]))
                log_proba[i] += p
                if self.feature_values_count_per_element_[j][example[j]] == 1:
                    update_value = np.log(class_count_ + (
                        self.feature_unique_values_count_[j] - 1) * self.alpha)
                else:
                    update_value = np.log(
                        class_count_ +
                        (self.feature_unique_values_count_[j]) * self.alpha)
                log_proba[i] -= np.where(update_value == np.NINF, 0,
                                         update_value)
        y_pred = np.argmax(log_proba, axis=1)
        return self.scorer(y, y_pred)

    def add_features(self, X, y, index=None):
        """Updates classifier with new features

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features_)
           Training array that must be encoded unless
           encode_data is set to True

        y : array-like of shape (n_samples,)
            Label of the class associated to each sample.

        index: {None,array-like of shape (X.shape[1])}
                Indicates where to insert each new feature, if it is None
                they are all appended at the very end.
        Returns
        -------
        self : object
        """
        check_is_fitted(self)
        if isinstance(y, pd.DataFrame):
            y = y.to_numpy()
        if self.encode_data:
            # y should be the same than the one that was first fitted for now  ----> FUTURE IMPLEMENTATION
            y = self.class_encoder_.transform(y)
            X = self.feature_encoder_.add_features(X,
                                                   transform=True,
                                                   index=index)
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        check_X_y(X, y)
        if X.dtype != int:
            X = X.astype(int)
        if y.dtype != int:
            X = X.astype(int)

        self.n_features_ += X.shape[1]
        tables = _get_tables(X, y, self.n_classes_, self.alpha)
        new_smoothed_counts = tables[0]
        new_smoothed_log_counts = tables[1]
        new_feature_value_counts = tables[2]
        new_feature_value_counts_per_element = tables[3]
        new_feature_unique_values_count_ = tables[4]
        new_feature_contribution = compute_total_probability_(
            self.class_count_, new_feature_unique_values_count_, self.alpha)
        if index:
            sort_index = np.argsort(index)
            index_with_column = list(enumerate(index))
            for i in sort_index:
                column, list_insert_index = index_with_column[i]
                self.feature_values_count_per_element_.insert(
                    list_insert_index,
                    new_feature_value_counts_per_element[column])
                self.feature_values_count_ = np.insert(
                    self.feature_values_count_, list_insert_index,
                    new_feature_value_counts[column])
                self.smoothed_counts_.insert(list_insert_index,
                                             new_smoothed_counts[column])
                self.smoothed_log_counts_.insert(
                    list_insert_index, new_smoothed_log_counts[column])
                self.feature_unique_values_count_ = np.insert(
                    self.feature_unique_values_count_, list_insert_index,
                    new_feature_unique_values_count_[column])
        else:
            self.feature_values_count_per_element_.extend(
                new_feature_value_counts_per_element)
            self.feature_values_count_ = np.concatenate(
                [self.feature_values_count_, new_feature_value_counts])
            self.smoothed_counts_.extend(new_smoothed_counts)
            self.smoothed_log_counts_.extend(new_smoothed_log_counts)
            self.feature_unique_values_count_ = np.concatenate([
                self.feature_unique_values_count_,
                new_feature_unique_values_count_
            ])

        self.total_probability_ += new_feature_contribution
        self.indepent_term_ -= new_feature_contribution

        return self

    def remove_feature(self, index):
        """Updates classifierby removing one feature (index)"""
        check_is_fitted(self)
        if self.n_features_ <= 1:
            raise Exception("Cannot remove only feature from classifier")
        if not 0 <= index < self.n_features_:
            raise Exception(
                f"Feature index not valid, expected index between 0 and {self.n_features_}"
            )
        self.n_features_ -= 1

        feature_contribution = self.class_count_ + self.alpha * self.feature_unique_values_count_[
            index]
        feature_contribution = np.log(feature_contribution)
        self.total_probability_ -= feature_contribution
        self.indepent_term_ += feature_contribution

        self.feature_unique_values_count_ = np.delete(
            self.feature_unique_values_count_, index)
        self.feature_values_count_ = np.delete(self.feature_values_count_,
                                               index)
        del self.feature_values_count_per_element_[index]
        del self.smoothed_counts_[index]
        del self.smoothed_log_counts_[index]

        if self.encode_data:
            self.feature_encoder_.remove_feature(index)
        return self

    def score(self, X: np.ndarray, y: np.ndarray):
        """Computes the accuracy

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features_)
           Training array that must be encoded unless
           encode_data is set to True

        y : array-like of shape (n_samples,)
            Label of the class associated to each sample.
        Returns
        -------
        score : float
                Percentage of correctly classified instances
        """
        y_pred = self.predict(X)
        return self.scorer(y, y_pred)