Example #1
import numpy as np
from sklearn.datasets import make_classification
# NOTE: the import path for CustomNaiveBayes is assumed from the tfg package layout
from tfg.naive_bayes import CustomNaiveBayes


def test_remove_feature():

    X, y = make_classification(n_samples=1000,
                               n_features=100,
                               n_informative=2,
                               n_redundant=0,
                               n_repeated=0,
                               n_classes=2,
                               n_clusters_per_class=1,
                               weights=None,
                               class_sep=1.0,
                               hypercube=True,
                               scale=2.0,
                               shuffle=True,
                               random_state=0)
    nb = CustomNaiveBayes(encode_data=True)
    nb.fit(X, y)

    # Remove the first feature incrementally and keep the updated internals
    nb.remove_feature(0)
    independent = nb.indepent_term_
    smoothed_log_counts_ = nb.smoothed_log_counts_
    removed = nb.predict_proba(np.delete(X, 0, axis=1))

    # Refit from scratch without the first feature: both models must agree
    nb.fit(np.delete(X, 0, axis=1), y)
    og = nb.predict_proba(np.delete(X, 0, axis=1))
    assert np.allclose(nb.smoothed_log_counts_, smoothed_log_counts_)
    assert np.allclose(nb.indepent_term_, independent)
    assert np.allclose(og, removed)
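
The invariant exercised by this test holds for any categorical Naive Bayes: the class-conditional log-likelihood factorises over features, so a feature's contribution can be dropped from a fitted model without refitting. A minimal, self-contained sketch of the idea (toy code with illustrative names such as fit_counts, not the tfg implementation):

import numpy as np

def fit_counts(X, y, alpha=1.0):
    # Laplace-smoothed log P(x_f = v | y = c) for every feature f, value v, class c
    classes = np.unique(y)
    priors = {c: np.log(np.mean(y == c)) for c in classes}
    log_cond = []
    for f in range(X.shape[1]):
        values = np.unique(X[:, f])
        table = {}
        for c in classes:
            col = X[y == c, f]
            for v in values:
                table[(v, c)] = np.log((np.sum(col == v) + alpha)
                                       / (len(col) + alpha * len(values)))
        log_cond.append(table)
    return classes, priors, log_cond

def joint_log_likelihood(X, classes, priors, log_cond):
    # log P(y = c) + sum_f log P(x_f | y = c), one column per class
    jll = np.empty((X.shape[0], len(classes)))
    for i, c in enumerate(classes):
        jll[:, i] = priors[c] + sum(
            np.array([log_cond[f][(v, c)] for v in X[:, f]])
            for f in range(X.shape[1]))
    return jll

rng = np.random.default_rng(0)
X = rng.integers(0, 3, size=(50, 4))
y = rng.integers(0, 2, size=50)

model = fit_counts(X, y)
# "Removing" feature 0 just drops its conditional table ...
removed = (model[0], model[1], model[2][1:])
# ... and matches a model refitted on the data without column 0
refit = fit_counts(X[:, 1:], y)
assert np.allclose(joint_log_likelihood(X[:, 1:], *removed),
                   joint_log_likelihood(X[:, 1:], *refit))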
Example #2
class RankerLogicalFeatureConstructor(TransformerMixin, ClassifierMixin,
                                      BaseEstimator):
    """First proposal: Hybrid-Ranker Wrapper.

    Build a ranking based on Symmetrical Uncertainty (SU) of every possible logical feature of depth 1
    (1 operator, 2 operands), using XOR, AND and OR operator. The steps are:
        - Find out combinations of values in database of every pair of features Xi, Xj:
            - Example: 
                - Xi = [1,2,3,2]
                - Xj = ['a','b','c','a']
                Possible combinations:
                    [(1,'a'),(2,'b'),(3,'c'),(2,'a')]
        - Apply operator to every combination:
            - Example: 
                - Xi = [1,2,3,2]
                - Xj = ['a','b','c','a']
                Possible combinations:
                    [(1,'a','AND'),(2,'b','AND'),(3,'c','AND'),(2,'a','AND'),
                    (1,'a','OR'),(2,'b','OR'),(3,'c','OR'),(2,'a','OR'),
                    (1,'a','XOR'),(2,'b','XOR'),(3,'c','XOR'),(2,'a','XOR')]
        - Add original variables to the list
        - Evaluate SU for every value in the list, and rank them
        - Go over the list following one of the two strategies proposed and evaluate 
          the subset based on a leave-one-out cross-validation with the NaiveBayes classifier.

    Parameters
    ----------
    strategy : str {"eager", "skip"}
        Strategy followed during the wrapper stage. With "eager" the search stops
        as soon as adding features yields no improvement (tolerating up to max_err
        consecutive failures); with "skip" the whole ranking is traversed.

    block_size : int, default=10
        Number of features added in each iteration.

    encode_data : boolean
        Whether or not to encode the received data. If set to False, the classifier
        expects the data to be encoded with an ordinal encoder.

    n_intervals : int, default=5
        Number of intervals used by the ordinal encoder when discretising
        continuous features.

    verbose : {boolean, int}
        If set to True, displays information on the remaining time
        and internal variables.

    operators : array-like, default=("AND", "OR", "XOR")
        Operators used for the constructed features.

    max_features : int, default=inf
        Maximum number of features to include in the selected subset.

    max_iterations : int, default=inf
        Maximum number of iterations in the wrapper step.

    metric : str, default="accuracy"
        Metric optimised by the wrapped NaiveBayes classifier.

    max_err : int, default=0
        Number of consecutive iterations without improvement tolerated by the
        "eager" strategy before halting.

    prune : int or None, default=None
        If set, only the `prune` top-ranked feature pairs (by SU of the pair with
        the class) are expanded into logical features.

    use_graph : bool, default=False
        Generate the ranking from the features obtained from the pruned graph of
        the ACO algorithm. (Experimentation not carried out.)

    use_initials : bool, default=False
        Force the set of initial features into the final solution. The set is
        trimmed with a backward elimination beforehand.

    Attributes
    ----------
    feature_encoder_ : CustomOrdinalFeatureEncoder or None
        Encodes data in ordinal way with unseen values handling if encode_data is set to True.

    class_encoder_ : LabelEncoder or None
        Encodes Data in ordinal way for the class if encode_data is set to True.

    all_feature_constructors : array-like
        List of FeatureConstructor objects with all the possible logical
        features.

    symmetrical_uncertainty_rank : array-like
        SU for every feature in all_feature_constructors.

    rank : array-like
        Array of indices corresponding to the sorted SU rank (in descending order).

    final_feature_constructors : array-like
        Selected feature subset (list of constructors).

    classifier : NaiveBayes
        Classifier used in the wrapper and to perform predictions after fitting.

    """
    def __init__(self,
                 strategy="eager",
                 block_size=10,
                 encode_data=True,
                 n_intervals=5,
                 verbose=0,
                 operators=("AND", "OR", "XOR"),
                 max_features=float("inf"),
                 max_iterations=float("inf"),
                 metric="accuracy",
                 use_initials=False,
                 max_err=0,
                 prune=None,
                 use_graph=False):
        self.strategy = strategy
        self.block_size = max(block_size, 1)
        self.encode_data = encode_data
        self.verbose = verbose
        self.operators = operators
        self.max_features = max_features
        self.max_iterations = max_iterations
        self.n_intervals = n_intervals
        self.metric = metric
        self.max_err = max_err
        self.use_initials = use_initials
        self.prune = prune
        self.use_graph = use_graph

        allowed_strategies = ("eager", "skip")
        if self.strategy not in allowed_strategies:
            raise ValueError("Unknown operator type: %s, expected one of %s." %
                             (self.strategy, allowed_strategies))

    def fit(self, X, y):
        # Parse input
        if isinstance(y, pd.DataFrame):
            y = y.to_numpy()
        if self.encode_data:
            self.feature_encoder_ = CustomOrdinalFeatureEncoder(
                n_intervals=self.n_intervals)
            self.class_encoder_ = CustomLabelEncoder()
            X = self.feature_encoder_.fit_transform(X)
            y = self.class_encoder_.fit_transform(y)

        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        check_X_y(X, y)

        # Reset the stored results for new fit
        self.reset_evaluation()

        # Generate rank
        if self.use_graph:
            # Construct the minimum graph and create rank
            graph = AntFeatureGraphMI(seed=None, connections=1).compute_graph(
                X, y, ("AND", "OR", "XOR"))
            self.all_feature_constructors = graph.get_rank()
        elif self.prune is not None:
            # Construct the rank with pruning, keeping pairs that maximise SU(X_i X_j, Y)
            feature_combinations = list(
                combinations(list(range(X.shape[1])),
                             2)) + [(i, i) for i in range(X.shape[1])]
            rank_pairs = [
                symmetrical_uncertainty_two_variables(X[:, i], X[:, j], y)
                for i, j in feature_combinations
            ]
            rank_pairs_index = np.argsort(rank_pairs)[::-1]

            # Create the unsorted list
            self.all_feature_constructors = []
            for index in rank_pairs_index[:self.prune]:
                i, j = feature_combinations[index]
                if i == j:
                    from tfg.feature_construction import create_feature
                    self.all_feature_constructors.extend([
                        create_feature("OR", [(i, n), (i, m)])
                        for n, m in combinations(np.unique(X[:, i]), 2)
                    ])
                else:
                    self.all_feature_constructors.extend(
                        construct_features(X[:, [i, j]],
                                           operators=self.operators,
                                           same_feature=False))
        else:
            # Create the unsorted list of all features
            self.all_feature_constructors = construct_features(
                X, operators=self.operators)
        if self.verbose:
            print(
                f"Total number of constructed features: {len(self.all_feature_constructors)}"
            )
        self.all_feature_constructors.extend(
            [DummyFeatureConstructor(j) for j in range(X.shape[1])])
        self.symmetrical_uncertainty_rank = []

        # Evaluate SU for every candidate feature
        for feature_constructor in self.all_feature_constructors:
            feature = feature_constructor.transform(X)
            su = symmetrical_uncertainty(f1=feature.flatten(), f2=y)
            self.symmetrical_uncertainty_rank.append(su)

        # Store the descending order index
        self.rank = np.argsort(self.symmetrical_uncertainty_rank)[::-1]

        # If the initial variables are forced into the solution (use_initials),
        # trim them with a backward elimination and cache the result
        if self.use_initials:
            classifier = NaiveBayes(encode_data=False,
                                    n_intervals=self.n_intervals,
                                    metric=self.metric)
            classifier.fit(X, y)
            current_features = [
                DummyFeatureConstructor(j) for j in range(X.shape[1])
            ]

            # Store the backward result to reuse it for other executions
            self.initial_backward_features = backward_search(
                X, y, current_features, classifier)

        # Feature Subset Selection (FSS) from the rank
        self.filter_features(X, y)
        return self

    def predict(self, X):
        X, _ = self.transform(X)
        if self.encode_data:
            return self.class_encoder_.inverse_transform(
                self.classifier.predict(X))
        return self.classifier.predict(X)

    def reset_evaluation(self):
        # Reset the memoize evaluations
        self.evaluate_leave_one_out_cross_val = memoize(evaluate_leave_one_out)

    def predict_proba(self, X):
        X, _ = self.transform(X)
        return self.classifier.predict_proba(X)

    def score(self, X, y):
        X, y = self.transform(X, y)
        return self.classifier.score(X, y)

    def filter_features(self, X, y):
        '''After the rank is built, perform the greedy wrapper search over it.'''
        check_is_fitted(self)
        self.classifier = NaiveBayes(encode_data=False,
                                     n_intervals=self.n_intervals,
                                     metric=self.metric)
        current_score = -np.inf
        current_features = []
        current_data = None
        if self.use_initials:
            # Original Features have already been taken into account
            rank_iter = filter(
                lambda x: not isinstance(self.all_feature_constructors[x],
                                         DummyFeatureConstructor),
                iter(self.rank))

            # Deep copy to avoid issues when modifying the list
            current_features = deepcopy(self.initial_backward_features)
            current_data = np.concatenate(
                [f.transform(X) for f in current_features], axis=1)

            # Get initial LOO score
            current_score = self.evaluate_leave_one_out_cross_val(
                self.classifier, current_features, current_data, y, fit=True)
        else:
            # Iterator over the sorted list of indexes
            rank_iter = iter(self.rank)

        if self.verbose:
            progress_bar = tqdm(total=len(self.rank),
                                bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}')

        iteration = 0
        iterations_without_improvements = 0

        # Loop including block_size elements at a time.
        # rank_iter is also advanced with next() inside the body, so each pass
        # of this for loop may consume several ranked features.
        for feature_constructor_index in rank_iter:
            iteration += 1
            if self.verbose:
                progress_bar.set_postfix({
                    "n_features": len(current_features),
                    "score": current_score
                })
                progress_bar.update(1)
                progress_bar.refresh()

            # Add block size features
            new_X = [
                self.all_feature_constructors[feature_constructor_index].
                transform(X)
            ]
            selected_features = [
                self.all_feature_constructors[feature_constructor_index]
            ]
            for _ in range(self.block_size - 1):
                try:
                    index = next(rank_iter)
                    selected_features.append(
                        self.all_feature_constructors[index])
                    new_X.append(
                        self.all_feature_constructors[index].transform(X))
                    if self.verbose:
                        progress_bar.update(1)
                        progress_bar.refresh()
                except StopIteration:
                    # block_size does not divide the rank length; no candidates left
                    break

            # Evaluate features
            new_X = np.concatenate(new_X, axis=1)
            if iteration == 1 and not self.use_initials:
                current_data = new_X
                current_score = self.evaluate_leave_one_out_cross_val(
                    self.classifier,
                    selected_features,
                    current_data,
                    y,
                    fit=True)
                current_features = selected_features
                if self.max_iterations <= iteration or (
                        len(current_features) +
                        self.block_size) > self.max_features:
                    break
                continue
            data = np.concatenate([current_data, new_X], axis=1)
            self.classifier.add_features(new_X, y)
            # LOO evaluation
            score = self.evaluate_leave_one_out_cross_val(self.classifier,
                                                          current_features +
                                                          selected_features,
                                                          data,
                                                          y,
                                                          fit=False)
            if score > current_score:
                current_score = score
                current_data = data
                current_features.extend(selected_features)
                iterations_without_improvements = 0
            else:
                iterations_without_improvements += 1
                # Remove last added block
                for feature_index_to_remove in range(
                        data.shape[1], data.shape[1] - new_X.shape[1], -1):
                    self.classifier.remove_feature(feature_index_to_remove - 1)
                if self.strategy == "eager" and self.max_err < iterations_without_improvements:
                    # Stop as soon as there is no improvement (beyond max_err tolerated misses)
                    break

            if self.max_iterations <= iteration or (
                    len(current_features) +
                    self.block_size) > self.max_features:
                break
        if self.verbose:
            progress_bar.close()
            print(
                f"\nFinal number of included features: {len(current_features)} - Final Score: {current_score}"
            )
        self.final_feature_constructors = current_features
        return self

    def transform(self, X, y=None):
        check_is_fitted(self)
        if isinstance(y, pd.DataFrame):
            y = y.to_numpy()
        if self.encode_data:
            X = self.feature_encoder_.transform(X)
            if y is not None:
                y = self.class_encoder_.transform(y)
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        new_X = []
        for feature_constructor in self.final_feature_constructors:
            new_X.append(feature_constructor.transform(X))
        return np.concatenate(new_X, axis=1), y
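
A hypothetical usage sketch of the class above (the surrounding tfg imports are assumed to be available; the dataset is illustrative):

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
ranker = RankerLogicalFeatureConstructor(strategy="eager",
                                         block_size=5,
                                         max_features=20)
ranker.fit(X, y)                                # SU ranking + greedy wrapper search
print(len(ranker.final_feature_constructors))   # size of the selected subset
print(ranker.score(X, y))                       # accuracy of the wrapped NaiveBayes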
Example #3
class PazzaniWrapperNB(PazzaniWrapper):
    '''Optimized version of Pazzani's wrapper for the Naive Bayes classifier.

    Uses leave-one-out cross-validation and incremental updates of the
    classifier (adding, deleting and joining features) instead of refitting.
    '''
    def __init__(self, seed=None, strategy="BSEJ", verbose=0):
        super().__init__(seed=seed,
                         strategy=strategy,
                         verbose=verbose,
                         cv=None)

    def _generate_neighbors_bsej(self, current_columns, X):
        if X.shape[1] > 1:
            for column_to_drop in range(X.shape[1]):
                new_columns = current_columns.copy()
                del new_columns[column_to_drop]
                yield new_columns, column_to_drop, None, True  # Updated column list, columns to remove, columns to add, delete?
            for features in combinations(np.arange(X.shape[1]), 2):
                new_col_name = flatten([
                    current_columns[features[0]], current_columns[features[1]]
                ])
                new_columns = current_columns.copy()
                new_columns.append(tuple(new_col_name))
                columns_to_drop = sorted(features, reverse=True)
                del new_columns[columns_to_drop[0]]
                del new_columns[columns_to_drop[1]]

                combined_columns = combine_columns(X, list(features))
                yield new_columns, list(
                    columns_to_drop), combined_columns, False

    def fit_bsej(self, X, y):
        self.evaluate = memoize(_evaluate, attribute_to_cache="columns")
        current_best = X.copy()
        current_columns = deque(range(X.shape[1]))
        best_score = self.evaluate(self.classifier,
                                   current_best,
                                   y,
                                   columns=current_columns,
                                   fit=True)
        stop = False
        while not stop:
            update = False
            stop = True
            if self.verbose:
                print("Current Best: ", current_columns, " Score: ",
                      best_score)
            for new_columns, columns_to_delete, columns_to_add, delete in self._generate_neighbors_bsej(
                    current_columns, current_best):
                if delete:
                    action = "DELETE"
                    # Update classifier and get validation result
                    self.classifier.remove_feature(columns_to_delete)
                    neighbor = np.delete(current_best,
                                         columns_to_delete,
                                         axis=1)
                    score = self.evaluate(self.classifier,
                                          neighbor,
                                          y,
                                          columns=new_columns,
                                          fit=False)

                    # Restore the column for the next iteration
                    self.classifier.add_features(
                        current_best[:, columns_to_delete].reshape(-1, 1),
                        y,
                        index=[columns_to_delete])
                else:
                    action = "ADD"
                    self.classifier.add_features(columns_to_add, y)
                    self.classifier.remove_feature(columns_to_delete[0])
                    self.classifier.remove_feature(columns_to_delete[1])

                    neighbor = np.delete(current_best,
                                         columns_to_delete,
                                         axis=1)
                    neighbor = np.concatenate([neighbor, columns_to_add],
                                              axis=1)

                    score = self.evaluate(self.classifier,
                                          neighbor,
                                          y,
                                          columns=new_columns,
                                          fit=False)
                    if self.classifier.n_features_ == 1:
                        # Restore the two original columns, then drop the combined one (at index 0)
                        self.classifier.add_features(
                            current_best[:, columns_to_delete], y)
                        self.classifier.remove_feature(0)
                    else:
                        self.classifier.remove_feature(neighbor.shape[1] - 1)
                        # columns_to_delete is kept in reverse order so the insertion indices line up
                        self.classifier.add_features(
                            current_best[:, columns_to_delete],
                            y,
                            index=columns_to_delete)

                if self.verbose == 2:
                    print("\tNeighbor: ", new_columns, " Score: ", score)
                if score > best_score:
                    stop = False
                    best_columns = new_columns
                    best_action = action
                    best_score = score
                    best_columns_to_delete = columns_to_delete
                    update = True
                    if best_action == "ADD":
                        best_columns_to_add = columns_to_add
                    if score == 1.0:
                        stop = True
                        break
            if update:
                current_columns = best_columns
                if best_action == "DELETE":
                    current_best = np.delete(current_best,
                                             best_columns_to_delete,
                                             axis=1)
                    # Update best
                    self.classifier.remove_feature(best_columns_to_delete)
                else:
                    current_best = np.delete(current_best,
                                             best_columns_to_delete,
                                             axis=1)
                    current_best = np.concatenate(
                        [current_best, best_columns_to_add], axis=1)
                    # Update classifier
                    self.classifier.add_features(best_columns_to_add, y)
                    self.classifier.remove_feature(best_columns_to_delete[0])
                    self.classifier.remove_feature(best_columns_to_delete[1])

        if self.verbose:
            print("Final best: ", list(current_columns), " Score: ",
                  best_score)
        self.features_ = current_columns
        self.feature_transformer = lambda X: join_columns(
            X, columns=self.features_)
        self.classifier.fit(self.feature_transformer(X), y)
        return self

    def _generate_neighbors_fssj(self, current_columns, individual,
                                 original_data, available_columns):
        if available_columns:
            for index, col in enumerate(available_columns):
                new_columns = current_columns.copy()
                new_columns.append(col)
                new_available_columns = available_columns.copy()
                del new_available_columns[index]
                column_to_add = original_data[:, col].reshape(-1, 1)
                column_to_delete = None
                # new columns, new available columns, column to delete, column to add, delete?
                yield new_columns, new_available_columns, column_to_delete, column_to_add, False
        if individual is not None and individual.shape[
                1] > 0 and available_columns:
            for features_index in product(np.arange(len(available_columns)),
                                          np.arange(len(current_columns))):
                features = available_columns[
                    features_index[0]], current_columns[features_index[1]]
                new_col_name = flatten([features[0], features[1]])

                new_available_columns = available_columns.copy()
                del new_available_columns[features_index[0]]

                new_columns = current_columns.copy()
                new_columns.append(tuple(new_col_name))
                del new_columns[features_index[1]]

                separated_columns = np.concatenate([
                    original_data[:, features[0]].reshape(-1, 1),
                    individual[:, features_index[1]].reshape(-1, 1)
                ],
                                                   axis=1)
                if isinstance(features[1], tuple):
                    features = list(features)
                    features[1] = list(features[1])
                    features = tuple(features)
                column_to_delete = features_index[1]
                combined_columns = combine_columns(separated_columns)
                column_to_add = combined_columns
                yield new_columns, new_available_columns, column_to_delete, column_to_add, True

    def fit_fssj(self, X, y):
        self.evaluate = memoize(_evaluate, attribute_to_cache="columns")
        current_best = None
        current_columns = deque()
        available_columns = list(range(X.shape[1]))
        best_score = -float("inf")
        stop = False
        while not stop:
            update = False
            stop = True
            if self.verbose:
                print("Current Best: ", current_columns, " Score: ",
                      best_score, "Available columns: ", available_columns)
            for new_columns, new_available_columns, column_to_delete, column_to_add, delete in self._generate_neighbors_fssj(
                    current_columns=current_columns,
                    individual=current_best,
                    original_data=X,
                    available_columns=available_columns):
                if delete:
                    action = "JOIN"
                    # Update classifier and get validation result
                    self.classifier.add_features(column_to_add, y)
                    self.classifier.remove_feature(column_to_delete)

                    neighbor = np.concatenate([current_best, column_to_add],
                                              axis=1)
                    neighbor = np.delete(neighbor, column_to_delete, axis=1)
                    score = self.evaluate(self.classifier,
                                          neighbor,
                                          y,
                                          columns=new_columns,
                                          fit=False)

                    # Restore the column for the next iteration
                    if neighbor.shape[1] == 1:
                        self.classifier.fit(current_best, y)
                    else:
                        self.classifier.remove_feature(neighbor.shape[1] - 1)
                        self.classifier.add_features(
                            current_best[:, column_to_delete].reshape(-1, 1),
                            y,
                            index=[column_to_delete])

                else:
                    action = "ADD"
                    if current_best is None:
                        neighbor = column_to_add
                        self.classifier.fit(neighbor, y)
                    else:
                        neighbor = np.concatenate(
                            [current_best, column_to_add], axis=1)
                        self.classifier.add_features(column_to_add, y)

                    score = self.evaluate(self.classifier,
                                          neighbor,
                                          y,
                                          columns=new_columns,
                                          fit=False)

                    if current_best is None:
                        self.classifier = NaiveBayes(encode_data=True)
                    else:
                        self.classifier.remove_feature(neighbor.shape[1] - 1)

                if self.verbose == 2:
                    print("\tNeighbour: ", new_columns, " Score: ", score,
                          "Available columns: ", new_available_columns)

                if score > best_score:
                    stop = False
                    best_columns = new_columns
                    best_available_columns = new_available_columns
                    best_action = action
                    best_score = score
                    best_column_to_delete = column_to_delete
                    best_column_to_add = column_to_add
                    update = True
                    if score == 1.0:
                        stop = True
                        break
            if update:
                current_columns = best_columns
                available_columns = best_available_columns
                if best_action == "JOIN":
                    self.classifier.add_features(best_column_to_add, y)
                    self.classifier.remove_feature(best_column_to_delete)

                    current_best = np.concatenate(
                        [current_best, best_column_to_add], axis=1)
                    current_best = np.delete(current_best,
                                             best_column_to_delete,
                                             axis=1)
                else:
                    if current_best is None:
                        current_best = best_column_to_add
                        self.classifier.fit(current_best, y)
                    else:
                        current_best = np.concatenate(
                            [current_best, best_column_to_add], axis=1)
                        self.classifier.add_features(best_column_to_add, y)

        if self.verbose:
            print("Final best: ", list(current_columns), " Score: ",
                  best_score)
        self.features_ = current_columns
        self.feature_transformer = lambda X: join_columns(
            X, columns=self.features_)
        self.classifier.fit(self.feature_transformer(X), y)
        return self

    def evaluate(self, classifier, X, y, fit=True, columns=None):
        return _evaluate(classifier, X, y, fit=fit, columns=columns)
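
A hypothetical usage sketch of the wrapper above (the discrete data is illustrative; PazzaniWrapper and the NaiveBayes classifier are assumed to come from the surrounding tfg package):

import numpy as np

rng = np.random.default_rng(0)
X = rng.integers(0, 4, size=(100, 6))   # categorical features encoded as integers
y = rng.integers(0, 2, size=100)

wrapper = PazzaniWrapperNB(seed=0, strategy="BSEJ", verbose=1)
wrapper.fit_bsej(X, y)            # backward elimination with feature joining (BSEJ)
print(list(wrapper.features_))    # surviving columns; tuples denote joined features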