def test_add_features_with_index():
    """Adding held-out columns back by index must match a fit on all columns.

    The classifier is fitted without four columns, those columns are then
    inserted at their original positions with ``add_features``, and the
    resulting state and predicted probabilities are compared against the
    same estimator refitted on the complete data.
    """
    dataset_options = dict(
        n_samples=1000,
        n_features=100,
        n_informative=2,
        n_redundant=0,
        n_repeated=0,
        n_classes=2,
        n_clusters_per_class=1,
        weights=None,
        class_sep=1.0,
        hypercube=True,
        scale=2.0,
        shuffle=True,
        random_state=0,
    )
    X, y = make_classification(**dataset_options)
    full_data = X.copy()
    held_out = [0, 8, 9, 20]
    reduced_data = np.delete(full_data, held_out, axis=1)

    model = CustomNaiveBayes(encode_data=True)
    model.fit(reduced_data, y)
    model.add_features(full_data[:, held_out], y, index=held_out)

    # Snapshot of the incrementally extended model.
    incremental_term = model.indepent_term_
    incremental_counts = model.smoothed_log_counts_
    incremental_proba = model.predict_proba(X)

    # Reference: the same estimator refitted on every column.
    model.fit(X, y)
    reference_proba = model.predict_proba(X)

    assert np.allclose(model.indepent_term_, incremental_term)
    assert np.allclose(model.smoothed_log_counts_, incremental_counts)
    assert np.allclose(reference_proba, incremental_proba)
class PazzaniWrapperNB(PazzaniWrapper):
    """Optimized version of Pazzani's wrapper for the Naive Bayes classifier.

    Candidate subsets are scored by updating the already fitted classifier
    in place (``add_features`` / ``remove_feature``) and restoring it
    afterwards, instead of refitting it for every candidate.

    LOO cross validation
    Update, add, delete features

    Parameters
    ----------
    seed : int or None, default=None
        Forwarded to ``PazzaniWrapper``.
    strategy : str, default="BSEJ"
        Forwarded to ``PazzaniWrapper``.
    verbose : int, default=0
        Any truthy value prints the current best subset on every round;
        ``2`` additionally prints every evaluated neighbour.
    """

    def __init__(self, seed=None, strategy="BSEJ", verbose=0):
        super().__init__(seed=seed,
                         strategy=strategy,
                         verbose=verbose,
                         cv=None)

    def _generate_neighbors_bsej(self, current_columns, X):
        """Yield the neighbours of the current subset for the backward search.

        Two kinds of neighbour are produced: the subset with one column
        removed, and the subset where a pair of columns is replaced by
        their combination (as built by ``combine_columns``).

        Parameters
        ----------
        current_columns : deque
            Names of the columns of ``X``; a joined column is a tuple.
        X : ndarray
            Data of the current subset, one column per entry of
            ``current_columns``.

        Yields
        ------
        tuple
            ``(new_columns, columns_to_drop, columns_to_add, delete)``:
            updated column list, position(s) to remove, data to append
            (``None`` for a deletion) and whether it is a deletion.
        """
        if X.shape[1] > 1:
            for column_to_drop in range(X.shape[1]):
                new_columns = current_columns.copy()
                del new_columns[column_to_drop]
                # Updated column list, columns to remove, columns to add, delete?
                yield new_columns, column_to_drop, None, True
            for features in combinations(np.arange(X.shape[1]), 2):
                new_col_name = flatten([
                    current_columns[features[0]],
                    current_columns[features[1]]
                ])
                new_columns = current_columns.copy()
                new_columns.append(tuple(new_col_name))
                # Delete the larger position first so the smaller one
                # is still valid afterwards.
                columns_to_drop = sorted(features, reverse=True)
                del new_columns[columns_to_drop[0]]
                del new_columns[columns_to_drop[1]]
                combined_columns = combine_columns(X, list(features))
                yield new_columns, list(
                    columns_to_drop), combined_columns, False

    def fit_bsej(self, X, y):
        """Run the backward search (delete a column or join a pair).

        Starting from every column, each round scores all neighbours from
        ``_generate_neighbors_bsej`` and moves to the best one that
        strictly improves the score. The search ends when no neighbour
        improves or a score of exactly 1.0 is found. The classifier is
        finally refitted on the selected (joined) columns.

        Parameters
        ----------
        X : ndarray
            Training data; must support ``copy`` and column indexing.
        y : array-like
            Target values.

        Returns
        -------
        self
        """
        # Shadows the ``evaluate`` method with a memoised evaluator keyed
        # on the column subset.
        self.evaluate = memoize(_evaluate, attribute_to_cache="columns")
        current_best = X.copy()
        current_columns = deque(range(X.shape[1]))
        best_score = self.evaluate(self.classifier,
                                   current_best,
                                   y,
                                   columns=current_columns,
                                   fit=True)
        stop = False
        while not stop:
            update = False
            stop = True
            if self.verbose:
                print("Current Best: ", current_columns, " Score: ",
                      best_score)
            neighbors = self._generate_neighbors_bsej(current_columns,
                                                      current_best)
            for new_columns, columns_to_delete, columns_to_add, delete in neighbors:
                if delete:
                    action = "DELETE"
                    # Update classifier and get validation result
                    self.classifier.remove_feature(columns_to_delete)
                    neighbor = np.delete(current_best,
                                         columns_to_delete,
                                         axis=1)
                    score = self.evaluate(self.classifier,
                                          neighbor,
                                          y,
                                          columns=new_columns,
                                          fit=False)
                    # Restore the column for the next iteration
                    self.classifier.add_features(
                        current_best[:, columns_to_delete].reshape(-1, 1),
                        y,
                        index=[columns_to_delete])
                else:
                    action = "ADD"
                    self.classifier.add_features(columns_to_add, y)
                    self.classifier.remove_feature(columns_to_delete[0])
                    self.classifier.remove_feature(columns_to_delete[1])
                    neighbor = np.delete(current_best,
                                         columns_to_delete,
                                         axis=1)
                    neighbor = np.concatenate([neighbor, columns_to_add],
                                              axis=1)
                    score = self.evaluate(self.classifier,
                                          neighbor,
                                          y,
                                          columns=new_columns,
                                          fit=False)
                    # Restore the classifier: put the two original columns
                    # back and drop the joined one.
                    if self.classifier.n_features_ == 1:
                        # Only the joined column is left: add the originals
                        # first, then remove the joined column at position 0.
                        # We reverse it for insert order
                        self.classifier.add_features(
                            current_best[:, columns_to_delete], y)
                        self.classifier.remove_feature(0)
                    else:
                        self.classifier.remove_feature(neighbor.shape[1] - 1)
                        # We reverse it for insert order
                        self.classifier.add_features(
                            current_best[:, columns_to_delete],
                            y,
                            index=columns_to_delete)
                if self.verbose == 2:
                    print("\tNeighbor: ", new_columns, " Score: ", score)
                if score > best_score:
                    stop = False
                    best_columns = new_columns
                    best_action = action
                    best_score = score
                    best_columns_to_delete = columns_to_delete
                    update = True
                    if best_action == "ADD":
                        best_columns_to_add = columns_to_add
                    if score == 1.0:
                        # Nothing can beat a perfect score: end the search.
                        stop = True
                        break
            if update:
                current_columns = best_columns
                if best_action == "DELETE":
                    current_best = np.delete(current_best,
                                             best_columns_to_delete,
                                             axis=1)
                    # Update best
                    self.classifier.remove_feature(best_columns_to_delete)
                else:
                    current_best = np.delete(current_best,
                                             best_columns_to_delete,
                                             axis=1)
                    current_best = np.concatenate(
                        [current_best, best_columns_to_add], axis=1)
                    # Update classifier
                    self.classifier.add_features(best_columns_to_add, y)
                    self.classifier.remove_feature(best_columns_to_delete[0])
                    self.classifier.remove_feature(best_columns_to_delete[1])
        if self.verbose:
            print("Final best: ", list(current_columns), " Score: ",
                  best_score)
        self.features_ = current_columns
        self.feature_transformer = lambda X: join_columns(
            X, columns=self.features_)
        self.classifier.fit(self.feature_transformer(X), y)
        return self

    def _generate_neighbors_fssj(self, current_columns, individual,
                                 original_data, available_columns):
        """Yield the neighbours of the current subset for the forward search.

        Two kinds of neighbour are produced: the subset extended with one
        still-available original column, and the subset where one selected
        column is replaced by its combination with an available column.

        Parameters
        ----------
        current_columns : deque
            Names of the columns already selected.
        individual : ndarray or None
            Data of the selected columns (``None`` before any selection).
        original_data : ndarray
            Full training data, indexed by original column position.
        available_columns : list
            Original column positions not yet used.

        Yields
        ------
        tuple
            ``(new_columns, new_available_columns, column_to_delete,
            column_to_add, delete)``; ``column_to_delete`` is ``None`` and
            ``delete`` is ``False`` for a plain addition.
        """
        if available_columns:
            for index, col in enumerate(available_columns):
                new_columns = current_columns.copy()
                new_columns.append(col)
                new_available_columns = available_columns.copy()
                del new_available_columns[index]
                column_to_add = original_data[:, col].reshape(-1, 1)
                column_to_delete = None
                # New columns, Availables,ColumnToDelete,ColumnToAdd,Delete?
                yield new_columns, new_available_columns, column_to_delete, column_to_add, False
        if individual is not None and individual.shape[
                1] > 0 and available_columns:
            for features_index in product(np.arange(len(available_columns)),
                                          np.arange(len(current_columns))):
                features = available_columns[
                    features_index[0]], current_columns[features_index[1]]
                new_col_name = flatten([features[0], features[1]])
                new_available_columns = available_columns.copy()
                del new_available_columns[features_index[0]]
                new_columns = current_columns.copy()
                new_columns.append(tuple(new_col_name))
                del new_columns[features_index[1]]
                separated_columns = np.concatenate([
                    original_data[:, features[0]].reshape(-1, 1),
                    individual[:, features_index[1]].reshape(-1, 1)
                ],
                                                   axis=1)
                if isinstance(features[1], tuple):
                    features = list(features)
                    features[1] = list(features[1])
                    features = tuple(features)
                column_to_delete = features_index[1]
                combined_columns = combine_columns(separated_columns)
                column_to_add = combined_columns
                yield new_columns, new_available_columns, column_to_delete, column_to_add, True

    def fit_fssj(self, X, y):
        """Run the forward search (add a column or join one to a selected one).

        Starting from an empty subset, each round scores all neighbours
        from ``_generate_neighbors_fssj`` and moves to the best one that
        strictly improves the score. The search ends when no neighbour
        improves or a score of exactly 1.0 is found. The classifier is
        finally refitted on the selected (joined) columns.

        Parameters
        ----------
        X : ndarray
            Training data, indexed by column position.
        y : array-like
            Target values.

        Returns
        -------
        self
        """
        # Shadows the ``evaluate`` method with a memoised evaluator keyed
        # on the column subset.
        self.evaluate = memoize(_evaluate, attribute_to_cache="columns")
        current_best = None
        current_columns = deque()
        available_columns = list(range(X.shape[1]))
        best_score = -float("inf")
        stop = False
        while not stop:
            update = False
            stop = True
            # self.classifier.encode_data=True
            if self.verbose:
                print("Current Best: ", current_columns, " Score: ",
                      best_score, "Available columns: ", available_columns)
            neighbors = self._generate_neighbors_fssj(
                current_columns=current_columns,
                individual=current_best,
                original_data=X,
                available_columns=available_columns)
            for new_columns, new_available_columns, column_to_delete, column_to_add, delete in neighbors:
                if delete:
                    action = "JOIN"
                    # Update classifier and get validation result
                    self.classifier.add_features(column_to_add, y)
                    self.classifier.remove_feature(column_to_delete)
                    neighbor = np.concatenate([current_best, column_to_add],
                                              axis=1)
                    neighbor = np.delete(neighbor, column_to_delete, axis=1)
                    score = self.evaluate(self.classifier,
                                          neighbor,
                                          y,
                                          columns=new_columns,
                                          fit=False)
                    # Restore the column for the next iteration
                    if neighbor.shape[1] == 1:
                        self.classifier.fit(current_best, y)
                    else:
                        self.classifier.remove_feature(neighbor.shape[1] - 1)
                        self.classifier.add_features(
                            current_best[:, column_to_delete].reshape(-1, 1),
                            y,
                            index=[column_to_delete])
                else:
                    action = "ADD"
                    if current_best is None:
                        neighbor = column_to_add
                        self.classifier.fit(neighbor, y)
                    else:
                        neighbor = np.concatenate(
                            [current_best, column_to_add], axis=1)
                        self.classifier.add_features(column_to_add, y)
                    score = self.evaluate(self.classifier,
                                          neighbor,
                                          y,
                                          columns=new_columns,
                                          fit=False)
                    # Undo the addition before trying the next neighbour.
                    if current_best is None:
                        self.classifier = NaiveBayes(encode_data=True)
                    else:
                        self.classifier.remove_feature(neighbor.shape[1] - 1)
                if self.verbose == 2:
                    print("\tNeighbour: ", new_columns, " Score: ", score,
                          "Available columns: ", new_available_columns)
                if score > best_score:
                    stop = False
                    best_columns = new_columns
                    best_available_columns = new_available_columns
                    best_action = action
                    best_score = score
                    best_column_to_delete = column_to_delete
                    best_column_to_add = column_to_add
                    update = True
                    if score == 1.0:
                        # Nothing can beat a perfect score: end the search.
                        stop = True
                        break
            if update:
                current_columns = best_columns
                available_columns = best_available_columns
                if best_action == "JOIN":
                    self.classifier.add_features(best_column_to_add, y)
                    self.classifier.remove_feature(best_column_to_delete)
                    current_best = np.concatenate(
                        [current_best, best_column_to_add], axis=1)
                    current_best = np.delete(current_best,
                                             best_column_to_delete,
                                             axis=1)
                else:
                    if current_best is None:
                        current_best = best_column_to_add
                        self.classifier.fit(current_best, y)
                    else:
                        current_best = np.concatenate(
                            [current_best, best_column_to_add], axis=1)
                        self.classifier.add_features(best_column_to_add, y)
        if self.verbose:
            print("Final best: ", list(current_columns), " Score: ",
                  best_score)
        self.features_ = current_columns
        self.feature_transformer = lambda X: join_columns(
            X, columns=self.features_)
        self.classifier.fit(self.feature_transformer(X), y)
        return self

    def evaluate(self, classifier, X, y, fit=True, columns=None):
        """Score ``classifier`` on ``X``/``y`` through ``_evaluate``.

        This un-memoised method is what ``self.evaluate`` resolves to
        until ``fit_bsej`` or ``fit_fssj`` replaces it on the instance
        with a memoised evaluator.

        Parameters
        ----------
        classifier : object
            Classifier handed to ``_evaluate``.
        X, y : array-like
            Data and target handed to ``_evaluate``.
        fit : bool, default=True
            Forwarded to ``_evaluate``.
        columns : sequence or None, default=None
            Forwarded to ``_evaluate``.

        Returns
        -------
        The value returned by ``_evaluate``.
        """
        # Forward the caller's arguments rather than hard-coded defaults.
        return _evaluate(classifier, X, y, fit=fit, columns=columns)
class RankerLogicalFeatureConstructor(TransformerMixin, ClassifierMixin,
                                      BaseEstimator):
    """First proposal: Hybrid-Ranker Wrapper.

    Build a ranking based on Symmetrical Uncertainty (SU) of every possible
    logical feature of depth 1 (1 operator, 2 operands), using XOR, AND and
    OR operator.

    The steps are:
        - Find out combinations of values in database of every pair of
          features Xi, Xj:
            - Example:
                - Xi = [1,2,3,2]
                - Xj = ['a','b','c','a']
                Possible combinations:
                    [(1,'a'),(2,'b'),(3,'c'),(2,'a')]
        - Apply operator to every combination:
            - Example:
                - Xi = [1,2,3,2]
                - Xj = ['a','b','c','a']
                Possible combinations:
                    [(1,'a','AND'),(2,'b','AND'),(3,'c','AND'),(2,'a','AND'),
                     (1,'a','OR'),(2,'b','OR'),(3,'c','OR'),(2,'a','OR'),
                     (1,'a','XOR'),(2,'b','XOR'),(3,'c','XOR'),(2,'a','XOR')]
        - Add original variables to the list
        - Evaluate SU for every value in the list, and rank them
        - Go over the list following one of the two strategies proposed and
          evaluate the subset based on a leave-one-out cross-validation with
          the NaiveBayes classifier.

    Parameters
    ----------
    strategy : {"eager", "skip"}, default="eager"
        After the ranking is built, the eager strategy stops considering
        attributes once more than ``max_err`` consecutive iterations bring
        no improvement; any other value raises ``ValueError``.

    block_size : int, default=10
        Number of features that are added in each iteration. Values below
        1 are replaced by 1.

    encode_data : boolean, default=True
        Whether or not to encode the received data. If set to false the
        classifier expects data to be encoded with an ordinal encoder.

    n_intervals : int, default=5
        Passed to ``CustomOrdinalFeatureEncoder`` and to ``NaiveBayes``.

    verbose : {boolean,int}, default=0
        If set to true it displays information of the remaining time and
        inside variables.

    operators : array-like, default=("AND", "OR", "XOR")
        Operators used for the constructed features.

    max_features : int, default=inf
        Maximum number of features to include in the selected subset.

    max_iterations : int, default=inf
        Maximum number of iterations in the wrapper step.

    metric : str, default="accuracy"
        Passed to ``NaiveBayes``.

    use_initials : bool, default=False
        Force the set of initial features in the final solution. The set is
        trimmed with a backward elimination beforehand.

    max_err : int, default=0
        Number of consecutive non-improving iterations tolerated by the
        eager strategy before the search stops.

    prune : int or None, default=None
        When given (and ``use_graph`` is false), constructed features are
        only built from the ``prune`` feature pairs with the highest SU.

    use_graph : bool, default=False
        Generate Ranking from features obtained from the pruned-graph of
        the ACO algorithm. (Experimentation not carried out)

    Attributes
    ----------
    feature_encoder_ : CustomOrdinalFeatureEncoder or None
        Encodes data in ordinal way with unseen values handling if
        encode_data is set to True.

    class_encoder_ : LabelEncoder or None
        Encodes Data in ordinal way for the class if encode_data is set to
        True.

    all_feature_constructors : array-like
        List of FeatureConstructor objects with all the possible logical
        features.

    symmetrical_uncertainty_rank : array-like
        SU for every feature in all_feature_constructors.

    rank : array-like
        Array of indexes corresponding to the sorted SU rank (in descending
        order).

    final_feature_constructors :
        Selected feature subset (list of constructors).

    classifier : NaiveBayes
        Classifier used in the wrapper and to perform predictions after
        fitting.
    """

    def __init__(self,
                 strategy="eager",
                 block_size=10,
                 encode_data=True,
                 n_intervals=5,
                 verbose=0,
                 operators=("AND", "OR", "XOR"),
                 max_features=float("inf"),
                 max_iterations=float("inf"),
                 metric="accuracy",
                 use_initials=False,
                 max_err=0,
                 prune=None,
                 use_graph=False):
        self.strategy = strategy
        self.block_size = max(block_size, 1)
        self.encode_data = encode_data
        self.verbose = verbose
        self.operators = operators
        self.max_features = max_features
        self.max_iterations = max_iterations
        self.n_intervals = n_intervals
        self.metric = metric
        self.max_err = max_err
        self.use_initials = use_initials
        self.prune = prune
        self.use_graph = use_graph
        allowed_strategies = ("eager", "skip")
        if self.strategy not in allowed_strategies:
            raise ValueError(
                f"Unknown strategy type: {self.strategy}, "
                f"expected one of {allowed_strategies}.")

    def fit(self, X, y):
        """Build the SU ranking of candidate features and select a subset.

        Parameters
        ----------
        X : array-like or DataFrame
            Training data.
        y : array-like or DataFrame
            Target values.

        Returns
        -------
        self
        """
        # Parse input
        if isinstance(y, pd.DataFrame):
            y = y.to_numpy()
        if self.encode_data:
            self.feature_encoder_ = CustomOrdinalFeatureEncoder(
                n_intervals=self.n_intervals)
            self.class_encoder_ = CustomLabelEncoder()
            X = self.feature_encoder_.fit_transform(X)
            y = self.class_encoder_.fit_transform(y)
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        check_X_y(X, y)
        # Reset the stored results for new fit
        self.reset_evaluation()
        # Generate rank
        if self.use_graph:
            # Construct the minimum graph and create rank
            graph = AntFeatureGraphMI(seed=None, connections=1).compute_graph(
                X, y, ("AND", "OR", "XOR"))
            self.all_feature_constructors = graph.get_rank()
        elif self.prune is not None:
            # Construct the rank with pruning by selecting pairs that
            # maximise SU(X_iX_j,Y)
            feature_combinations = list(
                combinations(list(range(X.shape[1])),
                             2)) + [(i, i) for i in range(X.shape[1])]
            rank_pairs = [
                symmetrical_uncertainty_two_variables(X[:, i], X[:, j], y)
                for i, j in feature_combinations
            ]
            rank_pairs_index = np.argsort(rank_pairs)[::-1]
            # Create the unsorted list
            self.all_feature_constructors = []
            for index in rank_pairs_index[:self.prune]:
                i, j = feature_combinations[index]
                if i == j:
                    from tfg.feature_construction import create_feature
                    self.all_feature_constructors.extend([
                        create_feature("OR", [(i, n), (i, m)])
                        for n, m in combinations(np.unique(X[:, i]), 2)
                    ])
                else:
                    self.all_feature_constructors.extend(
                        construct_features(X[:, [i, j]],
                                           operators=self.operators,
                                           same_feature=False))
        else:
            # Create the unsorted list of all features
            self.all_feature_constructors = construct_features(
                X, operators=self.operators)
        if self.verbose:
            print(
                f"Total number of constructed features: {len(self.all_feature_constructors)}"
            )
        # NOTE(review): the original features are appended for every
        # ranking mode here — confirm this is intended for use_graph.
        self.all_feature_constructors.extend(
            [DummyFeatureConstructor(j) for j in range(X.shape[1])])
        self.symmetrical_uncertainty_rank = []
        # Sort the ranking
        for feature_constructor in self.all_feature_constructors:
            feature = feature_constructor.transform(X)
            su = symmetrical_uncertainty(f1=feature.flatten(), f2=y)
            self.symmetrical_uncertainty_rank.append(su)
        # Store the descending order index
        self.rank = np.argsort(self.symmetrical_uncertainty_rank)[::-1]
        # If the initial variables are forced into the solution, trim them
        # first with a backward search
        if self.use_initials:
            classifier = NaiveBayes(encode_data=False,
                                    n_intervals=self.n_intervals,
                                    metric=self.metric)
            classifier.fit(X, y)
            current_features = [
                DummyFeatureConstructor(j) for j in range(X.shape[1])
            ]
            # Store the backward result to reuse it for other executions
            self.initial_backward_features = backward_search(
                X, y, current_features, classifier)
        # Feature Subset Selection (FSS) from the rank
        self.filter_features(X, y)
        return self

    def predict(self, X):
        """Predict classes for ``X`` using the selected features.

        Predictions are decoded with ``class_encoder_`` when
        ``encode_data`` is set.
        """
        X, _ = self.transform(X)
        predictions = self.classifier.predict(X)
        if self.encode_data:
            return self.class_encoder_.inverse_transform(predictions)
        return predictions

    def reset_evaluation(self):
        """Discard cached leave-one-out evaluations."""
        # Reset the memoize evaluations
        self.evaluate_leave_one_out_cross_val = memoize(evaluate_leave_one_out)

    def predict_proba(self, X):
        """Return the classifier's class probabilities for ``X``."""
        X, _ = self.transform(X)
        return self.classifier.predict_proba(X)

    def score(self, X, y):
        """Return the classifier's score on the transformed ``X`` and ``y``."""
        X, y = self.transform(X, y)
        return self.classifier.score(X, y)

    def filter_features(self, X, y):
        """Run the greedy wrapper search over the ranking built by ``fit``.

        Features are taken from the ranking in blocks of ``block_size``.
        Without ``use_initials`` the first block is always accepted; every
        later block is kept only when it strictly improves the
        leave-one-out score, otherwise it is removed from the classifier
        again. Sets ``classifier`` and ``final_feature_constructors``.

        Parameters
        ----------
        X : ndarray
            Encoded training data.
        y : ndarray
            Encoded target values.

        Returns
        -------
        self
        """
        check_is_fitted(self)
        self.classifier = NaiveBayes(encode_data=False,
                                     n_intervals=self.n_intervals,
                                     metric=self.metric)
        # ``np.NINF`` was removed in NumPy 2.0; ``-np.inf`` is the same value.
        current_score = -np.inf
        current_features = []
        current_data = None
        if self.use_initials:
            # Original Features have already been taken into account
            rank_iter = filter(
                lambda x: not isinstance(self.all_feature_constructors[x],
                                         DummyFeatureConstructor),
                iter(self.rank))
            # Deep copy to avoid issues when modifying the list
            current_features = deepcopy(self.initial_backward_features)
            current_data = np.concatenate(
                [f.transform(X) for f in current_features], axis=1)
            # Get initial LOO score
            current_score = self.evaluate_leave_one_out_cross_val(
                self.classifier, current_features, current_data, y, fit=True)
        else:
            # Iterator over the sorted list of indexes
            rank_iter = iter(self.rank)
        if self.verbose:
            progress_bar = tqdm(total=len(self.rank),
                                bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}')
        iteration = 0
        iterations_without_improvements = 0
        # Loop for including {block size} elements at a time
        # Rank is an iterator, so the for loop is not sequential!
        for feature_constructor_index in rank_iter:
            iteration += 1
            if self.verbose:
                progress_bar.set_postfix({
                    "n_features": len(current_features),
                    "score": current_score
                })
                progress_bar.update(1)
                progress_bar.refresh()
            # Add block size features
            first_constructor = self.all_feature_constructors[
                feature_constructor_index]
            new_X = [first_constructor.transform(X)]
            selected_features = [first_constructor]
            for _ in range(self.block_size - 1):
                try:
                    index = next(rank_iter)
                except StopIteration:
                    # Block size does not divide the number of elements in
                    # the rank: evaluate the partial block.
                    break
                selected_features.append(self.all_feature_constructors[index])
                new_X.append(
                    self.all_feature_constructors[index].transform(X))
                if self.verbose:
                    progress_bar.update(1)
                    progress_bar.refresh()
            # Evaluate features
            new_X = np.concatenate(new_X, axis=1)
            if iteration == 1 and not self.use_initials:
                # The first block is accepted unconditionally and fits the
                # classifier.
                current_data = new_X
                current_score = self.evaluate_leave_one_out_cross_val(
                    self.classifier,
                    selected_features,
                    current_data,
                    y,
                    fit=True)
                current_features = selected_features
                if self.max_iterations <= iteration or (
                        len(current_features) +
                        self.block_size) > self.max_features:
                    break
                continue
            data = np.concatenate([current_data, new_X], axis=1)
            self.classifier.add_features(new_X, y)
            # LOO evaluation
            score = self.evaluate_leave_one_out_cross_val(
                self.classifier,
                current_features + selected_features,
                data,
                y,
                fit=False)
            if score > current_score:
                current_score = score
                current_data = data
                current_features.extend(selected_features)
                iterations_without_improvements = 0
            else:
                iterations_without_improvements += 1
                # Remove last added block
                for feature_index_to_remove in range(
                        data.shape[1], data.shape[1] - new_X.shape[1], -1):
                    self.classifier.remove_feature(feature_index_to_remove -
                                                   1)
                if self.strategy == "eager" and self.max_err < iterations_without_improvements:
                    # Stops as soon as no improvement
                    break
            if self.max_iterations <= iteration or (
                    len(current_features) +
                    self.block_size) > self.max_features:
                break
        if self.verbose:
            progress_bar.close()
            print(
                f"\nFinal number of included features: {len(current_features)} - Final Score: {current_score}"
            )
        self.final_feature_constructors = current_features
        return self

    def transform(self, X, y=None):
        """Build the selected features from ``X``.

        Parameters
        ----------
        X : array-like or DataFrame
            Data to transform; encoded first when ``encode_data`` is set.
        y : array-like, DataFrame or None, default=None
            Optional target; encoded when given and ``encode_data`` is set.

        Returns
        -------
        tuple
            ``(new_X, y)`` where ``new_X`` holds one column block per
            selected feature constructor.
        """
        check_is_fitted(self)
        if isinstance(y, pd.DataFrame):
            y = y.to_numpy()
        if self.encode_data:
            X = self.feature_encoder_.transform(X)
            if y is not None:
                y = self.class_encoder_.transform(y)
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        new_X = [
            feature_constructor.transform(X)
            for feature_constructor in self.final_feature_constructors
        ]
        return np.concatenate(new_X, axis=1), y
def explore(self, X, y, graph, random_generator, parallel, max_errors=0):
    '''
    Search method that follows the following steps:
        1. The initial node is connected to all the others (roulette wheel
           selection is performed)
        2. There are 2 type of nodes (corresponding to an original feature
           (2.1) or corresponding to a value of a feature (2.2)):
            2.1. If the selected node is an original feature we add it to
                 the selected subset and go to step 3.
            2.2. If the selected node is part of a logical feature then we
                 select another node (the CONSTRUCTION step will not return
                 full original features)
        3. Compute the score
            3.1. If it improves the previous one
                3.1.1 Add the feature to the current subset
                3.1.2 Update the score
                3.1.3 Select another node (SELECTION step)
                3.1.4 Go to step 2
            3.2. If not, the exploration ends

    Note: Threading does not speed up the calculations as they are CPU bound
    and in python only I/O operations will benefit from this parallelism
    GPU improvement would reduce the time of the exploration.

    Parameters
    ----------
    X : ndarray
        Training data, indexed by column.
    y : array-like
        Target values.
    graph : object
        Provides ``get_original_ids``, ``get_initial_nodes`` and
        ``get_neighbours``.
    random_generator : object
        Passed to ``self.choose_next`` for node selection.
    parallel : bool
        Compute the CONSTRUCTION heuristics in a thread pool.
    max_errors : int, default=0
        Number of non-improving evaluations tolerated before stopping.

    Returns
    -------
    The leave-one-out score of the final subset (also stored in
    ``self.final_score``). ``self.step`` and ``self.current_features``
    are set as well.
    '''
    # The subset is only re-evaluated once every ``self.step`` additions.
    self.step = math.ceil(math.log2(X.shape[1]))
    self.current_features = []
    selected_nodes = set()
    constructed_nodes = set()
    classifier = NaiveBayes(encode_data=False, metric=self.metric)
    # ``np.NINF`` was removed in NumPy 2.0; ``-np.inf`` is the same value.
    current_score = -np.inf
    score = 0
    if self.use_initials:
        self.current_features = [
            DummyFeatureConstructor(j) for j in range(X.shape[1])
        ]
        classifier.fit(X, y)
        current_transformed_features_numpy = np.concatenate(
            [f.transform(X) for f in self.current_features], axis=1)
        score = self.evaluate_loo(self.current_features, classifier,
                                  current_transformed_features_numpy, y)
        current_score = score
        selected_nodes.update(graph.get_original_ids())
    if len(self.current_features) == 0:
        current_transformed_features_numpy = None
    initial, pheromones, heuristics = graph.get_initial_nodes(selected_nodes)
    probabilities = self.compute_probability(pheromones, heuristics)
    index = self.choose_next(probabilities, random_generator)
    node_id, selected_node = initial[index]
    # SU variable contains the MIFS-SU for the selected variable
    current_su = 0
    su = heuristics[index]
    is_fitted = self.use_initials
    feature_constructor = None
    n_errors = 0
    number_steps = 1
    while True:
        current_score = score
        if selected_node[1] is None:
            # Original Feature
            feature_constructor = DummyFeatureConstructor(selected_node[0])
            selected_nodes.add(node_id)
        else:
            # Need to construct next feature and compute heuristic value
            # for the feature to replace temporal su from half-var
            neighbours, pheromones = graph.get_neighbours(
                selected_node, constructed_nodes, step="CONSTRUCTION")
            if len(neighbours) == 0:
                break
            if self.beta != 0:
                if parallel:
                    with concurrent.futures.ThreadPoolExecutor(
                    ) as executor:
                        futures = []
                        for neighbour in neighbours:
                            futures.append(
                                executor.submit(
                                    self.compute_neighbour_sufs,
                                    neighbour=neighbour,
                                    transformed_features=
                                    current_transformed_features_numpy,
                                    constructors=self.current_features,
                                    selected_node=selected_node,
                                    current_su=current_su,
                                    X=X,
                                    y=y))
                        concurrent.futures.wait(
                            futures,
                            timeout=None,
                            return_when='ALL_COMPLETED')
                    su = [future.result() for future in futures]
                else:
                    su = [
                        self.compute_neighbour_sufs(
                            neighbour=neighbour,
                            transformed_features=
                            current_transformed_features_numpy,
                            selected_node=selected_node,
                            constructors=self.current_features,
                            current_su=current_su,
                            X=X,
                            y=y) for neighbour in neighbours
                    ]
            else:
                # Avoid unnecessary evaluation
                su = np.ones(len(neighbours))
            probabilities = self.compute_probability(
                pheromones, np.array(su))
            index = self.choose_next(probabilities, random_generator)
            su = su[index]
            feature_constructor = create_feature(
                neighbours[index][2],
                [selected_node, neighbours[index][1]])
            constructed_nodes.add(
                frozenset(
                    (node_id, neighbours[index][0], neighbours[index][2])))
            node_id, selected_node = neighbours[index][:2]
        # Assess new feature
        transformed_feature = feature_constructor.transform(X)
        if is_fitted:
            classifier.add_features(transformed_feature, y)
        else:
            classifier.fit(transformed_feature, y)
            is_fitted = True
        if current_transformed_features_numpy is None:
            current_transformed_features_numpy = transformed_feature
        else:
            current_transformed_features_numpy = append_column_to_numpy(
                current_transformed_features_numpy, transformed_feature)
        if number_steps >= self.step:
            score = self.evaluate_loo(
                self.current_features + [feature_constructor], classifier,
                current_transformed_features_numpy, y)
            if score <= current_score:
                if n_errors >= max_errors:
                    break
                else:
                    n_errors += 1
            else:
                n_errors = 0
            # NOTE(review): the step counter is reset after every
            # evaluation, improving or not — confirm against the intended
            # behaviour when max_errors > 0.
            number_steps = 0
        else:
            number_steps += 1
        current_su = su
        self.current_features.append(feature_constructor)
        current_score = score
        # Select next
        neighbours, pheromones = graph.get_neighbours(selected_node,
                                                      selected_nodes,
                                                      step="SELECTION")
        # Compute heuristic
        su = []
        if len(neighbours) == 0:
            break
        if self.beta != 0:
            for neighbour, pheromone in zip(neighbours, pheromones):
                if neighbour[1][1] is None:
                    # Original variable
                    su.append(
                        self.compute_sufs_cached(
                            current_su,
                            current_transformed_features_numpy,
                            X[:, neighbour[1][0]],
                            self.current_features,
                            DummyFeatureConstructor(neighbour[1][0]),
                            y,
                            minimum=0))
                else:
                    # This is a temporal variable that will not be finally
                    # selected but only used to calculate the heuristic
                    su.append(
                        self.compute_sufs_cached(
                            current_su,
                            current_transformed_features_numpy,
                            X[:, neighbour[1][0]] == neighbour[1][1],
                            self.current_features,
                            FeatureOperand(feature_index=neighbour[1][0],
                                           value=neighbour[1][1]),
                            y,
                            minimum=0))
        else:
            su = np.ones(len(neighbours))
        probabilities = self.compute_probability(pheromones, np.array(su))
        index = self.choose_next(probabilities, random_generator)
        su = su[index]
        node_id, selected_node = neighbours[index][:2]
    # A feature rejected by the last evaluation is still the last column
    # of the data: drop it so data and feature list line up again.
    if current_transformed_features_numpy.shape[1] > len(
            self.current_features):
        current_transformed_features_numpy = np.delete(
            current_transformed_features_numpy, -1, axis=1)
    self.final_score = self.evaluate_loo(
        self.current_features, classifier,
        current_transformed_features_numpy, y)
    return self.final_score