def fit(self, X, y):
    """Run the evolutionary search and train the final Naive Bayes model.

    Args:
        X: Training samples. When it is a ``pandas.DataFrame`` its column
            labels are kept in ``self.categories_``.
        y: Training labels.

    Returns:
        self, with ``best_features`` (the result of ``execute_algorithm``)
        and ``classifier_`` (a ``NaiveBayes`` fitted on the transformed
        data) set.
    """
    self.feature_encoder_ = CustomOrdinalFeatureEncoder()
    self.class_encoder_ = CustomLabelEncoder()

    # Always (re)define the attribute: fitting on a plain array must not
    # leave it missing, nor keep the columns of a previous DataFrame fit.
    self.categories_ = X.columns if isinstance(X, pd.DataFrame) else None

    if self.encode_data:
        X = self.feature_encoder_.fit_transform(X)
        y = self.class_encoder_.fit_transform(y)

    self.n_features = X.shape[1]
    if self.encode_data:
        # The fitted encoder already knows the categories of every feature.
        self.unique_values = [
            values.shape[0] for values in self.feature_encoder_.categories_
        ]
    else:
        self.unique_values = [
            np.unique(X[:, j]).shape[0] for j in range(X.shape[1])
        ]

    # Seed both generators so the search is reproducible.
    random.seed(self.seed)
    np.random.seed(self.seed)

    self.size = np.ceil(np.sqrt(X.shape[1]))
    self.best_features = self.execute_algorithm(X, y)

    # Train the final model on the data produced by the selected features.
    self.classifier_ = NaiveBayes(encode_data=False, metric=self.metric)
    self.classifier_.fit(
        np.concatenate(
            [feature.transform(X) for feature in self.best_features],
            axis=1), y)
    return self
def test_incremental_validation(X=None, y=None, iterations=10, verbose=1):
    """Check that every leave-one-out evaluation path gives the same score.

    For ``iterations`` rounds the function times and compares:
      * ``NaiveBayes.leave_one_out_cross_val`` (incremental validation),
      * ``cross_leave_one_out`` with the encoding classifier,
      * ``cross_leave_one_out`` with pre-encoded data and the classifier
        that does not encode.

    Args:
        X: Samples. When ``None`` a synthetic dataset is generated.
        y: Labels matching ``X``.
        iterations: Number of timing rounds.
        verbose: When truthy the current round is printed.

    Raises:
        AssertionError: If the scores of a round are not all equal.
    """
    # `if not X` raises "truth value of an array is ambiguous" for any
    # caller-supplied ndarray, so compare against None explicitly.
    if X is None:
        X, y = make_classification(n_samples=500,
                                   n_features=1000,
                                   n_informative=20,
                                   n_redundant=1,
                                   n_repeated=0,
                                   n_classes=2,
                                   n_clusters_per_class=2,
                                   weights=None,
                                   class_sep=1,
                                   hypercube=False,
                                   scale=1.0,
                                   shuffle=True,
                                   random_state=0)
        X //= 10  # --> To be able to evaluate categoricalNB

    # classifiers
    nb_classifier = NaiveBayes(encode_data=True)
    nb_classifier_no_encoding = NaiveBayes(encode_data=False)
    custom_encoder = CustomOrdinalFeatureEncoder()

    # accumulators
    # NOTE(review): nothing in this function ever fills `categorical_nb`,
    # so the first summary line below always reports nan.
    categorical_nb = []
    custom_nb_val_1 = []
    custom_nb_val_3 = []
    custom_nb_val_4 = []

    for i in range(iterations):
        if verbose:
            print(f"Iteration {i}")

        # Incremental leave-one-out implemented by the classifier itself.
        ts = time()
        score_2 = nb_classifier.leave_one_out_cross_val(X, y)
        custom_nb_val_1.append(time() - ts)

        # Generic leave-one-out, the classifier encodes on every fit.
        ts = time()
        score_4 = cross_leave_one_out(nb_classifier, X, y)
        custom_nb_val_3.append(time() - ts)

        # Generic leave-one-out on data encoded once up front (the
        # encoding time is part of the measurement).
        ts = time()
        X2 = custom_encoder.fit_transform(X)
        score_5 = cross_leave_one_out(nb_classifier_no_encoding, X2, y)
        custom_nb_val_4.append(time() - ts)

        if i == 0:
            # Reference score that every later round is compared against.
            score_1 = score_2
        scores = [score_1, score_2, score_4, score_5]
        assert all(score == scores[0] for score in scores)

    # The first round is excluded from every average.
    print("Categorical with scikit loo: ", np.mean(categorical_nb[1:]))
    print("Custom with scikit loo: ", np.mean(custom_nb_val_3[1:]))
    print("Custom with scikit loo (pre-encoding): ",
          np.mean(custom_nb_val_4[1:]))
    print("Custom with first incremental: ", np.mean(custom_nb_val_1[1:]))
def test_remove_feature():
    """Removing column 0 in place must equal refitting without column 0."""
    dataset_options = dict(n_samples=1000,
                           n_features=100,
                           n_informative=2,
                           n_redundant=0,
                           n_repeated=0,
                           n_classes=2,
                           n_clusters_per_class=1,
                           weights=None,
                           class_sep=1.0,
                           hypercube=True,
                           scale=2.0,
                           shuffle=True,
                           random_state=0)
    X, y = make_classification(**dataset_options)

    def without_first_column():
        # A fresh array on every call, exactly one per use.
        return np.delete(X, 0, axis=1)

    model = CustomNaiveBayes(encode_data=True)

    # Path 1: fit on everything, then drop the first feature incrementally.
    model.fit(X, y)
    model.remove_feature(0)
    independent_after_removal = model.indepent_term_
    counts_after_removal = model.smoothed_log_counts_
    proba_after_removal = model.predict_proba(without_first_column())

    # Path 2: fit from scratch on the data without the first feature.
    model.fit(without_first_column(), y)
    proba_after_refit = model.predict_proba(without_first_column())

    assert np.allclose(model.smoothed_log_counts_, counts_after_removal)
    assert np.allclose(model.indepent_term_, independent_after_removal)
    assert np.allclose(proba_after_refit, proba_after_removal)
def test_add_features_with_index():
    """Adding columns at given positions must equal fitting on all columns."""
    dataset_options = dict(n_samples=1000,
                           n_features=100,
                           n_informative=2,
                           n_redundant=0,
                           n_repeated=0,
                           n_classes=2,
                           n_clusters_per_class=1,
                           weights=None,
                           class_sep=1.0,
                           hypercube=True,
                           scale=2.0,
                           shuffle=True,
                           random_state=0)
    X, y = make_classification(**dataset_options)
    pristine = X.copy()
    positions = [0, 8, 9, 20]
    reduced = np.delete(pristine, positions, axis=1)

    model = CustomNaiveBayes(encode_data=True)

    # Path 1: fit without the columns, then insert them at their positions.
    model.fit(reduced, y)
    model.add_features(pristine[:, positions], y, index=positions)
    independent_after_add = model.indepent_term_
    counts_after_add = model.smoothed_log_counts_
    proba_after_add = model.predict_proba(X)

    # Path 2: fit from scratch on the complete data.
    model.fit(X, y)
    proba_after_refit = model.predict_proba(X)

    assert np.allclose(model.indepent_term_, independent_after_add)
    assert np.allclose(model.smoothed_log_counts_, counts_after_add)
    assert np.allclose(proba_after_refit, proba_after_add)
def time_comparison(combinations=None, n_iterations=15, verbose=1, seed=200):
    """Benchmark fit/predict time of several Naive Bayes implementations.

    For every ``(n_samples, n_features)`` pair a synthetic dataset is
    generated and discretized, and each classifier is evaluated
    ``n_iterations`` times through ``evaluate`` (train and test data are
    the same arrays). Per-classifier rows are appended through
    ``update_df`` and the accumulated table is written to ``backup.csv``
    after every pair.

    Args:
        combinations: Sized iterable of ``(n_samples, n_features)`` pairs.
            ``None`` selects the built-in grid.
        n_iterations: Number of evaluations per classifier and pair.
        verbose: When truthy the progress bar is advanced and annotated.
        seed: ``random_state`` handed to ``make_classification``.

    Returns:
        pandas.DataFrame with one row per classifier and pair (duplicates
        on classifier/n_samples/n_features dropped). Empty, but with the
        right columns, when ``combinations`` is empty.
    """
    column_names = [
        "Classifier", "n_samples", "n_features", "Average Fit Time",
        "STD Fit Time", "Average Predict Time", "STD Predict Time", "Score"
    ]
    results = []
    if combinations is None:
        columns = range(10, 40010, 5000)
        rows = [10, 100, 1000]
        combinations = list(product(rows, columns)) + list(
            product(columns, rows))
        combinations += list(product([10, 100, 1000], [500000]))
        combinations += list(product([500000], [10, 100, 1000]))

    clf_no_encoding = NaiveBayes(encode_data=False, alpha=1)
    clf_encoding = NaiveBayes(encode_data=True, alpha=1, discretize=False)
    clf_categorical_sklearn = CategoricalNB(alpha=1)
    clf_gaussian_sklearn = GaussianNB()

    # Estimators in the order they are evaluated within one iteration.
    evaluation_order = [
        ("Gaussian", clf_gaussian_sklearn),
        ("Categorical", clf_categorical_sklearn),
        ("Custom without encoding", clf_no_encoding),
        ("Custom with encoding", clf_encoding),
    ]
    # Order in which the rows are appended to the result table.
    report_order = ("Gaussian", "Categorical", "Custom with encoding",
                    "Custom without encoding")

    # Defined up front so that an empty `combinations` returns an empty
    # table instead of failing on an unbound name.
    results_df = pd.DataFrame(results, columns=column_names)

    progress_bar = tqdm(total=len(combinations),
                        bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}')
    X = y = None
    try:
        for n_samples, n_features in combinations:
            if verbose:
                progress_bar.set_postfix({
                    "n_samples": n_samples,
                    "n_features": n_features
                })
                progress_bar.update(1)
                progress_bar.refresh()

            # Drop the previous dataset before allocating the next one.
            # No other name aliases it, so the memory really is released.
            X = y = None
            X, y = make_classification(n_samples=n_samples,
                                       n_features=n_features,
                                       flip_y=0.01,
                                       class_sep=1.0,
                                       hypercube=True,
                                       shift=0.0,
                                       scale=2.0,
                                       shuffle=True,
                                       random_state=seed)
            X = make_discrete(X, m=1)

            stats = {
                label: {"fit": [], "predict": [], "score": [], "errors": 0}
                for label, _ in evaluation_order
            }
            for _ in range(n_iterations):
                for label, clf in evaluation_order:
                    entry = stats[label]
                    # Train and test on the same data: only time matters.
                    entry["errors"] += evaluate(X, y, X, y, clf,
                                                entry["fit"],
                                                entry["predict"],
                                                entry["score"])

            for label in report_order:
                entry = stats[label]
                update_df(results, label, n_samples, n_features,
                          entry["fit"], entry["predict"], entry["score"],
                          entry["errors"])

            # Checkpoint after every pair so a crash keeps partial results.
            results_df = pd.DataFrame(results, columns=column_names)
            results_df.drop_duplicates(
                ["Classifier", "n_samples", "n_features"], inplace=True)
            results_df.to_csv("backup.csv")
    finally:
        progress_bar.close()
    return results_df
def acfs_score_comparison(datasets,
                          seed,
                          base_path,
                          params,
                          n_splits=3,
                          n_repeats=5,
                          n_intervals=5,
                          metric="accuracy",
                          send_email=False,
                          email_data=None,
                          verbose=True):
    """Compare ACFCS against plain Naive Bayes on several datasets.

    Each dataset is split with ``RepeatedStratifiedKFold``; on every split
    the data is encoded, Naive Bayes is scored once and ACFCS is fitted and
    scored once per configuration in ``params``. The feature graph is only
    rebuilt for the first configuration of a split.

    Args:
        datasets: Iterable of ``(name, label)`` pairs.
        seed: ``random_state`` of the cross-validation splitter.
        base_path: Prefix joined with ``name`` to locate a dataset;
            datasets whose path does not exist are skipped.
        params: Sequence of keyword dicts passed to ``ACFCS.set_params``.
        n_splits: Folds per repetition.
        n_repeats: Number of repetitions.
        n_intervals: Passed to the feature encoder and to Naive Bayes.
        metric: Metric name passed to both classifiers.
        send_email: When true the table is handed to ``send_results``.
        email_data: Forwarded to ``send_results``; ``None`` means ``{}``.
        verbose: When truthy a per-split progress bar is shown.

    Returns:
        pandas.DataFrame with one row per (dataset, configuration) holding
        the metrics averaged over all splits.
    """
    # A `dict()` default would be shared between calls.
    if email_data is None:
        email_data = {}

    # List to store results and column names for the csv
    result = []
    columns = [
        "Database", "Number of attributes", "NBScore", "NBScore STD",
        "ACFCS Score", "ACFCS Score STD", "Configuration", "Nodes",
        "Contruction Matrix", "Selection Matrix", "Selected_attributes",
        "Original"
    ]
    dataset_tqdm = tqdm(datasets)

    # Instantiate the classifiers
    acfcs = ACFCS(verbose=0, metric=metric)
    nb = NaiveBayes(encode_data=False, n_intervals=n_intervals, metric=metric)

    n_runs = n_splits * n_repeats

    # Execute algorithm on datasets
    for database in dataset_tqdm:
        name, label = database
        if not os.path.exists(base_path + name):
            print(f"{name} doesnt' exist")
            continue

        # Assume UCI REPO like data
        test = f"{name}.test.csv"
        data = f"{name}.data.csv"
        X, y = get_X_y_from_database(base_path, name, data, test, label)

        # Update progressbar
        dataset_tqdm.set_postfix({"DATABASE": name})

        # One (configuration x split) matrix per tracked metric
        shape = (len(params), n_runs)
        nb_score = np.zeros(shape=shape)
        acfcs_score = np.zeros(shape=shape)
        acfcs_selection_matrix = np.zeros(shape=shape)
        acfcs_construction_matrix = np.zeros(shape=shape)
        acfcs_nodes = np.zeros(shape=shape)
        acfcs_dummy = np.zeros(shape=shape)
        acfcs_selected = np.zeros(shape=shape)

        # Create splits for the experiments
        rskf = RepeatedStratifiedKFold(n_splits=n_splits,
                                       n_repeats=n_repeats,
                                       random_state=seed)
        splits = rskf.split(X, y)
        seed_tqdm = tqdm(splits,
                         leave=False,
                         total=n_runs,
                         bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}'
                         ) if verbose else splits

        # Execute experiments
        for i, (train_index, test_index) in enumerate(seed_tqdm):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Encode the data (encoders are fitted on the training part)
            feature_encoder = CustomOrdinalFeatureEncoder(
                n_intervals=n_intervals)
            X_train = feature_encoder.fit_transform(X_train)
            X_test = feature_encoder.transform(X_test)
            label_encoder = CustomLabelEncoder()
            y_train = label_encoder.fit_transform(y_train)
            y_test = label_encoder.transform(y_test)

            # The baseline does not depend on the configuration
            nb.fit(X_train, y_train)
            naive_bayes_score = nb.score(X_test, y_test)

            acfcs.reset_cache()
            for conf_index, conf in enumerate(params):
                acfcs.set_params(**conf)
                # Only the first configuration of a split builds the graph
                acfcs.fit(X_train, y_train, init_graph=conf_index == 0)

                # score
                acfcs_score_conf = acfcs.score(X_test, y_test)
                if verbose:
                    seed_tqdm.set_postfix({
                        "config": conf_index,
                        "nb_score": naive_bayes_score,
                        "ant_score": acfcs_score_conf
                    })

                # Get data
                n_original_features = sum(
                    isinstance(feature, DummyFeatureConstructor)
                    for feature in acfcs.best_features)

                # Update
                nb_score[conf_index, i] = naive_bayes_score
                acfcs_score[conf_index, i] = acfcs_score_conf
                acfcs_selection_matrix[conf_index, i] = len(
                    acfcs.afg.pheromone_selection)
                acfcs_construction_matrix[conf_index, i] = len(
                    acfcs.afg.pheromone_construction)
                acfcs_nodes[conf_index, i] = len(acfcs.afg.nodes)
                acfcs_dummy[conf_index, i] = n_original_features
                acfcs_selected[conf_index, i] = len(acfcs.best_features)

        # Insert the final result - averaged metrics for this database.
        for conf_index, conf in enumerate(params):
            result.append([
                name, X.shape[1],
                np.mean(nb_score[conf_index]),
                np.std(nb_score[conf_index]),
                np.mean(acfcs_score[conf_index]),
                np.std(acfcs_score[conf_index]), conf,
                np.mean(acfcs_nodes[conf_index]),
                np.mean(acfcs_construction_matrix[conf_index]),
                np.mean(acfcs_selection_matrix[conf_index]),
                np.mean(acfcs_selected[conf_index]),
                np.mean(acfcs_dummy[conf_index])
            ])

    result = pd.DataFrame(result, columns=columns)
    if send_email:
        from tfg.utils import send_results
        send_results("ACFCS", email_data, result)
    return result
def filter_features(self, X, y):
    '''Greedy wrapper search over the already computed rank.

    Feature constructors are taken from ``self.rank`` in blocks of
    ``self.block_size``. A block is kept when adding it improves the
    leave-one-out score of the incremental Naive Bayes classifier,
    otherwise it is removed from the classifier again.

    When ``self.use_initials`` is set, the search starts from a deep copy
    of ``self.initial_backward_features`` and skips the
    ``DummyFeatureConstructor`` entries of the rank. Otherwise the first
    block is always accepted as the starting point.

    The search ends when the rank is exhausted, when ``max_iterations`` is
    reached, when another block would exceed ``max_features`` or, for the
    "eager" strategy, after more than ``max_err`` consecutive blocks
    without improvement.

    Args:
        X: Encoded training samples.
        y: Encoded training labels.

    Returns:
        self, with ``final_feature_constructors`` set to the kept features.
    '''
    check_is_fitted(self)
    self.classifier = NaiveBayes(encode_data=False,
                                 n_intervals=self.n_intervals,
                                 metric=self.metric)
    # `np.NINF` was removed in NumPy 2.0; `-np.inf` works everywhere.
    current_score = -np.inf
    current_features = []
    current_data = None
    if self.use_initials:
        # Original Features have already been taken into account
        rank_iter = filter(
            lambda x: not isinstance(self.all_feature_constructors[x],
                                     DummyFeatureConstructor),
            iter(self.rank))
        # Deep copy to avoid issues when modifying the list
        current_features = deepcopy(self.initial_backward_features)
        current_data = np.concatenate(
            [f.transform(X) for f in current_features], axis=1)
        # Get initial LOO score
        current_score = self.evaluate_leave_one_out_cross_val(
            self.classifier, current_features, current_data, y, fit=True)
    else:
        # Iterator over the sorted list of indexes
        rank_iter = iter(self.rank)

    if self.verbose:
        progress_bar = tqdm(total=len(self.rank),
                            bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}')

    iteration = 0
    iterations_without_improvements = 0

    def budget_exhausted():
        # Reads the live values of `iteration` / `current_features`.
        return (self.max_iterations <= iteration
                or (len(current_features) + self.block_size)
                > self.max_features)

    # Loop for including {block size} elements at a time.
    # `rank_iter` is also advanced inside the body, so one pass of this
    # loop consumes a whole block of the rank.
    for feature_constructor_index in rank_iter:
        iteration += 1
        if self.verbose:
            progress_bar.set_postfix({
                "n_features": len(current_features),
                "score": current_score
            })
            progress_bar.update(1)
            progress_bar.refresh()

        # Add block size features
        first = self.all_feature_constructors[feature_constructor_index]
        new_X = [first.transform(X)]
        selected_features = [first]
        for _ in range(self.block_size - 1):
            try:
                index = next(rank_iter)
            except StopIteration:
                # Block size does not divide the number of elements in
                # the rank: evaluate the shorter, final block. Only the
                # exhausted iterator is handled here, so a failing
                # transform is no longer silently swallowed.
                break
            constructor = self.all_feature_constructors[index]
            selected_features.append(constructor)
            new_X.append(constructor.transform(X))
            if self.verbose:
                progress_bar.update(1)
                progress_bar.refresh()

        # Evaluate features
        new_X = np.concatenate(new_X, axis=1)
        if iteration == 1 and not self.use_initials:
            # Nothing to compare against yet: the first block is the start.
            current_data = new_X
            current_score = self.evaluate_leave_one_out_cross_val(
                self.classifier,
                selected_features,
                current_data,
                y,
                fit=True)
            current_features = selected_features
            if budget_exhausted():
                break
            continue

        data = np.concatenate([current_data, new_X], axis=1)
        self.classifier.add_features(new_X, y)
        # LOO evaluation
        score = self.evaluate_leave_one_out_cross_val(
            self.classifier,
            current_features + selected_features,
            data,
            y,
            fit=False)
        if score > current_score:
            current_score = score
            current_data = data
            current_features.extend(selected_features)
            iterations_without_improvements = 0
        else:
            iterations_without_improvements += 1
            # Remove last added block, from the last column backwards
            for feature_index_to_remove in range(
                    data.shape[1], data.shape[1] - new_X.shape[1], -1):
                self.classifier.remove_feature(feature_index_to_remove - 1)
            if (self.strategy == "eager"
                    and self.max_err < iterations_without_improvements):
                # Stops as soon as no improvement
                break

        if budget_exhausted():
            break

    if self.verbose:
        progress_bar.close()
        print(
            f"\nFinal number of included features: {len(current_features)} - Final Score: {current_score}"
        )
    self.final_feature_constructors = current_features
    return self
def scoring_comparison(base_path,
                       datasets,
                       verbose=1,
                       test_size=0.3,
                       seed=None,
                       n_iterations=30):
    """Compare train/test scores of the custom NaiveBayes and CategoricalNB.

    Every dataset is encoded once and then split ``n_iterations`` times
    with ``train_test_split``; both classifiers are fitted on each split
    and their mean train and test scores are reported.

    Args:
        base_path: Location handed to ``get_X_y_from_database``.
        datasets: Iterable of ``(dataset_name, label)`` pairs.
        verbose: When truthy the progress bar shows dataset and iteration.
        test_size: Fraction of samples held out for testing.
        seed: Base random state; iteration ``k`` uses ``seed + k``. With
            ``None`` every split is drawn with ``random_state=None``.
        n_iterations: Number of random splits per dataset.

    Returns:
        pandas.DataFrame with one row per dataset.
    """
    column_names = [
        "dataset", "custom_training_score", "custom_test_score",
        "categorical_training_score", "categorical_test_score"
    ]
    data = []
    clf_custom = NaiveBayes(encode_data=True)
    clf_categorical_sklearn = CategoricalNB()
    datasets_iter = tqdm(datasets,
                         bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}')
    feature_encoder = CustomOrdinalFeatureEncoder()
    label_encoder = CustomLabelEncoder()

    for dataset in datasets_iter:
        dataset_name, label = dataset
        data_filename = f"{dataset_name}.data.csv"
        test_filename = f"{dataset_name}.test.csv"
        X, y = get_X_y_from_database(base_path=base_path,
                                     name=dataset_name,
                                     data=data_filename,
                                     test=test_filename,
                                     label=label)
        custom_train = []
        custom_test = []
        sklearn_train = []
        sklearn_test = []

        X = feature_encoder.fit_transform(X)
        y = label_encoder.fit_transform(y)
        for iteration in range(n_iterations):
            if verbose:
                datasets_iter.set_postfix({
                    "Dataset": dataset_name,
                    "seed": iteration
                })
                datasets_iter.refresh()

            # The default seed=None used to fail on `seed + iteration`,
            # in the stratified attempt and again in the fallback.
            random_state = None if seed is None else seed + iteration
            try:
                X_train, X_test, y_train, y_test = train_test_split(
                    X,
                    y,
                    test_size=test_size,
                    random_state=random_state,
                    shuffle=True,
                    stratify=y)
            except ValueError:
                # Not enough values to stratify y
                X_train, X_test, y_train, y_test = train_test_split(
                    X,
                    y,
                    test_size=test_size,
                    random_state=random_state,
                    shuffle=True)

            # Fit
            clf_custom.fit(X_train, y_train)
            # CategoricalNB must know categories that only show up in the
            # test part, hence the maximum over both parts per column.
            clf_categorical_sklearn.min_categories = [
                1 + np.max(np.concatenate([X_train[:, j], X_test[:, j]]))
                for j in range(X_train.shape[1])
            ]
            clf_categorical_sklearn.fit(X_train, y_train)

            # Predict
            custom_train.append(clf_custom.score(X_train, y_train))
            custom_test.append(clf_custom.score(X_test, y_test))
            sklearn_train.append(
                clf_categorical_sklearn.score(X_train, y_train))
            sklearn_test.append(clf_categorical_sklearn.score(X_test, y_test))

        data.append([
            dataset_name,
            np.mean(custom_train),
            np.mean(custom_test),
            np.mean(sklearn_train),
            np.mean(sklearn_test)
        ])
    return pd.DataFrame(data, columns=column_names)
def ranker_score_comparison(datasets,
                            seed,
                            base_path,
                            params,
                            n_splits=3,
                            n_repeats=5,
                            n_intervals=5,
                            metric="accuracy",
                            send_email=False,
                            email_data=None,
                            share_rank=True):
    """Compare RankerLogicalFeatureConstructor against plain Naive Bayes.

    Each dataset is split with ``RepeatedStratifiedKFold``; on every split
    Naive Bayes is scored once and the ranker once per configuration in
    ``params``.

    Args:
        datasets: Iterable of ``(name, label)`` pairs.
        seed: ``random_state`` of the cross-validation splitter.
        base_path: Prefix joined with ``name`` to locate a dataset;
            datasets whose path does not exist are skipped.
        params: Sequence of keyword dicts passed to ``set_params``.
        n_splits: Folds per repetition.
        n_repeats: Number of repetitions.
        n_intervals: Passed to the encoder and to both classifiers.
        metric: Metric name passed to both classifiers.
        send_email: When true the table is handed to ``send_results``.
        email_data: Forwarded to ``send_results``; ``None`` means ``{}``.
        share_rank: When true only the first configuration of a split
            fits the ranker; later ones just rerun ``filter_features`` on
            the rank computed by that fit.

    Returns:
        pandas.DataFrame with one row per (dataset, configuration) holding
        the metrics averaged over all splits.
    """
    # A `dict()` default would be shared between calls.
    if email_data is None:
        email_data = {}

    result = []
    columns = [
        "Database", "Number of attributes", "NBScore", "NBScore STD",
        "Ranker Score", "Ranker Score STD", "Configuration", "Combinations",
        "Selected_attributes", "Original"
    ]
    dataset_tqdm = tqdm(datasets)

    # Instantiate the classifiers
    r = RankerLogicalFeatureConstructor(n_intervals=n_intervals,
                                        metric=metric)
    nb = NaiveBayes(encode_data=False, n_intervals=n_intervals, metric=metric)

    n_runs = n_splits * n_repeats

    # Execute algorithm on datasets
    for database in dataset_tqdm:
        name, label = database
        if not os.path.exists(base_path + name):
            print(f"{name} doesnt' exist")
            continue

        # Assume UCI REPO like data
        test = f"{name}.test.csv"
        data = f"{name}.data.csv"
        X, y = get_X_y_from_database(base_path, name, data, test, label)
        dataset_tqdm.set_postfix({"DATABASE": name})

        # One (configuration x split) matrix per reported metric
        shape = (len(params), n_runs)
        nb_score = np.zeros(shape=shape)
        r_score = np.zeros(shape=shape)
        r_combinations = np.zeros(shape=shape)
        r_selected = np.zeros(shape=shape)
        r_dummy = np.zeros(shape=shape)

        rskf = RepeatedStratifiedKFold(n_splits=n_splits,
                                       n_repeats=n_repeats,
                                       random_state=seed)
        seed_tqdm = tqdm(rskf.split(X, y),
                         leave=False,
                         total=n_runs,
                         bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}')

        for i, (train_index, test_index) in enumerate(seed_tqdm):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Encoders are fitted on the training part only
            feature_encoder = CustomOrdinalFeatureEncoder(
                n_intervals=n_intervals)
            X_train = feature_encoder.fit_transform(X_train)
            X_test = feature_encoder.transform(X_test)
            label_encoder = CustomLabelEncoder()
            y_train = label_encoder.fit_transform(y_train)
            y_test = label_encoder.transform(y_test)

            # Assess the classifiers
            nb.fit(X=X_train, y=y_train)
            naive_bayes_score = nb.score(X_test, y_test)

            for conf_index, conf in enumerate(params):
                seed_tqdm.set_postfix({"config": conf_index})
                r.set_params(**conf)

                # Fit
                if conf_index == 0 or not share_rank:
                    # The rank is computed from scratch
                    r.fit(X_train, y_train)
                else:
                    # Reuse the rank; only redo the wrapper search
                    r.filter_features(
                        r.feature_encoder_.transform(X_train),
                        r.class_encoder_.transform(y_train))

                # score
                ranker_score = r.score(X_test, y_test)

                # Get data
                n_original_features = sum(
                    isinstance(feature, DummyFeatureConstructor)
                    for feature in r.final_feature_constructors)

                # Update
                nb_score[conf_index, i] = naive_bayes_score
                r_score[conf_index, i] = ranker_score
                r_combinations[conf_index, i] = len(
                    r.all_feature_constructors)
                r_selected[conf_index, i] = len(r.final_feature_constructors)
                r_dummy[conf_index, i] = n_original_features

        # Insert to final result averaged metrics for this dataset
        for conf_index, conf in enumerate(params):
            result.append([
                name, X.shape[1],
                np.mean(nb_score[conf_index]),
                np.std(nb_score[conf_index]),
                np.mean(r_score[conf_index]),
                np.std(r_score[conf_index]), conf,
                np.mean(r_combinations[conf_index]),
                np.mean(r_selected[conf_index]),
                np.mean(r_dummy[conf_index])
            ])

    result = pd.DataFrame(result, columns=columns)
    if send_email:
        from tfg.utils import send_results
        send_results("RANKER", email_data, result)
    return result
def genetic_score_comparison(datasets,
                             seed,
                             base_path,
                             params,
                             n_splits=3,
                             n_repeats=5,
                             n_intervals=5,
                             metric="accuracy",
                             send_email=False,
                             email_data=None,
                             verbose=True,
                             version=1):
    """Compare a genetic feature constructor against plain Naive Bayes.

    Each dataset is split with ``RepeatedStratifiedKFold``; on every split
    Naive Bayes is scored once and the genetic classifier is fitted and
    scored once per configuration in ``params``.

    Args:
        datasets: Iterable of ``(name, label)`` pairs.
        seed: Seed of the genetic classifier and of the splitter.
        base_path: Prefix joined with ``name`` to locate a dataset;
            datasets whose path does not exist are skipped.
        params: Sequence of keyword dicts passed to ``set_params``.
        n_splits: Folds per repetition.
        n_repeats: Number of repetitions.
        n_intervals: Passed to the feature encoder and to Naive Bayes.
        metric: Metric name passed to both classifiers.
        send_email: When true the table is handed to ``send_results``.
        email_data: Forwarded to ``send_results``; ``None`` means ``{}``.
        verbose: When truthy a per-split progress bar is shown.
        version: 1 or 2 select ``GeneticProgrammingFlexibleLogic``, any
            other value ``GeneticProgrammingRankMutation``.

    Returns:
        pandas.DataFrame with one row per (dataset, configuration) holding
        the metrics averaged over all splits.
    """
    # A `dict()` default would be shared between calls.
    if email_data is None:
        email_data = {}

    result = []
    columns = [
        "Database", "Number of attributes", "NBScore", "NBScore STD",
        "Genetic Score", "Genetic Score STD", "Configuration",
        "Selected_attributes", "Original"
    ]
    dataset_tqdm = tqdm(datasets)

    # Instantiate the classifier
    if version in (1, 2):
        # Version 1 used to be the non-flexible GeneticProgramming (bad
        # performance); both versions now use the flexible variant.
        clf = GeneticProgrammingFlexibleLogic(seed=seed, metric=metric)
    else:
        # Guided mutation based on SU
        clf = GeneticProgrammingRankMutation(seed=seed, metric=metric)
    nb = NaiveBayes(encode_data=False, n_intervals=n_intervals, metric=metric)

    n_runs = n_splits * n_repeats

    # Execute algorithm on datasets
    for database in dataset_tqdm:
        name, label = database
        if not os.path.exists(base_path + name):
            print(f"{name} doesnt' exist")
            continue

        # Assume UCI REPO like data
        test = f"{name}.test.csv"
        data = f"{name}.data.csv"
        X, y = get_X_y_from_database(base_path, name, data, test, label)
        dataset_tqdm.set_postfix({"DATABASE": name})

        # One (configuration x split) matrix per reported metric
        shape = (len(params), n_runs)
        nb_score = np.zeros(shape=shape)
        clf_score = np.zeros(shape=shape)
        clf_selected = np.zeros(shape=shape)
        clf_dummy = np.zeros(shape=shape)

        # Create splits for the experiments
        rskf = RepeatedStratifiedKFold(n_splits=n_splits,
                                       n_repeats=n_repeats,
                                       random_state=seed)
        splits = rskf.split(X, y)
        seed_tqdm = tqdm(splits,
                         leave=False,
                         total=n_runs,
                         bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}'
                         ) if verbose else splits

        # Execute experiments
        for i, (train_index, test_index) in enumerate(seed_tqdm):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Encode the data (encoders are fitted on the training part)
            feature_encoder = CustomOrdinalFeatureEncoder(
                n_intervals=n_intervals)
            X_train = feature_encoder.fit_transform(X_train)
            X_test = feature_encoder.transform(X_test)
            label_encoder = CustomLabelEncoder()
            y_train = label_encoder.fit_transform(y_train)
            y_test = label_encoder.transform(y_test)

            # The baseline does not depend on the configuration
            nb.fit(X_train, y_train)
            naive_bayes_score = nb.score(X_test, y_test)

            # Reset evaluation-cache for new split
            clf.reset_evaluation()
            for conf_index, conf in enumerate(params):
                if verbose:
                    seed_tqdm.set_postfix({"config": conf_index})
                clf.set_params(**conf)
                clf.fit(X_train, y_train)

                # score
                genetic_score = clf.score(X_test, y_test)

                # Get data
                n_original_features = sum(
                    isinstance(feature, DummyFeatureConstructor)
                    for feature in clf.best_features)

                # Update
                nb_score[conf_index, i] = naive_bayes_score
                clf_score[conf_index, i] = genetic_score
                clf_selected[conf_index, i] = len(clf.best_features)
                clf_dummy[conf_index, i] = n_original_features

        # Insert to final result averaged metrics for this database
        for conf_index, conf in enumerate(params):
            result.append([
                name, X.shape[1],
                np.mean(nb_score[conf_index]),
                np.std(nb_score[conf_index]),
                np.mean(clf_score[conf_index]),
                np.std(clf_score[conf_index]), conf,
                np.mean(clf_selected[conf_index]),
                np.mean(clf_dummy[conf_index])
            ])

    result = pd.DataFrame(result, columns=columns)
    if send_email:
        from tfg.utils import send_results
        send_results(f"GENETIC_{version}", email_data, result)
    return result
def fit(self, X, y, init_graph=True):
    """Search for features with the ant colony and train the final model.

    Args:
        X: Training samples; DataFrame column labels are remembered in
            ``self.categories_``.
        y: Training labels.
        init_graph: Build a new feature graph when true, otherwise keep
            ``self.afg`` and only reset its pheromones.

    Returns:
        self, with ``best_features`` and the fitted ``classifier_`` set.
    """
    self.feature_encoder_ = CustomOrdinalFeatureEncoder()
    self.class_encoder_ = CustomLabelEncoder()

    self.categories_ = None
    if isinstance(X, pd.DataFrame):
        self.categories_ = X.columns
    if self.encode_data:
        X = self.feature_encoder_.fit_transform(X)
        y = self.class_encoder_.fit_transform(y)

    operators = ("XOR", "OR", "AND")
    if not init_graph:
        self.afg.reset_pheromones()
    elif self.graph_strategy == "full":
        # Full graph
        self.afg = AntFeatureGraph(seed=self.seed).compute_graph(
            X, y, operators)
    else:
        # Pruned graph
        pruned_graph = AntFeatureGraphMI(seed=self.seed,
                                         connections=self.connections)
        self.afg = pruned_graph.compute_graph(X, y, operators)

    if self.verbose:
        print(f"Number of nodes: {len(self.afg.nodes)}")

    random.seed(self.seed)
    top_score = 0
    self.best_features = []
    stale_rounds = 0
    rounds = range(self.iterations)
    if self.verbose:
        rounds = tqdm(rounds)
    beta = self.beta
    distance_from_best = -1

    for _ in rounds:
        if self.verbose:
            rounds.set_postfix({
                "best_score": top_score,
                "n_features": len(self.best_features),
                "p_matrix_c": len(self.afg.pheromone_construction),
                "p_matrix_s": len(self.afg.pheromone_selection),
                "distance_from_best": distance_from_best
            })

        # The colony is created with the current beta, which then decays.
        colony = [
            Ant(ant_id=ant_id,
                alpha=self.alpha,
                beta=beta,
                metric=self.metric,
                use_initials=self.use_initials,
                cache_loo=self.cache_loo,
                cache_heuristic=self.cache_heuristic,
                step=self.step) for ant_id in range(self.ants)
        ]
        beta *= (1 - self.beta_evaporation_rate)

        scores = np.array([
            walker.run(X=X,
                       y=y,
                       graph=self.afg,
                       random_generator=random,
                       parallel=self.parallel,
                       max_errors=self.max_errors) for walker in colony
        ])

        # Evaporate first, then reinforce the chosen trails.
        self.afg.update_pheromone_matrix_evaporation(self.evaporation_rate)
        distance_from_best = np.mean(np.abs(scores - top_score))
        winner = np.argmax(scores)
        if self.update_strategy == "best":
            self.afg.intensify(colony[winner].current_features,
                               self.intensification_factor, 1,
                               self.use_initials)
        else:
            for walker_score, walker in zip(scores, colony):
                self.afg.intensify(walker.current_features,
                                   self.intensification_factor,
                                   walker_score, self.use_initials)

        if scores[winner] >= top_score:
            stale_rounds = 0
            top_score = scores[winner]
            self.best_features = colony[winner].current_features
        else:
            stale_rounds += 1
            if stale_rounds > self.early_stopping:
                break

    self.classifier_ = NaiveBayes(encode_data=False, metric=self.metric)
    if self.final_selection != "BEST":
        # An ant traverses the graph deterministically to obtain the features
        final_ant = FinalAnt(ant_id=0,
                             alpha=self.alpha,
                             beta=beta,
                             metric=self.metric,
                             use_initials=self.use_initials,
                             cache_loo=self.cache_loo,
                             cache_heuristic=self.cache_heuristic,
                             step=self.step)
        final_ant.run(X=X,
                      y=y,
                      graph=self.afg,
                      random_generator=random,
                      parallel=self.parallel)
        self.best_features = final_ant.current_features

    # Train model with final features
    transformed = np.concatenate(
        [feature.transform(X) for feature in self.best_features], axis=1)
    self.classifier_.fit(transformed, y)

    if self.save_features:
        # Save to features to dict
        translate_features(features=self.best_features,
                           feature_encoder=self.feature_encoder_,
                           categories=self.categories_,
                           path=self.path,
                           filename=self.filename)
    return self
class ACFCS(OptimizationMixin, TransformerMixin, ClassifierMixin,
            BaseEstimator):
    """Ant colony feature construction and selection for Naive Bayes.

    A colony of ``Ant`` objects walks a feature graph
    (``AntFeatureGraph`` or ``AntFeatureGraphMI``) for a number of
    iterations; the features found are used to train a ``NaiveBayes``
    classifier.
    """

    def __init__(self,
                 ants=10,
                 evaporation_rate=0.05,
                 intensification_factor=0.05,
                 alpha=1.0,
                 beta=0.0,
                 beta_evaporation_rate=0.05,
                 step=1,
                 iterations=100,
                 early_stopping=20,
                 update_strategy="best",
                 seed=None,
                 parallel=False,
                 save_features=False,
                 path=None,
                 filename=None,
                 verbose=0,
                 graph_strategy="mutual_info",
                 connections=2,
                 max_errors=0,
                 metric="accuracy",
                 use_initials=False,
                 final_selection="ALL",
                 encode_data=True):
        """Store the hyper-parameters and create empty caches.

        Raises:
            ValueError: If ``graph_strategy`` is not "full"/"mutual_info"
                or ``update_strategy`` is not "all"/"best".
        """
        self.step = step
        self.ants = ants
        self.evaporation_rate = evaporation_rate
        self.intensification_factor = intensification_factor
        self.alpha = alpha
        self.beta = beta
        self.beta_evaporation_rate = beta_evaporation_rate
        self.iterations = iterations
        self.early_stopping = early_stopping
        self.seed = seed
        self.parallel = parallel
        self.save_features = save_features
        self.path = path
        self.filename = filename
        self.verbose = verbose
        self.graph_strategy = graph_strategy
        self.connections = connections
        self.metric = metric
        self.update_strategy = update_strategy
        self.use_initials = use_initials
        self.final_selection = final_selection
        self.encode_data = encode_data
        self.max_errors = max_errors

        allowed_graph_strategy = ("full", "mutual_info")
        if self.graph_strategy not in allowed_graph_strategy:
            raise ValueError(
                "Unknown graph strategy type: %s, expected one of %s." %
                (self.graph_strategy, allowed_graph_strategy))

        allowed_update_strategy = ("all", "best")
        if self.update_strategy not in allowed_update_strategy:
            # The message used to say "graph strategy" (copy-paste slip).
            raise ValueError(
                "Unknown update strategy type: %s, expected one of %s." %
                (self.update_strategy, allowed_update_strategy))
        self.reset_cache()

    def reset_cache(self):
        """Replace both caches handed to the ants with empty dicts."""
        self.cache_loo = dict()
        self.cache_heuristic = dict()

    def fit(self, X, y, init_graph=True):
        """Search for features with the ant colony and train the model.

        Args:
            X: Training samples; DataFrame column labels are remembered
                in ``self.categories_``.
            y: Training labels.
            init_graph: Build a new feature graph when true, otherwise
                keep ``self.afg`` and only reset its pheromones.

        Returns:
            self, with ``best_features`` and the fitted ``classifier_``.

        Raises:
            AttributeError: If ``init_graph`` is false and no graph has
                been built by a previous call.
        """
        self.feature_encoder_ = CustomOrdinalFeatureEncoder()
        self.class_encoder_ = CustomLabelEncoder()

        self.categories_ = None
        if isinstance(X, pd.DataFrame):
            self.categories_ = X.columns
        if self.encode_data:
            X = self.feature_encoder_.fit_transform(X)
            y = self.class_encoder_.fit_transform(y)

        if init_graph:
            if self.graph_strategy == "full":
                # Full graph
                self.afg = AntFeatureGraph(seed=self.seed).compute_graph(
                    X, y, ("XOR", "OR", "AND"))
            else:
                # Pruned graph
                self.afg = AntFeatureGraphMI(
                    seed=self.seed,
                    connections=self.connections).compute_graph(
                        X, y, ("XOR", "OR", "AND"))
        elif not hasattr(self, "afg"):
            # Same exception type as before, with an actionable message.
            raise AttributeError(
                "fit(init_graph=False) needs the graph of a previous fit; "
                "call fit with init_graph=True first.")
        else:
            self.afg.reset_pheromones()

        if self.verbose:
            print(f"Number of nodes: {len(self.afg.nodes)}")

        random.seed(self.seed)
        best_score = 0
        self.best_features = []
        iterations_without_improvement = 0
        iterator = tqdm(range(self.iterations)) if self.verbose else range(
            self.iterations)
        beta = self.beta
        distance_from_best = -1

        for _ in iterator:
            if self.verbose:
                iterator.set_postfix({
                    "best_score": best_score,
                    "n_features": len(self.best_features),
                    "p_matrix_c": len(self.afg.pheromone_construction),
                    "p_matrix_s": len(self.afg.pheromone_selection),
                    "distance_from_best": distance_from_best
                })

            # The colony is created with the current beta, which decays
            # afterwards for the next iteration.
            ants = [
                Ant(ant_id=i,
                    alpha=self.alpha,
                    beta=beta,
                    metric=self.metric,
                    use_initials=self.use_initials,
                    cache_loo=self.cache_loo,
                    cache_heuristic=self.cache_heuristic,
                    step=self.step) for i in range(self.ants)
            ]
            beta *= (1 - self.beta_evaporation_rate)

            results = np.array([
                ant.run(X=X,
                        y=y,
                        graph=self.afg,
                        random_generator=random,
                        parallel=self.parallel,
                        max_errors=self.max_errors) for ant in ants
            ])

            # Evaporate first, then reinforce the chosen trails.
            self.afg.update_pheromone_matrix_evaporation(
                self.evaporation_rate)
            distance_from_best = np.mean(np.abs(results - best_score))
            best_ant = np.argmax(results)
            if self.update_strategy == "best":
                self.afg.intensify(ants[best_ant].current_features,
                                   self.intensification_factor, 1,
                                   self.use_initials)
            else:
                for ant_score, ant in zip(results, ants):
                    self.afg.intensify(ant.current_features,
                                       self.intensification_factor,
                                       ant_score, self.use_initials)

            if results[best_ant] >= best_score:
                iterations_without_improvement = 0
                best_score = results[best_ant]
                self.best_features = ants[best_ant].current_features
            else:
                iterations_without_improvement += 1
                if iterations_without_improvement > self.early_stopping:
                    break

        self.classifier_ = NaiveBayes(encode_data=False, metric=self.metric)
        if self.final_selection != "BEST":
            # An ant traverses the graph deterministically to obtain the
            # features
            final_ant = FinalAnt(ant_id=0,
                                 alpha=self.alpha,
                                 beta=beta,
                                 metric=self.metric,
                                 use_initials=self.use_initials,
                                 cache_loo=self.cache_loo,
                                 cache_heuristic=self.cache_heuristic,
                                 step=self.step)
            final_ant.run(X=X,
                          y=y,
                          graph=self.afg,
                          random_generator=random,
                          parallel=self.parallel)
            self.best_features = final_ant.current_features

        # Train model with final features
        self.classifier_.fit(
            np.concatenate(
                [feature.transform(X) for feature in self.best_features],
                axis=1), y)

        if self.save_features:
            # Save to features to dict
            translate_features(features=self.best_features,
                               feature_encoder=self.feature_encoder_,
                               categories=self.categories_,
                               path=self.path,
                               filename=self.filename)
        return self
class PazzaniWrapperNB(PazzaniWrapper):
    """Optimized version of Pazzani's wrapper for the Naive Bayes classifier.

    LOO cross validation.
    Update, add, delete features.

    Instead of refitting the classifier for every candidate subset, both
    search strategies modify ``self.classifier`` in place through its
    ``add_features`` / ``remove_feature`` methods, score the candidate and
    then undo the modification before trying the next one.

    Parameters
    ----------
    seed : int or None
        Forwarded to ``PazzaniWrapper.__init__``.
    strategy : str, default="BSEJ"
        Forwarded to ``PazzaniWrapper.__init__``.
    verbose : int, default=0
        Any truthy value prints the state of every search round; ``2`` also
        prints every evaluated neighbor.
    """

    def __init__(self, seed=None, strategy="BSEJ", verbose=0):
        super().__init__(seed=seed,
                         strategy=strategy,
                         verbose=verbose,
                         cv=None)

    def _generate_neighbors_bsej(self, current_columns, X):
        """Yield every neighbor of the current subset for the BSEJ search.

        Yields tuples ``(new_columns, columns_to_drop, columns_to_add,
        delete)``. Deletion neighbors (``delete`` is True) carry a single
        column index and ``None`` as data to add. Join neighbors (``delete``
        is False) carry the two joined indices in descending order and the
        array returned by ``combine_columns`` for that pair.
        Nothing is yielded when ``X`` has fewer than two columns.
        """
        if X.shape[1] > 1:
            for column_to_drop in range(X.shape[1]):
                new_columns = current_columns.copy()
                del new_columns[column_to_drop]
                # Updated column list, columns to remove, columns to add,
                # delete?
                yield new_columns, column_to_drop, None, True
            for features in combinations(np.arange(X.shape[1]), 2):
                new_col_name = flatten([
                    current_columns[features[0]], current_columns[features[1]]
                ])
                new_columns = current_columns.copy()
                new_columns.append(tuple(new_col_name))
                # Delete the higher index first so the lower one stays valid.
                columns_to_drop = sorted(features, reverse=True)
                del new_columns[columns_to_drop[0]]
                del new_columns[columns_to_drop[1]]
                combined_columns = combine_columns(X, list(features))
                yield new_columns, list(
                    columns_to_drop), combined_columns, False

    def fit_bsej(self, X, y):
        """Greedy backward search that deletes or joins columns.

        Starts from all columns of ``X``. In each round every neighbor from
        ``_generate_neighbors_bsej`` is scored with ``self.evaluate``; the
        best strictly-improving neighbor is applied. The search ends when no
        neighbor improves the score or a score of exactly 1.0 is found.

        Sets ``self.evaluate`` (shadowing the method of the same name),
        ``self.features_`` and ``self.feature_transformer``, refits
        ``self.classifier`` on the transformed data and returns ``self``.
        """
        self.evaluate = memoize(_evaluate, attribute_to_cache="columns")
        current_best = X.copy()
        current_columns = deque(range(X.shape[1]))
        best_score = self.evaluate(self.classifier,
                                   current_best,
                                   y,
                                   columns=current_columns,
                                   fit=True)
        stop = False
        while not stop:
            update = False
            stop = True
            if self.verbose:
                print("Current Best: ", current_columns, " Score: ",
                      best_score)
            for new_columns, columns_to_delete, columns_to_add, delete in self._generate_neighbors_bsej(
                    current_columns, current_best):
                if delete:
                    action = "DELETE"
                    # Update classifier and get validation result
                    self.classifier.remove_feature(columns_to_delete)
                    neighbor = np.delete(current_best,
                                         columns_to_delete,
                                         axis=1)
                    score = self.evaluate(self.classifier,
                                          neighbor,
                                          y,
                                          columns=new_columns,
                                          fit=False)
                    # Restore the column for the next iteration
                    self.classifier.add_features(
                        current_best[:, columns_to_delete].reshape(-1, 1),
                        y,
                        index=[columns_to_delete])
                else:
                    action = "ADD"
                    self.classifier.add_features(columns_to_add, y)
                    self.classifier.remove_feature(columns_to_delete[0])
                    self.classifier.remove_feature(columns_to_delete[1])
                    neighbor = np.delete(current_best,
                                         columns_to_delete,
                                         axis=1)
                    neighbor = np.concatenate([neighbor, columns_to_add],
                                              axis=1)
                    score = self.evaluate(self.classifier,
                                          neighbor,
                                          y,
                                          columns=new_columns,
                                          fit=False)
                    # Undo the join so the classifier matches current_best.
                    if self.classifier.n_features_ == 1:
                        # We reverse it for insert order
                        self.classifier.add_features(
                            current_best[:, columns_to_delete], y)
                        self.classifier.remove_feature(0)
                    else:
                        self.classifier.remove_feature(neighbor.shape[1] - 1)
                        # We reverse it for insert order
                        self.classifier.add_features(
                            current_best[:, columns_to_delete],
                            y,
                            index=columns_to_delete)
                if self.verbose == 2:
                    print("\tNeighbor: ", new_columns, " Score: ", score)
                if score > best_score:
                    stop = False
                    best_columns = new_columns
                    best_action = action
                    best_score = score
                    best_columns_to_delete = columns_to_delete
                    update = True
                    if best_action == "ADD":
                        best_columns_to_add = columns_to_add
                    if score == 1.0:
                        # Perfect score: no neighbor can beat it.
                        stop = True
                        break
            if update:
                current_columns = best_columns
                if best_action == "DELETE":
                    current_best = np.delete(current_best,
                                             best_columns_to_delete,
                                             axis=1)
                    # Update best
                    self.classifier.remove_feature(best_columns_to_delete)
                else:
                    current_best = np.delete(current_best,
                                             best_columns_to_delete,
                                             axis=1)
                    current_best = np.concatenate(
                        [current_best, best_columns_to_add], axis=1)
                    # Update classifier
                    self.classifier.add_features(best_columns_to_add, y)
                    self.classifier.remove_feature(best_columns_to_delete[0])
                    self.classifier.remove_feature(best_columns_to_delete[1])
        if self.verbose:
            print("Final best: ", list(current_columns), " Score: ",
                  best_score)
        self.features_ = current_columns
        self.feature_transformer = lambda X: join_columns(
            X, columns=self.features_)
        self.classifier.fit(self.feature_transformer(X), y)
        return self

    def _generate_neighbors_fssj(self, current_columns, individual,
                                 original_data, available_columns):
        """Yield every neighbor of the current subset for the FSSJ search.

        Yields tuples ``(new_columns, new_available_columns,
        column_to_delete, column_to_add, delete)``. First, one neighbor per
        available column that simply appends it (``delete`` is False and
        ``column_to_delete`` is None). Then, when ``individual`` already has
        columns, one neighbor per (available column, selected column) pair
        that replaces the selected column by the array returned by
        ``combine_columns`` for the pair (``delete`` is True).
        """
        if available_columns:
            for index, col in enumerate(available_columns):
                new_columns = current_columns.copy()
                new_columns.append(col)
                new_available_columns = available_columns.copy()
                del new_available_columns[index]
                column_to_add = original_data[:, col].reshape(-1, 1)
                column_to_delete = None
                # New columns, Availables, ColumnToDelete, ColumnToAdd,
                # Delete?
                yield new_columns, new_available_columns, column_to_delete, column_to_add, False
        if individual is not None and individual.shape[
                1] > 0 and available_columns:
            for features_index in product(np.arange(len(available_columns)),
                                          np.arange(len(current_columns))):
                features = available_columns[
                    features_index[0]], current_columns[features_index[1]]
                new_col_name = flatten([features[0], features[1]])
                new_available_columns = available_columns.copy()
                del new_available_columns[features_index[0]]
                new_columns = current_columns.copy()
                new_columns.append(tuple(new_col_name))
                del new_columns[features_index[1]]
                # The already-selected side is read from `individual`, which
                # may hold a previously joined column.
                separated_columns = np.concatenate([
                    original_data[:, features[0]].reshape(-1, 1),
                    individual[:, features_index[1]].reshape(-1, 1)
                ],
                                                   axis=1)
                column_to_delete = features_index[1]
                column_to_add = combine_columns(separated_columns)
                yield new_columns, new_available_columns, column_to_delete, column_to_add, True

    def fit_fssj(self, X, y):
        """Greedy forward search that adds or joins columns.

        Starts from an empty selection. In each round every neighbor from
        ``_generate_neighbors_fssj`` is scored with ``self.evaluate``; the
        best strictly-improving neighbor is applied. The search ends when no
        neighbor improves the score or a score of exactly 1.0 is found.

        Sets ``self.evaluate`` (shadowing the method of the same name),
        ``self.features_`` and ``self.feature_transformer``, refits
        ``self.classifier`` on the transformed data and returns ``self``.
        """
        self.evaluate = memoize(_evaluate, attribute_to_cache="columns")
        current_best = None
        current_columns = deque()
        available_columns = list(range(X.shape[1]))
        best_score = -float("inf")
        stop = False
        while not stop:
            update = False
            stop = True
            if self.verbose:
                print("Current Best: ", current_columns, " Score: ",
                      best_score, "Available columns: ", available_columns)
            for new_columns, new_available_columns, column_to_delete, column_to_add, delete in self._generate_neighbors_fssj(
                    current_columns=current_columns,
                    individual=current_best,
                    original_data=X,
                    available_columns=available_columns):
                if delete:
                    action = "JOIN"
                    # Update classifier and get validation result
                    self.classifier.add_features(column_to_add, y)
                    self.classifier.remove_feature(column_to_delete)
                    neighbor = np.concatenate([current_best, column_to_add],
                                              axis=1)
                    neighbor = np.delete(neighbor, column_to_delete, axis=1)
                    score = self.evaluate(self.classifier,
                                          neighbor,
                                          y,
                                          columns=new_columns,
                                          fit=False)
                    # Restore the column for the next iteration
                    if neighbor.shape[1] == 1:
                        self.classifier.fit(current_best, y)
                    else:
                        self.classifier.remove_feature(neighbor.shape[1] - 1)
                        self.classifier.add_features(
                            current_best[:, column_to_delete].reshape(-1, 1),
                            y,
                            index=[column_to_delete])
                else:
                    action = "ADD"
                    if current_best is None:
                        neighbor = column_to_add
                        self.classifier.fit(neighbor, y)
                    else:
                        neighbor = np.concatenate(
                            [current_best, column_to_add], axis=1)
                        self.classifier.add_features(column_to_add, y)
                    score = self.evaluate(self.classifier,
                                          neighbor,
                                          y,
                                          columns=new_columns,
                                          fit=False)
                    # Undo the addition before trying the next neighbor.
                    if current_best is None:
                        self.classifier = NaiveBayes(encode_data=True)
                    else:
                        self.classifier.remove_feature(neighbor.shape[1] - 1)
                if self.verbose == 2:
                    print("\tNeighbour: ", new_columns, " Score: ", score,
                          "Available columns: ", new_available_columns)
                if score > best_score:
                    stop = False
                    best_columns = new_columns
                    best_available_columns = new_available_columns
                    best_action = action
                    best_score = score
                    best_column_to_delete = column_to_delete
                    best_column_to_add = column_to_add
                    update = True
                    if score == 1.0:
                        # Perfect score: no neighbor can beat it.
                        stop = True
                        break
            if update:
                current_columns = best_columns
                available_columns = best_available_columns
                if best_action == "JOIN":
                    self.classifier.add_features(best_column_to_add, y)
                    self.classifier.remove_feature(best_column_to_delete)
                    current_best = np.concatenate(
                        [current_best, best_column_to_add], axis=1)
                    current_best = np.delete(current_best,
                                             best_column_to_delete,
                                             axis=1)
                else:
                    if current_best is None:
                        current_best = best_column_to_add
                        self.classifier.fit(current_best, y)
                    else:
                        current_best = np.concatenate(
                            [current_best, best_column_to_add], axis=1)
                        self.classifier.add_features(best_column_to_add, y)
        if self.verbose:
            print("Final best: ", list(current_columns), " Score: ",
                  best_score)
        self.features_ = current_columns
        self.feature_transformer = lambda X: join_columns(
            X, columns=self.features_)
        self.classifier.fit(self.feature_transformer(X), y)
        return self

    def evaluate(self, classifier, X, y, fit=True, columns=None):
        """Score ``classifier`` on ``X``/``y`` by delegating to ``_evaluate``.

        ``fit`` and ``columns`` are forwarded as given. Previously they were
        discarded and ``fit=True, columns=None`` was always passed, so a
        caller asking for ``fit=False`` still triggered a refit.

        ``fit_bsej`` and ``fit_fssj`` replace this method on the instance
        with a ``memoize``-wrapped ``_evaluate``.
        """
        return _evaluate(classifier, X, y, fit=fit, columns=columns)
def fit_fssj(self, X, y):
    """Greedy forward search that adds or joins columns.

    Starts from an empty selection. In each round every neighbor produced
    by ``self._generate_neighbors_fssj`` is scored with ``self.evaluate``
    while ``self.classifier`` is modified in place and then restored. The
    best strictly-improving neighbor of the round is applied. The search
    ends when no neighbor improves the score or a score of exactly 1.0 is
    found.

    Parameters
    ----------
    X : 2-D array
        Read through ``X.shape[1]`` here and passed to the neighbor
        generator as ``original_data``.
    y : array-like
        Targets, forwarded unchanged to the classifier and to
        ``self.evaluate``.

    Returns
    -------
    self
        With ``evaluate``, ``features_`` and ``feature_transformer`` set
        and ``self.classifier`` refitted on the transformed ``X``.
    """
    # Rebinds `evaluate` on the instance for the rest of its lifetime.
    self.evaluate = memoize(_evaluate, attribute_to_cache="columns")
    current_best = None  # data matrix of the selected columns so far
    current_columns = deque()  # names of the selected columns
    available_columns = list(range(X.shape[1]))
    best_score = -float("inf")
    stop = False
    while not stop:
        update = False
        # Assume convergence; reset below as soon as a neighbor improves.
        stop = True
        # self.classifier.encode_data=True
        if self.verbose:
            print("Current Best: ", current_columns, " Score: ", best_score,
                  "Available columns: ", available_columns)
        for new_columns, new_available_columns, column_to_delete, column_to_add, delete in self._generate_neighbors_fssj(
                current_columns=current_columns,
                individual=current_best,
                original_data=X,
                available_columns=available_columns):
            if delete:
                action = "JOIN"
                # Update classifier and get validation result
                self.classifier.add_features(column_to_add, y)
                self.classifier.remove_feature(column_to_delete)
                neighbor = np.concatenate([current_best, column_to_add],
                                          axis=1)
                neighbor = np.delete(neighbor, column_to_delete, axis=1)
                score = self.evaluate(self.classifier,
                                      neighbor,
                                      y,
                                      columns=new_columns,
                                      fit=False)
                # Restore the column for the next iteration
                if neighbor.shape[1] == 1:
                    # Only the joined column is left: refit from scratch.
                    self.classifier.fit(current_best, y)
                else:
                    # The joined column was appended last; drop it and put
                    # the replaced column back at its original position.
                    self.classifier.remove_feature(neighbor.shape[1] - 1)
                    self.classifier.add_features(
                        current_best[:, column_to_delete].reshape(-1, 1),
                        y,
                        index=[column_to_delete])
            else:
                action = "ADD"
                if current_best is None:
                    # First column ever: nothing to extend, fit directly.
                    neighbor = column_to_add
                    self.classifier.fit(neighbor, y)
                else:
                    neighbor = np.concatenate(
                        [current_best, column_to_add], axis=1)
                    self.classifier.add_features(column_to_add, y)
                score = self.evaluate(self.classifier,
                                      neighbor,
                                      y,
                                      columns=new_columns,
                                      fit=False)
                # Undo the addition before trying the next neighbor.
                if current_best is None:
                    self.classifier = NaiveBayes(encode_data=True)
                else:
                    self.classifier.remove_feature(neighbor.shape[1] - 1)
            if self.verbose == 2:
                print("\tNeighbour: ", new_columns, " Score: ", score,
                      "Available columns: ", new_available_columns)
            if score > best_score:
                stop = False
                best_columns = new_columns
                best_available_columns = new_available_columns
                best_action = action
                best_score = score
                best_column_to_delete = column_to_delete
                best_column_to_add = column_to_add
                update = True
                if score == 1.0:
                    # Perfect score: stop the whole search right away.
                    stop = True
                    break
        if update:
            # Apply the best neighbor of this round to data and classifier.
            current_columns = best_columns
            available_columns = best_available_columns
            if best_action == "JOIN":
                self.classifier.add_features(best_column_to_add, y)
                self.classifier.remove_feature(best_column_to_delete)
                current_best = np.concatenate(
                    [current_best, best_column_to_add], axis=1)
                current_best = np.delete(current_best,
                                         best_column_to_delete,
                                         axis=1)
            else:
                if current_best is None:
                    current_best = best_column_to_add
                    self.classifier.fit(current_best, y)
                else:
                    current_best = np.concatenate(
                        [current_best, best_column_to_add], axis=1)
                    self.classifier.add_features(best_column_to_add, y)
    if self.verbose:
        print("Final best: ", list(current_columns), " Score: ", best_score)
    self.features_ = current_columns
    # The lambda reads self.features_ at call time, not at definition time.
    self.feature_transformer = lambda X: join_columns(
        X, columns=self.features_)
    # NOTE(review): `model` is never used afterwards.
    model = self.classifier.fit(self.feature_transformer(X), y)
    return self
def fit(self, X, y):
    """Build the feature ranking and run the subset selection.

    Steps, in order:

    1. Normalise the input (optional encoding, DataFrame -> ndarray) and
       validate it with ``check_X_y``.
    2. Build ``self.all_feature_constructors`` from the pruned graph
       (``use_graph``), from the ``prune`` best-scoring feature pairs, or
       from every feature of ``X``, then append one
       ``DummyFeatureConstructor`` per original column.
    3. Score every constructor with ``symmetrical_uncertainty`` and store
       the descending order in ``self.rank``.
    4. When ``use_initials`` is set, store the result of ``backward_search``
       over the original columns in ``self.initial_backward_features``.
    5. Call ``self.filter_features(X, y)``.

    Returns ``self``.
    """
    # --- Parse input
    if isinstance(y, pd.DataFrame):
        y = y.to_numpy()
    if self.encode_data:
        self.feature_encoder_ = CustomOrdinalFeatureEncoder(
            n_intervals=self.n_intervals)
        self.class_encoder_ = CustomLabelEncoder()
        X = self.feature_encoder_.fit_transform(X)
        y = self.class_encoder_.fit_transform(y)
    if isinstance(X, pd.DataFrame):
        X = X.to_numpy()
    check_X_y(X, y)

    # --- Drop whatever a previous fit left cached
    self.reset_evaluation()

    # --- Candidate constructors
    n_columns = X.shape[1]
    if self.use_graph:
        # Minimum graph; its rank is the candidate list.
        pruned_graph = AntFeatureGraphMI(seed=None,
                                         connections=1).compute_graph(
                                             X, y, ("AND", "OR", "XOR"))
        self.all_feature_constructors = pruned_graph.get_rank()
    elif self.prune is not None:
        # Keep only the `prune` pairs that maximise SU(X_i X_j, Y); a pair
        # (k, k) stands for a feature combined with itself.
        candidate_pairs = list(combinations(list(range(n_columns)), 2))
        candidate_pairs = candidate_pairs + [(k, k)
                                             for k in range(n_columns)]
        pair_scores = [
            symmetrical_uncertainty_two_variables(X[:, a], X[:, b], y)
            for a, b in candidate_pairs
        ]
        best_first = np.argsort(pair_scores)[::-1]
        self.all_feature_constructors = []
        for position in best_first[:self.prune]:
            first, second = candidate_pairs[position]
            if first != second:
                # NOTE(review): the constructors are built from a
                # two-column slice; confirm construct_features does not
                # number them 0/1 instead of first/second.
                self.all_feature_constructors.extend(
                    construct_features(X[:, [first, second]],
                                       operators=self.operators,
                                       same_feature=False))
                continue
            from tfg.feature_construction import create_feature
            distinct_values = np.unique(X[:, first])
            self.all_feature_constructors.extend([
                create_feature("OR", [(first, left), (first, right)])
                for left, right in combinations(distinct_values, 2)
            ])
    else:
        # Unsorted list built from all features.
        self.all_feature_constructors = construct_features(
            X, operators=self.operators)

    if self.verbose:
        print(
            f"Total number of constructed features: {len(self.all_feature_constructors)}"
        )

    # The original variables always compete in the ranking too.
    self.all_feature_constructors.extend(
        [DummyFeatureConstructor(column) for column in range(n_columns)])

    # --- Ranking by symmetrical uncertainty with the class
    self.symmetrical_uncertainty_rank = [
        symmetrical_uncertainty(f1=constructor.transform(X).flatten(), f2=y)
        for constructor in self.all_feature_constructors
    ]
    # Indexes sorted by descending score.
    self.rank = np.argsort(self.symmetrical_uncertainty_rank)[::-1]

    # --- Optional: pre-trim the initial variables
    if self.use_initials:
        base_classifier = NaiveBayes(encode_data=False,
                                     n_intervals=self.n_intervals,
                                     metric=self.metric)
        base_classifier.fit(X, y)
        original_features = [
            DummyFeatureConstructor(column) for column in range(n_columns)
        ]
        # Stored so other executions can reuse the backward result.
        self.initial_backward_features = backward_search(
            X, y, original_features, base_classifier)

    # --- Feature Subset Selection (FSS) from the rank
    self.filter_features(X, y)
    return self
class GeneticProgrammingFlexibleLogic(OptimizationMixin, TransformerMixin,
                                      ClassifierMixin, BaseEstimator):
    """GeneticProgramming for Feature Construction and Selection.

    An individual is a 3-tuple ``(original, constructed, included)``:
    a list of ``DummyFeatureConstructor`` (selected original features), a
    list of logical features built with ``create_feature`` and the set of
    original feature indexes present in the first list.

    Parameters
    ----------
    seed : int or None, default=None
        Seed to guarantee reproducibility
    individuals : int, default=1
        Number of individuals per population
    generations : int, default=40
        Number of generations
    mutation_probability : float, default=0.2
        Probability for each individual of being mutated
    selection : {rank,proportionate}, default="rank"
        Selection strategy
    mutation : {simple,complex}, default="simple"
        Mutation strategy
    combine : {elitism,truncate}, default="elitism"
        Population combination strategy
    n_intervals : int, default=5
        Number of intervals for the discretization of continuous variables
    metric : {accuracy,f1-score}, default="accuracy"
        Target metric for the optimization process
    flexible_logic : bool, default=True
        Allow different individual sizes in the generation
    verbose : bool or int, default=False
        Display process progress
    encode_data : bool, default=True
        Encode data when data is not encoded by default with an
        OrdinalEncoder
    mixed : bool, default=True
        Mix heuristic and wrapper evaluation
    mixed_percentage : float, default=0.5
        Percentage of total iterations to do heuristic evaluation

    Attributes
    ----------
    classifier_ : NaiveBayes
        Base classifier used for prediction
    best_features : list
        Selected features (original followed by constructed) of the best
        individual, used for transforming new data
    """

    _OPERATORS = ('OR', 'XOR', 'AND')

    def __init__(self,
                 seed=None,
                 individuals=1,
                 generations=40,
                 mutation_probability=0.2,
                 selection="rank",
                 mutation="simple",
                 combine="elitism",
                 n_intervals=5,
                 metric="accuracy",
                 flexible_logic=True,
                 verbose=False,
                 encode_data=True,
                 mixed=True,
                 mixed_percentage=0.5):
        self.mixed_percentage = mixed_percentage
        self.mixed = mixed
        self.encode_data = encode_data
        self.flexible_logic = flexible_logic
        self.verbose = verbose
        self.n_intervals = n_intervals
        self.metric = metric
        self.seed = seed
        self.individuals = individuals
        self.generations = generations
        self.mutation_probability = mutation_probability
        self.selection = selection
        self.combine = combine
        self.mutation = mutation

        allowed_selection = ('rank', 'proportionate')
        allowed_combine = ('elitism', 'truncate')
        allowed_mutation = ('complex', 'simple')
        # The messages list the accepted values (they used to repeat the
        # rejected value instead).
        if self.selection not in allowed_selection:
            raise ValueError(
                "Unknown selection type: %s, expected one of %s." %
                (self.selection, allowed_selection))
        if self.combine not in allowed_combine:
            raise ValueError("Unknown combine type: %s, expected one of %s." %
                             (self.combine, allowed_combine))
        if self.mutation not in allowed_mutation:
            raise ValueError(
                "Unknown mutation type: %s, expected one of %s." %
                (self.mutation, allowed_mutation))
        # From here on the three attributes hold bound methods, not names.
        self.selection = self.select_population_rank if "rank" in selection else self.select_population
        self.combine = self.elitism if "elit" in combine else self.truncation
        self.mutation = self.mutate_simple if "simple" in mutation else self.mutate_complex
        self.reset_evaluation()

    def set_params(self, **params):
        """Set parameters and re-resolve the strategy callables.

        Returns ``self`` (it used to return ``None``, which broke chaining
        and callers that rebind the estimator to the result).

        Raises
        ------
        ValueError
            If ``selection``, ``combine`` or ``mutation`` is given with a
            value outside its allowed set.
        """
        super().set_params(**params)
        if "selection" in params:
            if params["selection"] not in ("rank", "proportionate"):
                raise ValueError(
                    "Unknown selection parameter expected one of : " +
                    str(tuple(["rank", "proportionate"])))
            self.selection = self.select_population_rank if "rank" in params[
                "selection"] else self.select_population
        if "combine" in params:
            if params["combine"] not in ("elitism", "truncate"):
                raise ValueError(
                    "Unknown combine parameter expected one of : " +
                    str(tuple(["elitism", "truncate"])))
            self.combine = self.elitism if "elit" in params[
                "combine"] else self.truncation
        if "mutation" in params:
            if params["mutation"] not in ("complex", "simple"):
                raise ValueError(
                    "Unknown mutation parameter expected one of : " +
                    str(tuple(["complex", "simple"])))
            self.mutation = self.mutate_simple if "simple" == params[
                "mutation"] else self.mutate_complex
        return self

    def reset_evaluation(self):
        """Create fresh ``memoize_genetic`` wrappers for both evaluators."""
        self.evaluate_wrapper = memoize_genetic(self.simple_evaluate)
        self.evaluate_heuristic = memoize_genetic(
            self.simple_evaluate_heuristic)

    # ------------------------------------------------------------------
    # Evaluation
    # ------------------------------------------------------------------
    def simple_evaluate(self, individual, X, y):
        """Wrapper score: leave-one-out validation of a fresh NaiveBayes."""
        classifier_ = NaiveBayes(encode_data=False, metric=self.metric)
        return classifier_.leave_one_out_cross_val(transform_features(
            individual[0] + individual[1], X),
                                                   y,
                                                   fit=True)

    def simple_evaluate_heuristic(self, individual, X, y):
        """Heuristic score from ``compute_sufs_non_incremental``."""
        return compute_sufs_non_incremental(
            features=[f.transform(X) for f in chain(*individual[:2])], y=y)

    def fitness(self, population, X, y):
        """Return ``[(individual, score), ...]`` using ``self.evaluate``."""
        return [(individual, self.evaluate(individual, X, y))
                for individual in population]

    # ------------------------------------------------------------------
    # Random building blocks (the order of the random draws is part of the
    # seeded behaviour; keep it when editing)
    # ------------------------------------------------------------------
    def _random_feature(self):
        """Build a logical feature with random operator and operands."""
        op = random.choice(self._OPERATORS)
        operands = []
        for _ in range(2):
            feature_index = random.randint(0, self.n_features - 1)
            value = random.randint(0, self.unique_values[feature_index] - 1)
            operands.append((feature_index, value))
        return create_feature(operator=op, operands=operands)

    def _randomize_feature(self, feature):
        """Re-draw operator, operand features and operand values in place."""
        feature.op = random.choice(self._OPERATORS)
        for operand in feature.operands:
            operand.feature_index = random.randint(0, self.n_features - 1)
            operand.value = random.randint(
                0, self.unique_values[operand.feature_index] - 1)

    def _resample_value(self, operand):
        """Re-draw only the value of ``operand``, keeping its feature."""
        operand.value = random.randint(
            0, self.unique_values[operand.feature_index] - 1)

    def _tweak_feature(self, feature):
        """Change exactly one component of ``feature`` in place."""
        b = random.random()
        if b < 0.2:
            # Change operator
            feature.op = random.choice(self._OPERATORS)
        elif b < 0.4:
            # NOTE(review): originally labelled "change full operand", but
            # only the value is re-drawn, same as the two branches below.
            self._resample_value(feature.operands[0])
        elif b < 0.6:
            # NOTE(review): same remark as the previous branch.
            self._resample_value(feature.operands[1])
        elif b < 0.8:
            # Change value
            self._resample_value(feature.operands[0])
        else:
            # Change value
            self._resample_value(feature.operands[1])

    def _add_original_feature(self, individual):
        """Add a random original feature the individual does not have yet."""
        og_features = individual[0]
        included_features = individual[2]
        selected = random.choice(
            tuple(set(list(range(0, self.n_features))) - included_features))
        included_features.add(selected)
        og_features.append(DummyFeatureConstructor(selected))

    def _mutate_original_features(self, individual):
        """Add, swap or remove one original feature of ``individual``."""
        a = random.random()
        og_features = individual[0]
        included_features = individual[2]
        if (a < 0.33 and len(og_features) < self.n_features
            ) or len(og_features) == 0:
            self._add_original_feature(individual)
        elif a < 0.66 and len(og_features) < self.n_features and len(
                og_features) > 0:
            # Swap one selected feature for an unused one.
            selected = random.choice(
                tuple(
                    set(list(range(0, self.n_features))) - included_features))
            index = random.randint(0, len(og_features) - 1)
            feature = og_features[index].feature_index
            og_features[index] = DummyFeatureConstructor(selected)
            included_features.remove(feature)
            included_features.add(selected)
        else:
            index = random.randint(0, len(og_features) - 1)
            feature = og_features[index].feature_index
            del og_features[index]
            included_features.remove(feature)

    def _mutate(self, population, modify_feature):
        """Shared mutation loop of ``mutate_complex`` / ``mutate_simple``.

        ``modify_feature`` is applied to a constructed feature when, under
        flexible logic, the "modify" action is drawn. Individuals are
        mutated in place and returned in a new list.
        """
        new_population = []
        for individual in population:
            constructed = individual[1]
            if random.random() < self.mutation_probability:
                if self.flexible_logic and len(constructed) == 0:
                    # Nothing to mutate: seed one feature and move on.
                    constructed.append(self._random_feature())
                    new_population.append(individual)
                    continue
                chromosomes_index = random.sample(
                    list(range(len(constructed))),
                    random.randint(1, len(constructed)))
                # Index-based loop on purpose: the list is rebound (shifted)
                # after each deletion and later steps must see the new one.
                for i in range(len(chromosomes_index)):
                    index = chromosomes_index[i]
                    if not self.flexible_logic:
                        self._randomize_feature(constructed[index])
                        continue
                    a = random.random()
                    if a < 0.33:
                        modify_feature(constructed[index])
                    elif a < 0.66:
                        # Add feature
                        constructed.append(self._random_feature())
                    else:
                        # Remove feature
                        del constructed[index]
                        chromosomes_index = [
                            j - 1 if j > index else j
                            for j in chromosomes_index
                        ]
            if random.random() < self.mutation_probability:
                self._mutate_original_features(individual)
            if len(individual[0]) == 0 and len(individual[1]) == 0:
                # Never leave an individual without any feature.
                self._add_original_feature(individual)
            new_population.append(individual)
        return new_population

    # ------------------------------------------------------------------
    # Genetic operators
    # ------------------------------------------------------------------
    def generate_population(self):
        """Create ``self.individuals`` random individuals."""
        size = int(self.size)
        population = []
        for _ in range(self.individuals):
            individual = ([], [], set())
            if self.flexible_logic:
                n_chromosomes = range(random.randint(1, size))
            else:
                n_chromosomes = range(size)
            for _ in n_chromosomes:
                operand1_feature = random.randint(0, self.n_features - 1)
                operand2_feature = random.randint(0, self.n_features - 1)
                if operand1_feature == operand2_feature:
                    # Two values of the same feature are always OR-ed.
                    op = 'OR'
                    operand1_value = random.randint(
                        0, self.unique_values[operand1_feature] - 1)
                    operand2_value = random.randint(
                        0, self.unique_values[operand1_feature] - 1)
                else:
                    op = random.choice(self._OPERATORS)
                    operand1_value = random.randint(
                        0, self.unique_values[operand1_feature] - 1)
                    operand2_value = random.randint(
                        0, self.unique_values[operand2_feature] - 1)
                operands = [(operand1_feature, operand1_value),
                            (operand2_feature, operand2_value)]
                individual[1].append(
                    create_feature(operator=op, operands=operands))
            n_og_features = random.randint(0, self.n_features - 1)
            features = list(range(self.n_features))
            for f in random.sample(features, n_og_features):
                individual[0].append(DummyFeatureConstructor(feature_index=f))
                individual[2].add(f)
            population.append(individual)
        return population

    def mutate_complex(self, population, **kwargs):
        """Mutate; a modified feature is re-drawn completely."""
        return self._mutate(population, self._randomize_feature)

    def mutate_simple(self, population, **kwargs):
        """Mutate; a modified feature changes a single component."""
        return self._mutate(population, self._tweak_feature)

    def elitism(self, population1, population2):
        """Replace the worst of ``population2`` by the best of ``population1``.

        ``population2`` is modified in place and returned.
        """
        maximum = max(population1, key=lambda x: x[1])
        minimum_index = min(enumerate(population2),
                            key=lambda x: x[1][1])[0]
        population2[minimum_index] = maximum
        return population2

    def truncation(self, population1, population2):
        """Keep the ``len(population1)`` fittest of both populations."""
        return sorted(population1 + population2,
                      reverse=True,
                      key=lambda x: x[1])[:len(population1)]

    def select_population(self, population):
        """Fitness-proportionate selection of ``len(population)`` copies."""
        selected_individuals = []
        num_selected = len(population)
        totalFitness = sum(fitness for _, fitness in population)
        for _ in range(num_selected):
            if totalFitness == 0:
                # No fitness mass to share out: pick uniformly instead of
                # dividing by zero.
                selected_individuals.append(
                    self.copy_individual(random.choice(population)[0]))
                continue
            cumulative_prob = 0.0
            r = random.random()
            for individual_with_fitness in population:
                cumulative_prob += individual_with_fitness[1] / totalFitness
                if r <= cumulative_prob:
                    selected_individuals.append(
                        self.copy_individual(individual_with_fitness[0]))
                    break
            else:
                # Rounding left the cumulative sum just below r; keep the
                # population size constant.
                selected_individuals.append(
                    self.copy_individual(population[-1][0]))
        return selected_individuals

    def select_population_rank(self, population):
        """Rank-proportionate selection of ``len(population)`` copies.

        ``population`` is sorted in place by descending fitness.
        """
        selected_individuals = []
        num_selected = len(population)
        totalRank = (num_selected * (num_selected + 1)) / 2
        population.sort(reverse=True, key=lambda x: x[1])
        for _ in range(num_selected):
            cumulative_prob = 0.0
            r = random.random()
            for i, individual_with_fitness in enumerate(population, start=1):
                cumulative_prob += (num_selected - i + 1) / totalRank
                if r <= cumulative_prob:
                    selected_individuals.append(
                        self.copy_individual(individual_with_fitness[0]))
                    break
            else:
                # Rounding guard, see select_population.
                selected_individuals.append(
                    self.copy_individual(population[-1][0]))
        return selected_individuals

    def copy_individual(self, individual):
        """Copy an individual by calling ``copy()`` on each of its parts."""
        return ([chrms.copy() for chrms in individual[0]],
                [chrms.copy()
                 for chrms in individual[1]], individual[2].copy())

    # ------------------------------------------------------------------
    # Estimator API
    # ------------------------------------------------------------------
    def fit(self, X, y):
        """Run the genetic search and fit the final classifier.

        Sets ``n_features``, ``unique_values``, ``size``, ``best_features``
        and ``classifier_``. Returns ``self``.
        """
        self.feature_encoder_ = CustomOrdinalFeatureEncoder()
        self.class_encoder_ = CustomLabelEncoder()
        self.categories_ = None
        if isinstance(X, pd.DataFrame):
            self.categories_ = X.columns
        if self.encode_data:
            X = self.feature_encoder_.fit_transform(X)
            y = self.class_encoder_.fit_transform(y)
        self.n_features = X.shape[1]
        if self.encode_data:
            self.unique_values = [
                values.shape[0]
                for values in self.feature_encoder_.categories_
            ]
        else:
            self.unique_values = [
                np.unique(X[:, j]).shape[0] for j in range(X.shape[1])
            ]
        random.seed(self.seed)
        np.random.seed(self.seed)
        # np.ceil returns a float; range() and random.randint need an int.
        self.size = int(np.ceil(np.sqrt(X.shape[1])))
        best_individual = self.execute_algorithm(X, y)
        self.best_features = best_individual
        self.classifier_ = NaiveBayes(encode_data=False, metric=self.metric)
        self.classifier_.fit(
            np.concatenate(
                [feature.transform(X) for feature in self.best_features],
                axis=1), y)
        return self

    def execute_algorithm(self, X, y):
        """Evolve the population and return the best individual's features.

        With ``mixed`` the heuristic evaluator is used until the generation
        index exceeds ``generations * mixed_percentage``; then the search
        switches to the wrapper evaluator and re-scores the population.

        Returns the list ``original + constructed`` of the fittest
        individual of the last population.
        """
        if self.mixed:
            self.evaluate = self.evaluate_heuristic
        else:
            self.evaluate = self.evaluate_wrapper
        population = self.generate_population()
        population_with_fitness = self.fitness(population, X, y)
        iterator = tqdm(range(self.generations),
                        leave=False) if self.verbose else range(
                            self.generations)
        for generation in iterator:
            if self.mixed and generation > int(
                    self.generations * self.mixed_percentage
            ) and self.evaluate == self.evaluate_heuristic:
                self.evaluate = self.evaluate_wrapper
                # Reevaluate for fair combination
                population_with_fitness = self.fitness([
                    individual_with_fitness[0]
                    for individual_with_fitness in population_with_fitness
                ], X, y)
            selected_individuals = self.selection(population_with_fitness)
            # No crossover step: selection feeds mutation directly.
            crossed_individuals = selected_individuals
            mutated_individuals = self.mutation(crossed_individuals,
                                                X=X,
                                                y=y)
            new_population = self.fitness(mutated_individuals, X, y)
            population_with_fitness = self.combine(population_with_fitness,
                                                   new_population)
            # Obtaining population's statistics
            if self.verbose:
                best, mean = get_max_mean(population_with_fitness)
                iterator.set_postfix({
                    "Generation": generation,
                    "hit_count": self.evaluate.hit_count,
                    "populationLength": len(population_with_fitness),
                    "best fitness": best,
                    "mean fitness": mean
                })
        best_individual = max(population_with_fitness, key=lambda x: x[1])[0]
        return best_individual[0] + best_individual[1]
class RankerLogicalFeatureConstructor(TransformerMixin, ClassifierMixin,
                                      BaseEstimator):
    """First proposal: Hybrid-Ranker Wrapper.

    Build a ranking based on Symmetrical Uncertainty (SU) of every possible
    logical feature of depth 1 (1 operator, 2 operands), using the XOR, AND
    and OR operators.

    The steps are:
        - Find out the combinations of values in the database for every pair
          of features Xi, Xj:
            - Example:
                - Xi = [1,2,3,2]
                - Xj = ['a','b','c','a']
                Possible combinations:
                    [(1,'a'),(2,'b'),(3,'c'),(2,'a')]
        - Apply every operator to every combination:
            - Example:
                - Xi = [1,2,3,2]
                - Xj = ['a','b','c','a']
                Possible combinations:
                    [(1,'a','AND'),(2,'b','AND'),(3,'c','AND'),(2,'a','AND'),
                     (1,'a','OR'),(2,'b','OR'),(3,'c','OR'),(2,'a','OR'),
                     (1,'a','XOR'),(2,'b','XOR'),(3,'c','XOR'),(2,'a','XOR')]
        - Add the original variables to the list
        - Evaluate SU for every element of the list, and rank them
        - Go over the list following one of the two proposed strategies and
          evaluate the subset with a leave-one-out cross-validation of the
          NaiveBayes classifier.

    Parameters
    ----------
    strategy : str {eager,skip}
        With "eager" the wrapper stops once the number of consecutive
        non-improving iterations exceeds ``max_err``; with "skip" it keeps
        walking the ranking.

    block_size : int, default=10
        Number of features that are added in each iteration (values below 1
        are replaced by 1).

    encode_data : boolean
        Whether or not to encode the received data. If set to false the
        classifier expects data to be encoded with an ordinal encoder.

    n_intervals : int, default=5
        Forwarded to the feature encoder and to the NaiveBayes classifier.

    verbose : {boolean,int}
        If set to true it displays a progress bar and information about the
        search.

    operators : array-like, default = ("AND","OR","XOR")
        Operators used for the constructed features.

    max_features : int, default = inf
        Maximum number of features to include in the selected subset

    max_iterations : int, default = inf
        Maximum number of iterations in the wrapper step.

    metric : str, default="accuracy"
        Forwarded to the NaiveBayes classifier.

    use_initials : bool, default = False
        Force the set of initial features in the final solution. The set is
        trimmed with a backward elimination beforehand.

    max_err : int, default=0
        Number of consecutive non-improving iterations tolerated by the
        "eager" strategy.

    prune : int or None, default=None
        When not None (and ``use_graph`` is false) only the ``prune``
        best-ranked feature pairs, according to the two-variable SU, are used
        to construct features.

    use_graph : bool, default = False
        Generate the ranking from the features obtained from the pruned graph
        of the ACO algorithm. (Experimentation not carried out)

    Attributes
    ----------
    feature_encoder_ : CustomOrdinalFeatureEncoder or None
        Encodes data in ordinal way with unseen values handling if
        encode_data is set to True.

    class_encoder_ : LabelEncoder or None
        Encodes data in ordinal way for the class if encode_data is set to
        True.

    all_feature_constructors : array-like
        List of FeatureConstructor objects with all the candidate features

    symmetrical_uncertainty_rank : array-like
        SU for every feature in all_feature_constructors

    rank : array-like
        Array of indexes corresponding to the sorted SU rank (in descending
        order).

    initial_backward_features : list
        Result of the backward search over the original features (only set
        when use_initials is True).

    final_feature_constructors :
        Selected feature subset (list of constructors)

    classifier : NaiveBayes
        Classifier used in the wrapper and to perform predictions after
        fitting.
    """

    def __init__(self,
                 strategy="eager",
                 block_size=10,
                 encode_data=True,
                 n_intervals=5,
                 verbose=0,
                 operators=("AND", "OR", "XOR"),
                 max_features=float("inf"),
                 max_iterations=float("inf"),
                 metric="accuracy",
                 use_initials=False,
                 max_err=0,
                 prune=None,
                 use_graph=False):
        self.strategy = strategy
        self.block_size = max(block_size, 1)
        self.encode_data = encode_data
        self.verbose = verbose
        self.operators = operators
        self.max_features = max_features
        self.max_iterations = max_iterations
        self.n_intervals = n_intervals
        self.metric = metric
        self.max_err = max_err
        self.use_initials = use_initials
        self.prune = prune
        self.use_graph = use_graph

        allowed_strategies = ("eager", "skip")
        if self.strategy not in allowed_strategies:
            raise ValueError("Unknown strategy type: %s, expected one of %s." %
                             (self.strategy, allowed_strategies))

    def fit(self, X, y):
        """Build the SU ranking of candidate features and run the wrapper.

        Returns
        -------
        self
        """
        # Parse input
        if isinstance(y, pd.DataFrame):
            y = y.to_numpy()
        if self.encode_data:
            self.feature_encoder_ = CustomOrdinalFeatureEncoder(
                n_intervals=self.n_intervals)
            self.class_encoder_ = CustomLabelEncoder()
            X = self.feature_encoder_.fit_transform(X)
            y = self.class_encoder_.fit_transform(y)
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        check_X_y(X, y)

        # Reset the stored results for new fit
        self.reset_evaluation()

        n_features = X.shape[1]

        # Generate the (still unsorted) list of candidate features
        if self.use_graph:
            # Construct the minimum graph and create rank
            graph = AntFeatureGraphMI(seed=None, connections=1).compute_graph(
                X, y, ("AND", "OR", "XOR"))
            self.all_feature_constructors = graph.get_rank()
        elif self.prune is not None:
            # Construct the rank with pruning by selecting the pairs that
            # maximise SU(X_iX_j,Y)
            from tfg.feature_construction import create_feature

            feature_combinations = list(combinations(
                range(n_features), 2)) + [(i, i) for i in range(n_features)]
            rank_pairs = [
                symmetrical_uncertainty_two_variables(X[:, i], X[:, j], y)
                for i, j in feature_combinations
            ]
            rank_pairs_index = np.argsort(rank_pairs)[::-1]

            self.all_feature_constructors = []
            for index in rank_pairs_index[:self.prune]:
                i, j = feature_combinations[index]
                if i == j:
                    # A feature paired with itself: OR every pair of its values
                    self.all_feature_constructors.extend([
                        create_feature("OR", [(i, n), (i, m)])
                        for n, m in combinations(np.unique(X[:, i]), 2)
                    ])
                else:
                    self.all_feature_constructors.extend(
                        construct_features(X[:, [i, j]],
                                           operators=self.operators,
                                           same_feature=False))
        else:
            # Create the unsorted list of all features
            self.all_feature_constructors = construct_features(
                X, operators=self.operators)

        if self.verbose:
            print(
                f"Total number of constructed features: {len(self.all_feature_constructors)}"
            )

        # The original variables always take part in the ranking
        self.all_feature_constructors.extend(
            [DummyFeatureConstructor(j) for j in range(n_features)])

        # SU of every candidate against the class
        self.symmetrical_uncertainty_rank = [
            symmetrical_uncertainty(
                f1=feature_constructor.transform(X).flatten(), f2=y)
            for feature_constructor in self.all_feature_constructors
        ]

        # Store the descending order index
        self.rank = np.argsort(self.symmetrical_uncertainty_rank)[::-1]

        # If the initial variables are forced, trim them first
        if self.use_initials:
            classifier = NaiveBayes(encode_data=False,
                                    n_intervals=self.n_intervals,
                                    metric=self.metric)
            classifier.fit(X, y)
            current_features = [
                DummyFeatureConstructor(j) for j in range(n_features)
            ]
            # Store the backward result to reuse it for other executions
            self.initial_backward_features = backward_search(
                X, y, current_features, classifier)

        # Feature Subset Selection (FSS) from the rank
        self.filter_features(X, y)
        return self

    def predict(self, X):
        """Predict with the wrapper classifier on the selected features."""
        X, _ = self.transform(X)
        if self.encode_data:
            return self.class_encoder_.inverse_transform(
                self.classifier.predict(X))
        return self.classifier.predict(X)

    def reset_evaluation(self):
        """Reset the memoized leave-one-out evaluations."""
        self.evaluate_leave_one_out_cross_val = memoize(evaluate_leave_one_out)

    def predict_proba(self, X):
        """Return the classifier's ``predict_proba`` on the selected features."""
        X, _ = self.transform(X)
        return self.classifier.predict_proba(X)

    def score(self, X, y):
        """Return the classifier's ``score`` on the selected features."""
        X, y = self.transform(X, y)
        return self.classifier.score(X, y)

    def filter_features(self, X, y):
        """After the rank is built this performs the greedy wrapper search.

        Walks ``self.rank`` in blocks of ``block_size`` candidates, keeps a
        block only when it improves the leave-one-out score, and stores the
        result in ``self.final_feature_constructors``.

        Returns
        -------
        self
        """
        # Explicit attributes: with encode_data=False no attribute ending in
        # "_" exists, so the attribute-less check cannot be relied upon.
        check_is_fitted(self, ["all_feature_constructors", "rank"])
        self.classifier = NaiveBayes(encode_data=False,
                                     n_intervals=self.n_intervals,
                                     metric=self.metric)
        current_score = -np.inf
        current_features = []
        current_data = None
        if self.use_initials:
            # Original Features have already been taken into account
            rank_iter = filter(
                lambda x: not isinstance(self.all_feature_constructors[x],
                                         DummyFeatureConstructor),
                iter(self.rank))

            # Deep copy to avoid issues when modifying the list
            current_features = deepcopy(self.initial_backward_features)
            current_data = np.concatenate(
                [f.transform(X) for f in current_features], axis=1)

            # Get initial LOO score
            current_score = self.evaluate_leave_one_out_cross_val(
                self.classifier, current_features, current_data, y, fit=True)
        else:
            # Iterator over the sorted list of indexes
            rank_iter = iter(self.rank)

        if self.verbose:
            progress_bar = tqdm(total=len(self.rank),
                                bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}')

        iteration = 0
        iterations_without_improvements = 0
        # Loop for including {block size} elements at a time
        # Rank is an iterator, so the for loop is not sequential!
        for feature_constructor_index in rank_iter:
            iteration += 1
            if self.verbose:
                progress_bar.set_postfix({
                    "n_features": len(current_features),
                    "score": current_score
                })
                progress_bar.update(1)
                progress_bar.refresh()

            # Add block size features
            new_X = [
                self.all_feature_constructors[feature_constructor_index].
                transform(X)
            ]
            selected_features = [
                self.all_feature_constructors[feature_constructor_index]
            ]
            for _ in range(self.block_size - 1):
                try:
                    index = next(rank_iter)
                except StopIteration:
                    # Block size does not divide the number of elements in
                    # the rank. The block is left incomplete.
                    break
                selected_features.append(self.all_feature_constructors[index])
                new_X.append(self.all_feature_constructors[index].transform(X))
                if self.verbose:
                    progress_bar.update(1)
                    progress_bar.refresh()

            # Evaluate features
            new_X = np.concatenate(new_X, axis=1)
            if iteration == 1 and not self.use_initials:
                # The first block is always accepted and fits the classifier
                current_data = new_X
                current_score = self.evaluate_leave_one_out_cross_val(
                    self.classifier,
                    selected_features,
                    current_data,
                    y,
                    fit=True)
                current_features = selected_features
                if self.max_iterations <= iteration or (
                        len(current_features) +
                        self.block_size) > self.max_features:
                    break
                continue

            data = np.concatenate([current_data, new_X], axis=1)
            self.classifier.add_features(new_X, y)
            # LOO evaluation
            score = self.evaluate_leave_one_out_cross_val(
                self.classifier,
                current_features + selected_features,
                data,
                y,
                fit=False)
            if score > current_score:
                current_score = score
                current_data = data
                current_features.extend(selected_features)
                iterations_without_improvements = 0
            else:
                iterations_without_improvements += 1
                # Remove last added block (last column first)
                for feature_index_to_remove in range(
                        data.shape[1], data.shape[1] - new_X.shape[1], -1):
                    self.classifier.remove_feature(feature_index_to_remove - 1)
                if (self.strategy == "eager"
                        and self.max_err < iterations_without_improvements):
                    # Stops as soon as there is no improvement
                    break

            if self.max_iterations <= iteration or (
                    len(current_features) +
                    self.block_size) > self.max_features:
                break

        if self.verbose:
            progress_bar.close()
            print(
                f"\nFinal number of included features: {len(current_features)} - Final Score: {current_score}"
            )
        self.final_feature_constructors = current_features
        return self

    def transform(self, X, y=None):
        """Encode the input (if configured) and build the selected features.

        Returns
        -------
        tuple
            ``(new_X, y)`` where ``new_X`` concatenates, column-wise, the
            output of every selected feature constructor and ``y`` is the
            (optionally encoded) class, or None when it was not given.
        """
        check_is_fitted(self, "final_feature_constructors")
        if isinstance(y, pd.DataFrame):
            y = y.to_numpy()
        if self.encode_data:
            X = self.feature_encoder_.transform(X)
            if y is not None:
                y = self.class_encoder_.transform(y)
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        new_X = [
            feature_constructor.transform(X)
            for feature_constructor in self.final_feature_constructors
        ]
        return np.concatenate(new_X, axis=1), y
def simple_evaluate(self, individual, X, y):
    """Return the leave-one-out score of ``individual`` on a fresh NaiveBayes.

    The first two components of ``individual`` are joined with ``+`` and
    handed to ``transform_features`` together with ``X``; the result is
    evaluated with ``leave_one_out_cross_val(..., fit=True)``.
    """
    model = NaiveBayes(encode_data=False, metric=self.metric)
    constructors = individual[0] + individual[1]
    transformed = transform_features(constructors, X)
    return model.leave_one_out_cross_val(transformed, y, fit=True)
def explore(self, X, y, graph, random_generator, parallel, max_errors=0):
    '''
    Search method that follows the following steps:
        1. The initial node is connected to all the others (roulette wheel selection is performed)
        2. There are 2 types of nodes (corresponding to an original feature (2.1) or corresponding to a value of a feature (2.2)):
            2.1. If the selected node is an original feature we add it to the selected subset and go to step 3.
            2.2. If the selected node is part of a logical feature then we select another node (the CONSTRUCTION step will not return full original features)
        3. Compute the score
            3.1. If it improves the previous one
                3.1.1 Add the feature to the current subset
                3.1.2 Update the score
                3.1.3 Select another node (SELECTION step)
                3.1.4 Go to step 2
            3.2. If not, the exploration ends

    Note: Threading does not speed up the calculations as they are CPU bound and in python only I/O operations will benefit from this parallelism
    GPU improvement would reduce the time of the exploration.

    Parameters
    ----------
    X : 2-D array indexed as ``X[:, j]``
    y : class values handed to the classifier and the heuristics
    graph : object providing ``get_original_ids``, ``get_initial_nodes`` and
        ``get_neighbours``
    random_generator : forwarded to ``self.choose_next``
    parallel : if true, the CONSTRUCTION heuristics are computed through a
        ``ThreadPoolExecutor``
    max_errors : number of non-improving evaluations tolerated before the
        exploration stops

    Returns
    -------
    The leave-one-out score of the final subset (also stored in
    ``self.final_score``); the subset itself is left in
    ``self.current_features``.
    '''
    # The subset is only evaluated every ``self.step`` added features
    self.step = math.ceil(math.log2(X.shape[1]))
    self.current_features = []
    selected_nodes = set()
    constructed_nodes = set()
    classifier = NaiveBayes(encode_data=False, metric=self.metric)
    # -np.inf instead of np.NINF, which NumPy 2.0 removed
    current_score = -np.inf
    score = 0
    if self.use_initials:
        # Start from every original feature
        self.current_features = [
            DummyFeatureConstructor(j) for j in range(X.shape[1])
        ]
        classifier.fit(X, y)
        current_transformed_features_numpy = np.concatenate(
            [f.transform(X) for f in self.current_features], axis=1)
        score = self.evaluate_loo(self.current_features, classifier,
                                  current_transformed_features_numpy, y)
        current_score = score
        selected_nodes.update(graph.get_original_ids())
    if len(self.current_features) == 0:
        current_transformed_features_numpy = None

    initial, pheromones, heuristics = graph.get_initial_nodes(selected_nodes)
    probabilities = self.compute_probability(pheromones, heuristics)
    index = self.choose_next(probabilities, random_generator)
    node_id, selected_node = initial[index]

    # SU variable contains the MIFS-SU for the selected variable
    current_su = 0
    su = heuristics[index]
    is_fitted = self.use_initials
    feature_constructor = None
    n_errors = 0
    number_steps = 1
    while True:
        current_score = score
        if selected_node[1] is None:
            # Original Feature
            feature_constructor = DummyFeatureConstructor(selected_node[0])
            selected_nodes.add(node_id)
        else:
            # Need to construct next feature and compute heuristic value for
            # the feature to replace temporal su from half-var
            neighbours, pheromones = graph.get_neighbours(
                selected_node, constructed_nodes, step="CONSTRUCTION")
            if len(neighbours) == 0:
                break
            if self.beta != 0:
                if parallel:
                    with concurrent.futures.ThreadPoolExecutor(
                    ) as executor:
                        futures = []
                        for neighbour in neighbours:
                            futures.append(
                                executor.submit(
                                    self.compute_neighbour_sufs,
                                    neighbour=neighbour,
                                    transformed_features=
                                    current_transformed_features_numpy,
                                    constructors=self.current_features,
                                    selected_node=selected_node,
                                    current_su=current_su,
                                    X=X,
                                    y=y))
                        concurrent.futures.wait(
                            futures,
                            timeout=None,
                            return_when='ALL_COMPLETED')
                        su = [future.result() for future in futures]
                else:
                    su = [
                        self.compute_neighbour_sufs(
                            neighbour=neighbour,
                            transformed_features=
                            current_transformed_features_numpy,
                            selected_node=selected_node,
                            constructors=self.current_features,
                            current_su=current_su,
                            X=X,
                            y=y) for neighbour in neighbours
                    ]
            else:
                # Avoid unnecessary evaluation
                su = np.ones(len(neighbours))

            probabilities = self.compute_probability(
                pheromones, np.array(su))
            index = self.choose_next(probabilities, random_generator)

            su = su[index]
            feature_constructor = create_feature(
                neighbours[index][2],
                [selected_node, neighbours[index][1]])
            constructed_nodes.add(
                frozenset(
                    (node_id, neighbours[index][0], neighbours[index][2])))
            node_id, selected_node = neighbours[index][:2]

        # Assess new feature
        transformed_feature = feature_constructor.transform(X)
        if is_fitted:
            classifier.add_features(transformed_feature, y)
        else:
            classifier.fit(transformed_feature, y)
            is_fitted = True
        if current_transformed_features_numpy is None:
            current_transformed_features_numpy = transformed_feature
        else:
            current_transformed_features_numpy = append_column_to_numpy(
                current_transformed_features_numpy, transformed_feature)
        if number_steps >= self.step:
            score = self.evaluate_loo(
                self.current_features + [feature_constructor], classifier,
                current_transformed_features_numpy, y)
            if score <= current_score:
                if n_errors >= max_errors:
                    break
                else:
                    n_errors += 1
            else:
                n_errors = 0
            # NOTE(review): the step counter is reset after every evaluation;
            # confirm against the original layout.
            number_steps = 0
        else:
            number_steps += 1
        current_su = su
        self.current_features.append(feature_constructor)
        current_score = score

        # Select next
        neighbours, pheromones = graph.get_neighbours(selected_node,
                                                      selected_nodes,
                                                      step="SELECTION")

        # Compute heuristic
        su = []
        if len(neighbours) == 0:
            break
        if self.beta != 0:
            for neighbour, pheromone in zip(neighbours, pheromones):
                if neighbour[1][1] is None:
                    # Original variable
                    su.append(
                        self.compute_sufs_cached(
                            current_su,
                            current_transformed_features_numpy,
                            X[:, neighbour[1][0]],
                            self.current_features,
                            DummyFeatureConstructor(neighbour[1][0]),
                            y,
                            minimum=0))
                else:
                    # This is a temporal variable that will not be finally
                    # selected but only used to calculate the heuristic
                    su.append(
                        self.compute_sufs_cached(
                            current_su,
                            current_transformed_features_numpy,
                            X[:, neighbour[1][0]] == neighbour[1][1],
                            self.current_features,
                            FeatureOperand(feature_index=neighbour[1][0],
                                           value=neighbour[1][1]),
                            y,
                            minimum=0))
        else:
            su = np.ones(len(neighbours))
        probabilities = self.compute_probability(pheromones, np.array(su))
        index = self.choose_next(probabilities, random_generator)

        su = su[index]
        node_id, selected_node = neighbours[index][:2]

    # A rejected feature leaves one extra trailing column in the matrix
    if current_transformed_features_numpy.shape[1] > len(
            self.current_features):
        current_transformed_features_numpy = np.delete(
            current_transformed_features_numpy, -1, axis=1)
    self.final_score = self.evaluate_loo(
        self.current_features, classifier,
        current_transformed_features_numpy, y)
    return self.final_score