def acfs_score_comparison(datasets,
                          seed,
                          base_path,
                          params,
                          n_splits=3,
                          n_repeats=5,
                          n_intervals=5,
                          metric="accuracy",
                          send_email=False,
                          email_data=dict(),
                          verbose=True):
    # List to store results and column names for the csv
    result = []
    columns = [
        "Database", "Number of attributes", "NBScore", "NBScore STD",
        "ACFCS Score", "ACFCS Score STD", "Configuration", "Nodes",
        "Construction Matrix", "Selection Matrix", "Selected_attributes",
        "Original"
    ]
    dataset_tqdm = tqdm(datasets)

    # Instantiate the classifiers
    acfcs = ACFCS(verbose=0, metric=metric)
    nb = NaiveBayes(encode_data=False, n_intervals=n_intervals, metric=metric)

    # Execute algorithm on datasets
    for database in dataset_tqdm:
        name, label = database
        if not os.path.exists(base_path + name):
            print(f"{name} doesn't exist")
            continue
        # Assume UCI-repo-like data files
        test = f"{name}.test.csv"
        data = f"{name}.data.csv"
        X, y = get_X_y_from_database(base_path, name, data, test, label)

        # Update progress bar
        dataset_tqdm.set_postfix({"DATABASE": name})

        # Set up data structures to store results
        nb_score = np.zeros(shape=(len(params), n_splits * n_repeats))
        acfcs_score = np.zeros(shape=(len(params), n_splits * n_repeats))
        acfcs_selection_matrix = np.zeros(shape=(len(params), n_splits * n_repeats))
        acfcs_construction_matrix = np.zeros(shape=(len(params), n_splits * n_repeats))
        acfcs_nodes = np.zeros(shape=(len(params), n_splits * n_repeats))
        acfcs_dummy = np.zeros(shape=(len(params), n_splits * n_repeats))
        acfcs_selected = np.zeros(shape=(len(params), n_splits * n_repeats))

        # Create splits for the experiments
        rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=seed)
        seed_tqdm = tqdm(rskf.split(X, y),
                         leave=False,
                         total=n_splits * n_repeats,
                         bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}') if verbose else rskf.split(X, y)

        # Execute experiments
        for i, data in enumerate(seed_tqdm):
            train_index, test_index = data
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Encode the data
            c = CustomOrdinalFeatureEncoder(n_intervals=n_intervals)
            X_train = c.fit_transform(X_train)
            X_test = c.transform(X_test)
            l = CustomLabelEncoder()
            y_train = l.fit_transform(y_train)
            y_test = l.transform(y_test)

            # Assess the classifiers, reusing cached info to speed up evaluation
            nb.fit(X_train, y_train)
            naive_bayes_score = nb.score(X_test, y_test)
            acfcs.reset_cache()
            for conf_index, conf in enumerate(params):
                acfcs.set_params(**conf)
                acfcs.fit(X_train, y_train, init_graph=(conf_index == 0))

                # Score
                acfcs_score_conf = acfcs.score(X_test, y_test)
                if verbose:
                    seed_tqdm.set_postfix({
                        "config": conf_index,
                        "nb_score": naive_bayes_score,
                        "ant_score": acfcs_score_conf
                    })

                # Get data
                n_original_features = len(
                    list(
                        filter(lambda x: isinstance(x, DummyFeatureConstructor),
                               acfcs.best_features)))
                n_selected = len(acfcs.best_features)
                selection_matrix = len(acfcs.afg.pheromone_selection)
                construction_matrix = len(acfcs.afg.pheromone_construction)
                nodes = len(acfcs.afg.nodes)

                # Update
                nb_score[conf_index, i] = naive_bayes_score
                acfcs_score[conf_index, i] = acfcs_score_conf
                acfcs_selection_matrix[conf_index, i] = selection_matrix
                acfcs_construction_matrix[conf_index, i] = construction_matrix
                acfcs_nodes[conf_index, i] = nodes
                acfcs_dummy[conf_index, i] = n_original_features
                acfcs_selected[conf_index, i] = n_selected

        # Insert the final result - averaged metrics for this database
        for conf_index, conf in enumerate(params):
            row = [
                name,
                X.shape[1],
                np.mean(nb_score[conf_index]),
                np.std(nb_score[conf_index]),
                np.mean(acfcs_score[conf_index]),
                np.std(acfcs_score[conf_index]),
                conf,
                np.mean(acfcs_nodes[conf_index]),
                np.mean(acfcs_construction_matrix[conf_index]),
                np.mean(acfcs_selection_matrix[conf_index]),
                np.mean(acfcs_selected[conf_index]),
                np.mean(acfcs_dummy[conf_index])
            ]
            result.append(row)

    result = pd.DataFrame(result, columns=columns)
    if send_email:
        from tfg.utils import send_results
        send_results("ACFCS", email_data, result)
    return result
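
# Illustrative usage sketch (assumption: not part of the original experiment
# scripts). It shows how acfs_score_comparison is meant to be called; the
# dataset name, label column, base path, split counts and the single empty
# configuration are hypothetical placeholders, and a real run needs
# "<name>.data.csv"/"<name>.test.csv" files under base_path.
def _example_acfcs_comparison():
    example_datasets = [("iris", "class")]  # hypothetical (dataset, label) pair
    example_params = [{}]                   # one run with default ACFCS parameters
    return acfs_score_comparison(datasets=example_datasets,
                                 seed=42,
                                 base_path="data/",
                                 params=example_params,
                                 n_splits=2,
                                 n_repeats=1,
                                 verbose=False)
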
def genetic_score_comparison(datasets,
                             seed,
                             base_path,
                             params,
                             n_splits=3,
                             n_repeats=5,
                             n_intervals=5,
                             metric="accuracy",
                             send_email=False,
                             email_data=dict(),
                             verbose=True,
                             version=1):
    result = []
    columns = [
        "Database", "Number of attributes", "NBScore", "NBScore STD",
        "Genetic Score", "Genetic Score STD", "Configuration",
        "Selected_attributes", "Original"
    ]
    dataset_tqdm = tqdm(datasets)

    # Instantiate the classifier
    if version == 1:
        # First version - no flexibility in the number of attributes (bad performance)
        # clf = GeneticProgramming(seed=seed, metric=metric)
        clf = GeneticProgrammingFlexibleLogic(seed=seed, metric=metric)
    elif version == 2:
        # Version with flexibility
        clf = GeneticProgrammingFlexibleLogic(seed=seed, metric=metric)
    else:
        # Guided mutation based on SU
        clf = GeneticProgrammingRankMutation(seed=seed, metric=metric)
    nb = NaiveBayes(encode_data=False, n_intervals=n_intervals, metric=metric)

    # Execute algorithm on datasets
    for database in dataset_tqdm:
        name, label = database
        if not os.path.exists(base_path + name):
            print(f"{name} doesn't exist")
            continue
        # Assume UCI-repo-like data files
        test = f"{name}.test.csv"
        data = f"{name}.data.csv"
        X, y = get_X_y_from_database(base_path, name, data, test, label)
        dataset_tqdm.set_postfix({"DATABASE": name})

        # Set up data structures to store results
        nb_score = np.zeros(shape=(len(params), n_splits * n_repeats))
        clf_score = np.zeros(shape=(len(params), n_splits * n_repeats))
        clf_selected = np.zeros(shape=(len(params), n_splits * n_repeats))
        clf_dummy = np.zeros(shape=(len(params), n_splits * n_repeats))

        # Create splits for the experiments
        rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=seed)
        seed_tqdm = tqdm(rskf.split(X, y),
                         leave=False,
                         total=n_splits * n_repeats,
                         bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}') if verbose else rskf.split(X, y)

        # Execute experiments
        for i, data in enumerate(seed_tqdm):
            train_index, test_index = data
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Encode the data
            c = CustomOrdinalFeatureEncoder(n_intervals=n_intervals)
            X_train = c.fit_transform(X_train)
            X_test = c.transform(X_test)
            l = CustomLabelEncoder()
            y_train = l.fit_transform(y_train)
            y_test = l.transform(y_test)

            # Assess the classifiers, reusing cached info to speed up evaluation
            nb.fit(X_train, y_train)
            naive_bayes_score = nb.score(X_test, y_test)

            # Reset evaluation cache for the new split
            clf.reset_evaluation()
            for conf_index, conf in enumerate(params):
                if verbose:
                    seed_tqdm.set_postfix({"config": conf_index})
                clf.set_params(**conf)
                clf.fit(X_train, y_train)

                # Score
                genetic_score = clf.score(X_test, y_test)

                # Get data
                n_original_features = len(
                    list(
                        filter(lambda x: isinstance(x, DummyFeatureConstructor),
                               clf.best_features)))
                n_selected = len(clf.best_features)

                # Update
                nb_score[conf_index, i] = naive_bayes_score
                clf_score[conf_index, i] = genetic_score
                clf_selected[conf_index, i] = n_selected
                clf_dummy[conf_index, i] = n_original_features

        # Insert the final result - averaged metrics for this database
        for conf_index, conf in enumerate(params):
            row = [
                name,
                X.shape[1],
                np.mean(nb_score[conf_index]),
                np.std(nb_score[conf_index]),
                np.mean(clf_score[conf_index]),
                np.std(clf_score[conf_index]),
                conf,
                np.mean(clf_selected[conf_index]),
                np.mean(clf_dummy[conf_index])
            ]
            result.append(row)

    result = pd.DataFrame(result, columns=columns)
    if send_email:
        from tfg.utils import send_results
        send_results(f"GENETIC_{version}", email_data, result)
    return result
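
# Illustrative usage sketch (assumption: not part of the original experiment
# scripts). It documents the `version` dispatch: versions 1 and 2 both build a
# GeneticProgrammingFlexibleLogic instance (the plain GeneticProgramming call
# is kept commented out above), while any other value selects
# GeneticProgrammingRankMutation. Dataset, path and configuration values are
# hypothetical placeholders.
def _example_genetic_comparison():
    example_datasets = [("iris", "class")]  # hypothetical (dataset, label) pair
    example_params = [{}]                   # one run with default parameters
    return genetic_score_comparison(datasets=example_datasets,
                                    seed=42,
                                    base_path="data/",
                                    params=example_params,
                                    n_splits=2,
                                    n_repeats=1,
                                    version=3,  # rank-mutation variant
                                    verbose=False)
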
def ranker_score_comparison(datasets,
                            seed,
                            base_path,
                            params,
                            n_splits=3,
                            n_repeats=5,
                            n_intervals=5,
                            metric="accuracy",
                            send_email=False,
                            email_data=dict(),
                            share_rank=True):
    result = []
    columns = [
        "Database", "Number of attributes", "NBScore", "NBScore STD",
        "Ranker Score", "Ranker Score STD", "Configuration", "Combinations",
        "Selected_attributes", "Original"
    ]
    dataset_tqdm = tqdm(datasets)

    # Instantiate the classifiers
    r = RankerLogicalFeatureConstructor(n_intervals=n_intervals, metric=metric)
    nb = NaiveBayes(encode_data=False, n_intervals=n_intervals, metric=metric)

    # Execute algorithm on datasets
    for database in dataset_tqdm:
        name, label = database
        if not os.path.exists(base_path + name):
            print(f"{name} doesn't exist")
            continue
        # Assume UCI-repo-like data files
        test = f"{name}.test.csv"
        data = f"{name}.data.csv"
        X, y = get_X_y_from_database(base_path, name, data, test, label)
        dataset_tqdm.set_postfix({"DATABASE": name})

        # Set up data structures to store results
        nb_score = np.zeros(shape=(len(params), n_splits * n_repeats))
        r_score = np.zeros(shape=(len(params), n_splits * n_repeats))
        r_combinations = np.zeros(shape=(len(params), n_splits * n_repeats))
        r_selected = np.zeros(shape=(len(params), n_splits * n_repeats))
        r_dummy = np.zeros(shape=(len(params), n_splits * n_repeats))
        r_total_constructed = np.zeros(shape=(len(params), n_splits * n_repeats))
        r_total_selected = np.zeros(shape=(len(params), n_splits * n_repeats))
        r_original_selected = np.zeros(shape=(len(params), n_splits * n_repeats))

        # Create splits for the experiments
        rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=seed)
        seed_tqdm = tqdm(rskf.split(X, y),
                         leave=False,
                         total=n_splits * n_repeats,
                         bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}')

        # Execute experiments
        for i, data in enumerate(seed_tqdm):
            train_index, test_index = data
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Encode the data
            c = CustomOrdinalFeatureEncoder(n_intervals=n_intervals)
            X_train = c.fit_transform(X_train)
            X_test = c.transform(X_test)
            l = CustomLabelEncoder()
            y_train = l.fit_transform(y_train)
            y_test = l.transform(y_test)

            # Assess the classifiers
            nb.fit(X=X_train, y=y_train)
            naive_bayes_score = nb.score(X_test, y_test)
            for conf_index, conf in enumerate(params):
                seed_tqdm.set_postfix({"config": conf_index})
                r.set_params(**conf)

                # Fit
                if conf_index == 0 or not share_rank:
                    # The rank is computed from scratch
                    r.fit(X_train, y_train)
                else:
                    # Reuse the shared rank and only re-run the filtering step
                    r.filter_features(r.feature_encoder_.transform(X_train),
                                      r.class_encoder_.transform(y_train))

                # Score
                ranker_score = r.score(X_test, y_test)

                # Get data
                n_original_features = len(
                    list(
                        filter(lambda x: isinstance(x, DummyFeatureConstructor),
                               r.final_feature_constructors)))
                n_combinations = len(r.all_feature_constructors)
                n_selected = len(r.final_feature_constructors)

                # Update
                nb_score[conf_index, i] = naive_bayes_score
                r_score[conf_index, i] = ranker_score
                r_combinations[conf_index, i] = n_combinations
                r_selected[conf_index, i] = n_selected
                r_dummy[conf_index, i] = n_original_features

        # Insert the final result - averaged metrics for this dataset
        for conf_index, conf in enumerate(params):
            row = [
                name,
                X.shape[1],
                np.mean(nb_score[conf_index]),
                np.std(nb_score[conf_index]),
                np.mean(r_score[conf_index]),
                np.std(r_score[conf_index]),
                conf,
                np.mean(r_combinations[conf_index]),
                np.mean(r_selected[conf_index]),
                np.mean(r_dummy[conf_index])
            ]
            result.append(row)

    result = pd.DataFrame(result, columns=columns)
    if send_email:
        from tfg.utils import send_results
        send_results("RANKER", email_data, result)
    return result
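
# Illustrative usage sketch (assumption: not part of the original experiment
# scripts). With share_rank=True the feature rank is computed once per split
# (conf_index == 0) and later configurations only re-run the filtering step;
# share_rank=False recomputes the rank for every configuration. The dataset,
# path and configurations below are hypothetical placeholders.
def _example_ranker_comparison(share_rank=True):
    example_datasets = [("iris", "class")]  # hypothetical (dataset, label) pair
    example_params = [{}, {}]               # two configurations, to exercise rank reuse
    return ranker_score_comparison(datasets=example_datasets,
                                   seed=42,
                                   base_path="data/",
                                   params=example_params,
                                   n_splits=2,
                                   n_repeats=1,
                                   share_rank=share_rank)
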
def scoring_comparison(base_path,
                       datasets,
                       verbose=1,
                       test_size=0.3,
                       seed=None,
                       n_iterations=30):
    column_names = [
        "dataset", "custom_training_score", "custom_test_score",
        "categorical_training_score", "categorical_test_score"
    ]
    data = []

    # Instantiate the classifiers and encoders
    clf_no_encoding = NaiveBayes(encode_data=True)
    clf_categorical_sklearn = CategoricalNB()
    datasets_iter = tqdm(datasets, bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}')
    c = CustomOrdinalFeatureEncoder()
    l = CustomLabelEncoder()

    for dataset in datasets_iter:
        dataset_name, label = dataset
        data_filename = f"{dataset_name}.data.csv"
        test_filename = f"{dataset_name}.test.csv"
        X, y = get_X_y_from_database(base_path=base_path,
                                     name=dataset_name,
                                     data=data_filename,
                                     test=test_filename,
                                     label=label)
        custom_train = []
        custom_test = []
        sklearn_train = []
        sklearn_test = []

        # Encode the full dataset once; the splits reuse the encoded arrays
        X = c.fit_transform(X)
        y = l.fit_transform(y)

        for iteration in range(n_iterations):
            if verbose:
                datasets_iter.set_postfix({"Dataset": dataset_name, "seed": iteration})
                datasets_iter.refresh()
            try:
                X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                                    test_size=test_size,
                                                                    random_state=seed + iteration,
                                                                    shuffle=True,
                                                                    stratify=y)
            except ValueError:
                # Not enough samples per class to stratify y
                X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                                    test_size=test_size,
                                                                    random_state=seed + iteration,
                                                                    shuffle=True)

            # Fit
            clf_no_encoding.fit(X_train, y_train)
            clf_categorical_sklearn.min_categories = [
                1 + np.max(np.concatenate([X_train[:, j], X_test[:, j]]))
                for j in range(X_train.shape[1])
            ]
            clf_categorical_sklearn.fit(X_train, y_train)

            # Score on training and test splits
            custom_train.append(clf_no_encoding.score(X_train, y_train))
            custom_test.append(clf_no_encoding.score(X_test, y_test))
            sklearn_train.append(clf_categorical_sklearn.score(X_train, y_train))
            sklearn_test.append(clf_categorical_sklearn.score(X_test, y_test))

        data.append([dataset_name,
                     np.mean(custom_train),
                     np.mean(custom_test),
                     np.mean(sklearn_train),
                     np.mean(sklearn_test)])
    return pd.DataFrame(data, columns=column_names)
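
# Minimal command-line sketch (assumption: added for illustration, not part of
# the original experiments). The dataset list and base path are hypothetical
# placeholders; a real run needs the corresponding "<name>.data.csv" and
# "<name>.test.csv" files under base_path. Note that seed is passed explicitly
# because it is added to the iteration index when seeding each split.
if __name__ == "__main__":
    demo_datasets = [("iris", "class")]  # hypothetical (dataset, label) pair
    comparison = scoring_comparison(base_path="data/",
                                    datasets=demo_datasets,
                                    seed=0,
                                    n_iterations=5)
    print(comparison)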