Ejemplo n.º 1
0
    def fit(self, X, y):
        """Run the search algorithm and fit the final classifier.

        The data is optionally encoded, ``execute_algorithm`` is run on it and
        the features it returns are stored in ``best_features``. A
        ``NaiveBayes`` classifier is then fitted on the column-wise
        concatenation of every selected feature's ``transform(X)`` output.

        Parameters
        ----------
        X : array-like or pandas.DataFrame
            Training samples. When a DataFrame is given its column labels are
            stored in ``categories_``.
        y : array-like
            Class labels.

        Returns
        -------
        self
            The fitted estimator.
        """
        self.feature_encoder_ = CustomOrdinalFeatureEncoder()
        self.class_encoder_ = CustomLabelEncoder()

        if isinstance(X, pd.DataFrame):
            self.categories_ = X.columns
        if self.encode_data:
            X = self.feature_encoder_.fit_transform(X)
            y = self.class_encoder_.fit_transform(y)

        self.n_features = X.shape[1]
        if self.encode_data:
            # The encoder already knows the categories of every feature.
            self.unique_values = [
                values.shape[0] for values in self.feature_encoder_.categories_
            ]
        else:
            # NOTE(review): positional slicing assumes X is a 2-D NumPy array
            # when encode_data is False - confirm callers never pass a
            # DataFrame in that mode.
            self.unique_values = [
                np.unique(X[:, j]).shape[0] for j in range(X.shape[1])
            ]

        # Seed both generators before the search runs.
        random.seed(self.seed)
        np.random.seed(self.seed)
        self.size = np.ceil(np.sqrt(X.shape[1]))

        self.best_features = self.execute_algorithm(X, y)

        # Final model: trained only on the output of the selected features.
        self.classifier_ = NaiveBayes(encode_data=False, metric=self.metric)
        transformed = np.concatenate(
            [feature.transform(X) for feature in self.best_features], axis=1)
        self.classifier_.fit(transformed, y)
        return self
Ejemplo n.º 2
0
def test_incremental_validation(X=None, y=None, iterations=10, verbose=1):
    """Time the built-in leave-one-out validation against the generic one.

    Three variants are timed on every iteration:

    * ``NaiveBayes.leave_one_out_cross_val`` on the raw data,
    * ``cross_leave_one_out`` with an encoding ``NaiveBayes``,
    * ``cross_leave_one_out`` with a non-encoding ``NaiveBayes`` on data
      encoded beforehand (the encoding is included in the timing).

    On the first iteration the three scores are asserted to be equal. The
    first timing of every series is excluded from the printed means.

    Parameters
    ----------
    X, y : array-like, optional
        Data to use. A synthetic problem is generated when ``X`` is None.
    iterations : int
        Number of timing repetitions.
    verbose : int
        When truthy, the iteration number is printed.
    """
    # `not X` is ambiguous for arrays with more than one element, so test
    # explicitly for the "no data supplied" case.
    if X is None:
        X, y = make_classification(n_samples=500,
                                   n_features=1000,
                                   n_informative=20,
                                   n_redundant=1,
                                   n_repeated=0,
                                   n_classes=2,
                                   n_clusters_per_class=2,
                                   weights=None,
                                   class_sep=1,
                                   hypercube=False,
                                   scale=1.0,
                                   shuffle=True,
                                   random_state=0)
    # Build a new array instead of dividing in place so that a caller-supplied
    # X is left untouched.
    X = X // 10  # --> To be able to evaluate categoricalNB

    # classifiers
    nb_classifier = NaiveBayes(encode_data=True)
    nb_classifier_no_encoding = NaiveBayes(encode_data=False)
    custom_encoder = CustomOrdinalFeatureEncoder()

    # accumulators
    # NOTE(review): categorical_nb is never filled in this function, so the
    # first summary line below always reports nan.
    categorical_nb = []
    custom_nb_val_1 = []
    custom_nb_val_3 = []
    custom_nb_val_4 = []
    for i in range(iterations):
        if verbose:
            print(f"Iteration {i}")

        ts = time()
        score_2 = nb_classifier.leave_one_out_cross_val(X, y)
        custom_nb_val_1.append(time() - ts)

        ts = time()
        score_4 = cross_leave_one_out(nb_classifier, X, y)
        custom_nb_val_3.append(time() - ts)

        ts = time()
        X2 = custom_encoder.fit_transform(X)
        score_5 = cross_leave_one_out(nb_classifier_no_encoding, X2, y)
        custom_nb_val_4.append(time() - ts)

        if i == 0:
            scores = [score_2, score_4, score_5]
            assert all(score == scores[0] for score in scores)
    print("Categorical with scikit loo: ", np.mean(categorical_nb[1:]))
    print("Custom with scikit loo: ", np.mean(custom_nb_val_3[1:]))
    print("Custom with scikit loo (pre-encoding): ",
          np.mean(custom_nb_val_4[1:]))
    print("Custom with first incremental: ", np.mean(custom_nb_val_1[1:]))
Ejemplo n.º 3
0
def test_remove_feature():
    """Check that removing a feature matches refitting without that feature.

    A model is fitted on all columns and column 0 is removed with
    ``remove_feature``. Its state and predicted probabilities must then be
    close to those of the same model refitted on the data without column 0.
    """
    dataset_settings = dict(n_samples=1000,
                            n_features=100,
                            n_informative=2,
                            n_redundant=0,
                            n_repeated=0,
                            n_classes=2,
                            n_clusters_per_class=1,
                            weights=None,
                            class_sep=1.0,
                            hypercube=True,
                            scale=2.0,
                            shuffle=True,
                            random_state=0)
    features, labels = make_classification(**dataset_settings)

    def without_first_column():
        # A fresh array on every call, exactly one column narrower.
        return np.delete(features, 0, axis=1)

    model = CustomNaiveBayes(encode_data=True)
    model.fit(features, labels)
    model.remove_feature(0)

    # State and predictions produced by the incremental removal.
    incremental_independent = model.indepent_term_
    incremental_log_counts = model.smoothed_log_counts_
    incremental_proba = model.predict_proba(without_first_column())

    # Reference: the same model trained from scratch on the reduced data.
    model.fit(without_first_column(), labels)
    refit_proba = model.predict_proba(without_first_column())

    assert np.allclose(model.smoothed_log_counts_, incremental_log_counts)
    assert np.allclose(model.indepent_term_, incremental_independent)
    assert np.allclose(refit_proba, incremental_proba)
Ejemplo n.º 4
0
def test_add_features_with_index():
    """Check that adding features at given positions matches a full fit.

    A model is fitted without four columns, which are then inserted back with
    ``add_features(..., index=...)``. Its state and predicted probabilities
    must be close to those of the same model refitted on the complete data.
    """
    dataset_settings = dict(n_samples=1000,
                            n_features=100,
                            n_informative=2,
                            n_redundant=0,
                            n_repeated=0,
                            n_classes=2,
                            n_clusters_per_class=1,
                            weights=None,
                            class_sep=1.0,
                            hypercube=True,
                            scale=2.0,
                            shuffle=True,
                            random_state=0)
    X, y = make_classification(**dataset_settings)

    full_copy = X.copy()
    positions = [0, 8, 9, 20]
    reduced = np.delete(full_copy, positions, axis=1)

    model = CustomNaiveBayes(encode_data=True)
    model.fit(reduced, y)
    model.add_features(full_copy[:, positions], y, index=positions)

    # State and predictions produced by the incremental insertion.
    incremental_independent = model.indepent_term_
    incremental_log_counts = model.smoothed_log_counts_
    incremental_proba = model.predict_proba(X)

    # Reference: the same model trained from scratch on every column.
    model.fit(X, y)
    refit_proba = model.predict_proba(X)

    assert np.allclose(model.indepent_term_, incremental_independent)
    assert np.allclose(model.smoothed_log_counts_, incremental_log_counts)
    assert np.allclose(refit_proba, incremental_proba)
Ejemplo n.º 5
0
def time_comparison(combinations=None, n_iterations=15, verbose=1, seed=200):
    """Benchmark several Naive Bayes implementations on synthetic data.

    For every ``(n_samples, n_features)`` pair a discretised classification
    problem is generated and four classifiers are passed to ``evaluate``
    ``n_iterations`` times, using the same data for training and testing. The
    collected measurements are handed to ``update_df`` and, after every pair,
    the accumulated table is written to ``backup.csv``.

    Parameters
    ----------
    combinations : sequence of (int, int), optional
        ``(n_samples, n_features)`` pairs. A default grid is built when None.
    n_iterations : int
        Number of evaluations of each classifier per pair.
    verbose : int
        When truthy, a progress bar is shown.
    seed : int
        ``random_state`` used to generate the data.

    Returns
    -------
    pandas.DataFrame
        One row per classifier and pair (empty when ``combinations`` is
        empty).
    """
    column_names = [
        "Classifier", "n_samples", "n_features", "Average Fit Time",
        "STD Fit Time", "Average Predict Time", "STD Predict Time", "Score"
    ]

    if combinations is None:
        columns = range(10, 40010, 5000)
        rows = [10, 100, 1000]
        combinations = list(product(rows, columns)) + list(
            product(columns, rows))
        combinations += list(product([10, 100, 1000], [500000]))
        combinations += list(product([500000], [10, 100, 1000]))

    # Classifiers in the order in which they are evaluated.
    classifiers = [
        ("Gaussian", GaussianNB()),
        ("Categorical", CategoricalNB(alpha=1)),
        ("Custom without encoding", NaiveBayes(encode_data=False, alpha=1)),
        ("Custom with encoding",
         NaiveBayes(encode_data=True, alpha=1, discretize=False)),
    ]
    # Order in which the rows are appended to the results.
    report_order = [
        "Gaussian", "Categorical", "Custom with encoding",
        "Custom without encoding"
    ]

    results = []
    # Defined up front so the return value exists even without combinations.
    results_df = pd.DataFrame(results, columns=column_names)
    X = y = None

    # The context manager closes the bar even if an evaluation fails;
    # `disable` keeps it off the screen when not verbose.
    with tqdm(total=len(combinations),
              bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}',
              disable=not verbose) as progress_bar:
        for n_samples, n_features in combinations:
            if verbose:
                progress_bar.set_postfix({
                    "n_samples": n_samples,
                    "n_features": n_features
                })
                progress_bar.update(1)
                progress_bar.refresh()

            # Drop every reference to the previous dataset before allocating
            # the next one, to keep peak memory down.
            X = y = None
            X, y = make_classification(n_samples=n_samples,
                                       n_features=n_features,
                                       flip_y=0.01,
                                       class_sep=1.0,
                                       hypercube=True,
                                       shift=0.0,
                                       scale=2.0,
                                       shuffle=True,
                                       random_state=seed)
            X = make_discrete(X, m=1)

            stats = {
                label: {"fit": [], "predict": [], "score": [], "errors": 0}
                for label, _ in classifiers
            }
            for _ in range(n_iterations):
                for label, clf in classifiers:
                    entry = stats[label]
                    # Train and test on the same data; the value returned by
                    # `evaluate` is accumulated as the error counter.
                    entry["errors"] += evaluate(X, y, X, y, clf,
                                                entry["fit"],
                                                entry["predict"],
                                                entry["score"])

            for label in report_order:
                entry = stats[label]
                update_df(results, label, n_samples, n_features,
                          entry["fit"], entry["predict"], entry["score"],
                          entry["errors"])

            results_df = pd.DataFrame(results, columns=column_names)
            results_df.drop_duplicates(
                ["Classifier", "n_samples", "n_features"], inplace=True)
            # Persist partial results after every combination.
            results_df.to_csv("backup.csv")
    return results_df
Ejemplo n.º 6
0
def acfs_score_comparison(datasets,
                          seed,
                          base_path,
                          params,
                          n_splits=3,
                          n_repeats=5,
                          n_intervals=5,
                          metric="accuracy",
                          send_email=False,
                          email_data=None,
                          verbose=True):
    """Compare ACFCS against plain Naive Bayes with repeated stratified k-fold.

    For every dataset found under ``base_path`` the data is split
    ``n_splits * n_repeats`` times. On each split a ``NaiveBayes`` baseline
    and one ``ACFCS`` run per configuration in ``params`` are fitted and
    scored, and the per-configuration averages are collected.

    Parameters
    ----------
    datasets : iterable of (str, label)
        Dataset name and label pairs, as expected by
        ``get_X_y_from_database``.
    seed : int
        ``random_state`` of the cross-validation splitter.
    base_path : str
        Prefix prepended directly to the dataset name; datasets whose path
        does not exist are skipped.
    params : sequence of dict
        ACFCS configurations, applied with ``set_params``.
    n_splits, n_repeats : int
        Cross-validation layout.
    n_intervals : int
        Passed to the feature encoder and to ``NaiveBayes``.
    metric : str
        Passed to both classifiers.
    send_email : bool
        When true, the table is handed to ``tfg.utils.send_results``.
    email_data : dict, optional
        Passed to ``send_results``; an empty dict is used when None.
    verbose : bool
        When truthy, a per-split progress bar is shown.

    Returns
    -------
    pandas.DataFrame
        One row per dataset and configuration.
    """
    # A fresh dict per call instead of a shared mutable default.
    if email_data is None:
        email_data = {}

    # List to store results and column names for the csv
    result = []
    columns = [
        "Database", "Number of attributes", "NBScore", "NBScore STD",
        "ACFCS Score", "ACFCS Score STD", "Configuration", "Nodes",
        "Contruction Matrix", "Selection Matrix", "Selected_attributes",
        "Original"
    ]
    dataset_tqdm = tqdm(datasets)

    # Instantiate the classifiers
    acfcs = ACFCS(verbose=0, metric=metric)
    nb = NaiveBayes(encode_data=False, n_intervals=n_intervals, metric=metric)

    n_runs = n_splits * n_repeats
    stats_shape = (len(params), n_runs)

    # Execute algorithm on datasets
    for name, label in dataset_tqdm:
        if not os.path.exists(base_path + name):
            print(f"{name} doesnt' exist")
            continue
        # Assume UCI REPO like data
        test_file = f"{name}.test.csv"
        data_file = f"{name}.data.csv"
        X, y = get_X_y_from_database(base_path, name, data_file, test_file,
                                     label)

        # Update progressbar
        dataset_tqdm.set_postfix({"DATABASE": name})

        # One row per configuration, one column per split
        nb_score = np.zeros(shape=stats_shape)
        acfcs_score = np.zeros(shape=stats_shape)
        acfcs_selection_matrix = np.zeros(shape=stats_shape)
        acfcs_construction_matrix = np.zeros(shape=stats_shape)
        acfcs_nodes = np.zeros(shape=stats_shape)
        acfcs_dummy = np.zeros(shape=stats_shape)
        acfcs_selected = np.zeros(shape=stats_shape)

        # Create splits for the experiments
        rskf = RepeatedStratifiedKFold(n_splits=n_splits,
                                       n_repeats=n_repeats,
                                       random_state=seed)
        splits = rskf.split(X, y)
        seed_tqdm = tqdm(splits,
                         leave=False,
                         total=n_runs,
                         bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}'
                         ) if verbose else splits

        # Execute experiments
        for i, (train_index, test_index) in enumerate(seed_tqdm):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Encoders are fitted on the training part only
            feature_encoder = CustomOrdinalFeatureEncoder(
                n_intervals=n_intervals)
            X_train = feature_encoder.fit_transform(X_train)
            X_test = feature_encoder.transform(X_test)
            label_encoder = CustomLabelEncoder()
            y_train = label_encoder.fit_transform(y_train)
            y_test = label_encoder.transform(y_test)

            # Assess the classifiers reusing info to speed up evaluation
            nb.fit(X_train, y_train)
            naive_bayes_score = nb.score(X_test, y_test)
            acfcs.reset_cache()
            for conf_index, conf in enumerate(params):
                acfcs.set_params(**conf)
                # The graph is only initialised for the first configuration
                acfcs.fit(X_train, y_train, init_graph=conf_index == 0)

                acfcs_score_conf = acfcs.score(X_test, y_test)
                if verbose:
                    seed_tqdm.set_postfix({
                        "config": conf_index,
                        "nb_score": naive_bayes_score,
                        "ant_score": acfcs_score_conf
                    })

                # Selected features that are DummyFeatureConstructor instances
                n_original_features = sum(
                    isinstance(feature, DummyFeatureConstructor)
                    for feature in acfcs.best_features)

                nb_score[conf_index, i] = naive_bayes_score
                acfcs_score[conf_index, i] = acfcs_score_conf
                acfcs_selection_matrix[conf_index, i] = len(
                    acfcs.afg.pheromone_selection)
                acfcs_construction_matrix[conf_index, i] = len(
                    acfcs.afg.pheromone_construction)
                acfcs_nodes[conf_index, i] = len(acfcs.afg.nodes)
                acfcs_dummy[conf_index, i] = n_original_features
                acfcs_selected[conf_index, i] = len(acfcs.best_features)

        # Insert the final result - averaged metrics for this database.
        for conf_index, conf in enumerate(params):
            result.append([
                name, X.shape[1],
                np.mean(nb_score[conf_index]),
                np.std(nb_score[conf_index]),
                np.mean(acfcs_score[conf_index]),
                np.std(acfcs_score[conf_index]), conf,
                np.mean(acfcs_nodes[conf_index]),
                np.mean(acfcs_construction_matrix[conf_index]),
                np.mean(acfcs_selection_matrix[conf_index]),
                np.mean(acfcs_selected[conf_index]),
                np.mean(acfcs_dummy[conf_index])
            ])
    result = pd.DataFrame(result, columns=columns)

    if send_email:
        from tfg.utils import send_results
        send_results("ACFCS", email_data, result)
    return result
Ejemplo n.º 7
0
    def filter_features(self, X, y):
        """After the rank is built, perform the greedy wrapper search.

        Feature constructors are taken from ``self.rank`` in blocks of
        ``self.block_size``. Each block is added to the classifier and kept
        only when the leave-one-out score improves; otherwise it is removed
        again. The search ends when the rank is exhausted, when
        ``max_iterations`` is reached, when another block would exceed
        ``max_features`` or, with the ``"eager"`` strategy, after more than
        ``max_err`` blocks without improvement.

        Parameters
        ----------
        X : array-like
            Data passed to every feature constructor's ``transform``.
        y : array-like
            Class labels.

        Returns
        -------
        self
            With the kept constructors in ``final_feature_constructors``.
        """
        check_is_fitted(self)
        self.classifier = NaiveBayes(encode_data=False,
                                     n_intervals=self.n_intervals,
                                     metric=self.metric)
        # -np.inf rather than the np.NINF alias, which NumPy 2.0 removed.
        current_score = -np.inf
        current_features = []
        current_data = None
        if self.use_initials:
            # Original Features have already been taken into account
            rank_iter = (
                index for index in self.rank
                if not isinstance(self.all_feature_constructors[index],
                                  DummyFeatureConstructor))

            # Deep copy to avoid issues when modifying the list
            current_features = deepcopy(self.initial_backward_features)
            current_data = np.concatenate(
                [f.transform(X) for f in current_features], axis=1)

            # Get initial LOO score
            current_score = self.evaluate_leave_one_out_cross_val(
                self.classifier, current_features, current_data, y, fit=True)
        else:
            # Iterator over the sorted list of indexes
            rank_iter = iter(self.rank)

        if self.verbose:
            progress_bar = tqdm(total=len(self.rank),
                                bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}')

        iteration = 0
        iterations_without_improvements = 0

        # Loop for including {block size} elements at a time
        # Rank is an iterator, so the for loop is not sequential!
        for feature_constructor_index in rank_iter:
            iteration += 1
            if self.verbose:
                progress_bar.set_postfix({
                    "n_features": len(current_features),
                    "score": current_score
                })
                progress_bar.update(1)
                progress_bar.refresh()

            # Start the block with the constructor yielded by the loop
            first_constructor = self.all_feature_constructors[
                feature_constructor_index]
            new_X = [first_constructor.transform(X)]
            selected_features = [first_constructor]

            # Complete the block by pulling from the same iterator
            for _ in range(self.block_size - 1):
                try:
                    index = next(rank_iter)
                except StopIteration:
                    # Block size does not divide the number of elements in
                    # the rank: the partial block is still evaluated below
                    # and the outer loop then ends with the iterator.
                    break
                # Kept outside the try so that a failing transform is
                # reported instead of silently truncating the block.
                constructor = self.all_feature_constructors[index]
                selected_features.append(constructor)
                new_X.append(constructor.transform(X))
                if self.verbose:
                    progress_bar.update(1)
                    progress_bar.refresh()

            # Evaluate features
            new_X = np.concatenate(new_X, axis=1)
            if iteration == 1 and not self.use_initials:
                # Nothing to compare against yet: the first block is always
                # accepted and the classifier is fitted from scratch.
                current_data = new_X
                current_score = self.evaluate_leave_one_out_cross_val(
                    self.classifier,
                    selected_features,
                    current_data,
                    y,
                    fit=True)
                current_features = selected_features
                if self.max_iterations <= iteration or (
                        len(current_features) +
                        self.block_size) > self.max_features:
                    break
                continue

            data = np.concatenate([current_data, new_X], axis=1)
            self.classifier.add_features(new_X, y)
            # LOO evaluation
            score = self.evaluate_leave_one_out_cross_val(self.classifier,
                                                          current_features +
                                                          selected_features,
                                                          data,
                                                          y,
                                                          fit=False)
            if score > current_score:
                current_score = score
                current_data = data
                current_features.extend(selected_features)
                iterations_without_improvements = 0
            else:
                iterations_without_improvements += 1
                # Remove last added block, from the last column backwards
                for feature_index_to_remove in range(
                        data.shape[1], data.shape[1] - new_X.shape[1], -1):
                    self.classifier.remove_feature(feature_index_to_remove - 1)
                if (self.strategy == "eager"
                        and self.max_err < iterations_without_improvements):
                    # Too many consecutive blocks without improvement
                    break

            if self.max_iterations <= iteration or (
                    len(current_features) +
                    self.block_size) > self.max_features:
                break
        if self.verbose:
            progress_bar.close()
            print(
                f"\nFinal number of included features: {len(current_features)} - Final Score: {current_score}"
            )
        self.final_feature_constructors = current_features
        return self
Ejemplo n.º 8
0
def scoring_comparison(base_path,datasets,verbose=1,test_size=0.3,seed=None,n_iterations=30):
    """Compare train/test scores of the custom NaiveBayes and CategoricalNB.

    Every dataset is encoded once and then split ``n_iterations`` times. Both
    classifiers are fitted on each training part and scored on the training
    and test parts; the mean of each score is reported per dataset.

    Parameters
    ----------
    base_path : str
        Passed to ``get_X_y_from_database``.
    datasets : iterable of (str, label)
        Dataset name and label pairs.
    verbose : int
        When truthy, the progress bar shows the dataset and iteration.
    test_size : float
        Passed to ``train_test_split``.
    seed : int, optional
        Base seed; iteration ``i`` uses ``seed + i``. With None the splits
        are not seeded.
    n_iterations : int
        Number of splits per dataset.

    Returns
    -------
    pandas.DataFrame
        One row per dataset with the four mean scores.
    """
    column_names = ["dataset",
                    "custom_training_score",
                    "custom_test_score",
                    "categorical_training_score",
                    "categorical_test_score"]
    data = []
    clf_custom = NaiveBayes(encode_data=True)
    clf_categorical_sklearn = CategoricalNB()

    datasets_iter = tqdm(datasets, bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}')
    feature_encoder = CustomOrdinalFeatureEncoder()
    label_encoder = CustomLabelEncoder()

    for dataset_name, label in datasets_iter:
        data_filename = f"{dataset_name}.data.csv"
        test_filename = f"{dataset_name}.test.csv"
        X, y = get_X_y_from_database(base_path=base_path,
                                     name=dataset_name,
                                     data=data_filename,
                                     test=test_filename,
                                     label=label)
        custom_train = []
        custom_test = []
        sklearn_train = []
        sklearn_test = []

        X = feature_encoder.fit_transform(X)
        y = label_encoder.fit_transform(y)

        # Train and test together always make up the whole of X, so the
        # per-feature category count is the same for every split.
        clf_categorical_sklearn.min_categories = [
            1 + np.max(X[:, j]) for j in range(X.shape[1])
        ]

        for iteration in range(n_iterations):
            if verbose:
                datasets_iter.set_postfix({"Dataset": dataset_name, "seed": iteration})
                datasets_iter.refresh()

            # `seed + iteration` is undefined for the default seed=None.
            random_state = None if seed is None else seed + iteration
            try:
                split = train_test_split(X, y,
                                         test_size=test_size,
                                         random_state=random_state,
                                         shuffle=True,
                                         stratify=y)
            except ValueError:
                # Not enough values to stratify y
                split = train_test_split(X, y,
                                         test_size=test_size,
                                         random_state=random_state,
                                         shuffle=True)
            X_train, X_test, y_train, y_test = split

            # Fit
            clf_custom.fit(X_train, y_train)
            clf_categorical_sklearn.fit(X_train, y_train)

            # Score
            custom_train.append(clf_custom.score(X_train, y_train))
            custom_test.append(clf_custom.score(X_test, y_test))
            sklearn_train.append(clf_categorical_sklearn.score(X_train, y_train))
            sklearn_test.append(clf_categorical_sklearn.score(X_test, y_test))
        data.append([dataset_name,
                     np.mean(custom_train),
                     np.mean(custom_test),
                     np.mean(sklearn_train),
                     np.mean(sklearn_test)])
    return pd.DataFrame(data, columns=column_names)
Ejemplo n.º 9
0
def ranker_score_comparison(datasets,
                            seed,
                            base_path,
                            params,
                            n_splits=3,
                            n_repeats=5,
                            n_intervals=5,
                            metric="accuracy",
                            send_email=False,
                            email_data=None,
                            share_rank=True):
    """Compare the ranker against plain Naive Bayes with repeated k-fold.

    For every dataset found under ``base_path`` the data is split
    ``n_splits * n_repeats`` times. On each split a ``NaiveBayes`` baseline
    and one ``RankerLogicalFeatureConstructor`` run per configuration in
    ``params`` are scored, and the per-configuration averages are collected.

    Parameters
    ----------
    datasets : iterable of (str, label)
        Dataset name and label pairs, as expected by
        ``get_X_y_from_database``.
    seed : int
        ``random_state`` of the cross-validation splitter.
    base_path : str
        Prefix prepended directly to the dataset name; datasets whose path
        does not exist are skipped.
    params : sequence of dict
        Ranker configurations, applied with ``set_params``.
    n_splits, n_repeats : int
        Cross-validation layout.
    n_intervals : int
        Passed to the feature encoder and to both estimators.
    metric : str
        Passed to both estimators.
    send_email : bool
        When true, the table is handed to ``tfg.utils.send_results``.
    email_data : dict, optional
        Passed to ``send_results``; an empty dict is used when None.
    share_rank : bool
        When true, the ranker is fitted only for the first configuration of
        each split; later configurations only call ``filter_features``.

    Returns
    -------
    pandas.DataFrame
        One row per dataset and configuration.
    """
    # A fresh dict per call instead of a shared mutable default.
    if email_data is None:
        email_data = {}

    result = []
    columns = ["Database",
               "Number of attributes",
               "NBScore",
               "NBScore STD",
               "Ranker Score",
               "Ranker Score STD",
               "Configuration",
               "Combinations",
               "Selected_attributes",
               "Original"]

    dataset_tqdm = tqdm(datasets)

    # Instantiate the classifiers
    r = RankerLogicalFeatureConstructor(n_intervals=n_intervals, metric=metric)
    nb = NaiveBayes(encode_data=False, n_intervals=n_intervals, metric=metric)

    n_runs = n_splits * n_repeats
    stats_shape = (len(params), n_runs)

    # Execute algorithm on datasets
    for name, label in dataset_tqdm:
        if not os.path.exists(base_path + name):
            print(f"{name} doesnt' exist")
            continue
        # Assume UCI REPO like data
        test_file = f"{name}.test.csv"
        data_file = f"{name}.data.csv"
        X, y = get_X_y_from_database(base_path, name, data_file, test_file,
                                     label)

        dataset_tqdm.set_postfix({"DATABASE": name})

        # One row per configuration, one column per split
        nb_score = np.zeros(shape=stats_shape)
        r_score = np.zeros(shape=stats_shape)
        r_combinations = np.zeros(shape=stats_shape)
        r_selected = np.zeros(shape=stats_shape)
        r_dummy = np.zeros(shape=stats_shape)

        rskf = RepeatedStratifiedKFold(n_splits=n_splits,
                                       n_repeats=n_repeats,
                                       random_state=seed)
        seed_tqdm = tqdm(rskf.split(X, y),
                         leave=False,
                         total=n_runs,
                         bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}')

        for i, (train_index, test_index) in enumerate(seed_tqdm):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Encoders are fitted on the training part only
            feature_encoder = CustomOrdinalFeatureEncoder(
                n_intervals=n_intervals)
            X_train = feature_encoder.fit_transform(X_train)
            X_test = feature_encoder.transform(X_test)
            label_encoder = CustomLabelEncoder()
            y_train = label_encoder.fit_transform(y_train)
            y_test = label_encoder.transform(y_test)

            # Assess the classifiers
            nb.fit(X=X_train, y=y_train)
            naive_bayes_score = nb.score(X_test, y_test)

            for conf_index, conf in enumerate(params):
                seed_tqdm.set_postfix({"config": conf_index})
                r.set_params(**conf)
                if conf_index == 0 or not share_rank:
                    # The rank is computed from scratch
                    r.fit(X_train, y_train)
                else:
                    # Only the wrapper search is repeated, using the ranker's
                    # own fitted encoders
                    r.filter_features(
                        r.feature_encoder_.transform(X_train),
                        r.class_encoder_.transform(y_train))

                ranker_score = r.score(X_test, y_test)

                # Selected features that are DummyFeatureConstructor instances
                n_original_features = sum(
                    isinstance(feature, DummyFeatureConstructor)
                    for feature in r.final_feature_constructors)

                nb_score[conf_index, i] = naive_bayes_score
                r_score[conf_index, i] = ranker_score
                r_combinations[conf_index, i] = len(r.all_feature_constructors)
                r_selected[conf_index, i] = len(r.final_feature_constructors)
                r_dummy[conf_index, i] = n_original_features

        # Insert to final result averaged metrics for this dataset
        for conf_index, conf in enumerate(params):
            result.append([name,
                           X.shape[1],
                           np.mean(nb_score[conf_index]),
                           np.std(nb_score[conf_index]),
                           np.mean(r_score[conf_index]),
                           np.std(r_score[conf_index]),
                           conf,
                           np.mean(r_combinations[conf_index]),
                           np.mean(r_selected[conf_index]),
                           np.mean(r_dummy[conf_index])])
    result = pd.DataFrame(result, columns=columns)
    if send_email:
        from tfg.utils import send_results
        send_results("RANKER", email_data, result)
    return result
Ejemplo n.º 10
0
def genetic_score_comparison(datasets,
                             seed,
                             base_path,
                             params,
                             n_splits=3,
                             n_repeats=5,
                             n_intervals=5,
                             metric="accuracy",
                             send_email=False,
                             email_data=None,
                             verbose=True,
                             version=1):
    """Compare a genetic feature-construction classifier with Naive Bayes.

    For every dataset a ``RepeatedStratifiedKFold`` is created; on each split
    the data is encoded, a plain ``NaiveBayes`` is fitted and scored, and the
    genetic classifier is fitted and scored once per configuration in
    ``params``. One result row per (dataset, configuration) is produced with
    the mean/std of the scores over all splits.

    Parameters
    ----------
    datasets : iterable of (name, label) pairs
        ``name`` selects ``base_path + name`` and the files
        ``{name}.data.csv`` / ``{name}.test.csv``; ``label`` is forwarded to
        ``get_X_y_from_database``. Datasets whose path does not exist are
        reported on stdout and skipped.
    seed : int or None
        Seed given to the genetic classifier and to the fold generator.
    base_path : str
        Prefix that is concatenated (not joined) with the dataset name.
    params : sequence of dict
        Each dict is applied with ``clf.set_params(**conf)``.
    n_splits, n_repeats : int
        Forwarded to ``RepeatedStratifiedKFold``.
    n_intervals : int
        Forwarded to ``CustomOrdinalFeatureEncoder`` and ``NaiveBayes``.
    metric : str
        Forwarded to the classifiers.
    send_email : bool
        When true, the final frame is passed to ``tfg.utils.send_results``.
    email_data : dict, optional
        Forwarded to ``send_results``; defaults to an empty dict.
    verbose : bool
        When true, a per-split progress bar is shown.
    version : int
        ``1`` and ``2`` both select ``GeneticProgrammingFlexibleLogic``; any
        other value selects ``GeneticProgrammingRankMutation``.

    Returns
    -------
    pandas.DataFrame
        One row per dataset and configuration.
    """
    # A fresh dict per call instead of a shared mutable default argument.
    if email_data is None:
        email_data = {}

    result = []
    columns = [
        "Database", "Number of attributes", "NBScore", "NBScore STD",
        "Genetic Score", "Genetic Score STD", "Configuration",
        "Selected_attributes", "Original"
    ]

    dataset_tqdm = tqdm(datasets)

    # Instantiate the classifier. Version 1 used to be a fixed-size variant;
    # it currently maps to the same flexible implementation as version 2.
    if version in (1, 2):
        clf = GeneticProgrammingFlexibleLogic(seed=seed, metric=metric)
    else:
        # Guided mutation based on SU
        clf = GeneticProgrammingRankMutation(seed=seed, metric=metric)
    nb = NaiveBayes(encode_data=False, n_intervals=n_intervals, metric=metric)

    n_runs = n_splits * n_repeats

    # Execute algorithm on datasets
    for name, label in dataset_tqdm:
        if not os.path.exists(base_path + name):
            print(f"{name} doesn't exist")
            continue
        # Assume UCI REPO like data
        test_file = f"{name}.test.csv"
        data_file = f"{name}.data.csv"
        X, y = get_X_y_from_database(base_path, name, data_file, test_file,
                                     label)

        dataset_tqdm.set_postfix({"DATABASE": name})

        # One row per configuration, one column per split.
        nb_score = np.zeros(shape=(len(params), n_runs))
        clf_score = np.zeros(shape=(len(params), n_runs))
        clf_selected = np.zeros(shape=(len(params), n_runs))
        clf_dummy = np.zeros(shape=(len(params), n_runs))

        # Create splits for the experiments
        rskf = RepeatedStratifiedKFold(n_splits=n_splits,
                                       n_repeats=n_repeats,
                                       random_state=seed)
        splits = rskf.split(X, y)
        if verbose:
            splits = tqdm(splits,
                          leave=False,
                          total=n_runs,
                          bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}')

        # Execute experiments
        for i, (train_index, test_index) in enumerate(splits):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Encode once per split so both classifiers see the same data.
            feature_encoder = CustomOrdinalFeatureEncoder(
                n_intervals=n_intervals)
            X_train = feature_encoder.fit_transform(X_train)
            X_test = feature_encoder.transform(X_test)
            label_encoder = CustomLabelEncoder()
            y_train = label_encoder.fit_transform(y_train)
            y_test = label_encoder.transform(y_test)

            # Baseline score, shared by every configuration of this split.
            nb.fit(X_train, y_train)
            naive_bayes_score = nb.score(X_test, y_test)

            # Reset evaluation-cache for new split
            clf.reset_evaluation()
            for conf_index, conf in enumerate(params):
                if verbose:
                    splits.set_postfix({"config": conf_index})
                clf.set_params(**conf)
                clf.fit(X_train, y_train)

                genetic_score = clf.score(X_test, y_test)

                # Count how many selected features are untouched originals.
                n_original_features = sum(
                    isinstance(feature, DummyFeatureConstructor)
                    for feature in clf.best_features)
                n_selected = len(clf.best_features)

                nb_score[conf_index, i] = naive_bayes_score
                clf_score[conf_index, i] = genetic_score
                clf_selected[conf_index, i] = n_selected
                clf_dummy[conf_index, i] = n_original_features

        # Insert to final result averaged metrics for this database
        for conf_index, conf in enumerate(params):
            result.append([
                name, X.shape[1],
                np.mean(nb_score[conf_index]),
                np.std(nb_score[conf_index]),
                np.mean(clf_score[conf_index]),
                np.std(clf_score[conf_index]), conf,
                np.mean(clf_selected[conf_index]),
                np.mean(clf_dummy[conf_index])
            ])

    result = pd.DataFrame(result, columns=columns)
    if send_email:
        # Imported lazily so the mail helper is only needed when requested.
        from tfg.utils import send_results
        send_results(f"GENETIC_{version}", email_data, result)
    return result
Ejemplo n.º 11
0
    def fit(self, X, y, init_graph=True):
        """Search for a feature set with a colony of ants and fit Naive Bayes.

        The data is optionally encoded, the feature graph is built (or, when
        ``init_graph`` is false, only its pheromones are reset), and for up to
        ``self.iterations`` rounds ``self.ants`` ants traverse the graph. The
        features of the best-scoring ant are remembered; the loop stops early
        once more than ``self.early_stopping`` consecutive rounds fail to
        match or beat the best score. Finally a ``NaiveBayes`` is trained on
        the chosen features.

        Returns ``self``.
        """
        self.feature_encoder_ = CustomOrdinalFeatureEncoder()
        self.class_encoder_ = CustomLabelEncoder()

        # Column names are only available for DataFrame input.
        self.categories_ = X.columns if isinstance(X, pd.DataFrame) else None
        if self.encode_data:
            X = self.feature_encoder_.fit_transform(X)
            y = self.class_encoder_.fit_transform(y)

        operators = ("XOR", "OR", "AND")
        if not init_graph:
            # Reuse the existing graph, starting from clean pheromones.
            self.afg.reset_pheromones()
        elif self.graph_strategy == "full":
            # Full graph
            self.afg = AntFeatureGraph(seed=self.seed).compute_graph(
                X, y, operators)
        else:
            # Pruned graph
            pruned_graph = AntFeatureGraphMI(seed=self.seed,
                                             connections=self.connections)
            self.afg = pruned_graph.compute_graph(X, y, operators)
        if self.verbose:
            print(f"Number of nodes: {len(self.afg.nodes)}")

        random.seed(self.seed)
        best_score = 0
        self.best_features = []
        stale_rounds = 0
        rounds = range(self.iterations)
        if self.verbose:
            rounds = tqdm(rounds)
        beta = self.beta
        distance_from_best = -1
        for _ in rounds:
            if self.verbose:
                rounds.set_postfix({
                    "best_score": best_score,
                    "n_features": len(self.best_features),
                    "p_matrix_c": len(self.afg.pheromone_construction),
                    "p_matrix_s": len(self.afg.pheromone_selection),
                    "distance_from_best": distance_from_best
                })

            # The colony for this round uses the current (decayed) beta.
            colony = [
                Ant(ant_id=ant_id,
                    alpha=self.alpha,
                    beta=beta,
                    metric=self.metric,
                    use_initials=self.use_initials,
                    cache_loo=self.cache_loo,
                    cache_heuristic=self.cache_heuristic,
                    step=self.step) for ant_id in range(self.ants)
            ]
            beta *= (1 - self.beta_evaporation_rate)

            scores = np.array([
                member.run(X=X,
                           y=y,
                           graph=self.afg,
                           random_generator=random,
                           parallel=self.parallel,
                           max_errors=self.max_errors) for member in colony
            ])

            self.afg.update_pheromone_matrix_evaporation(self.evaporation_rate)
            distance_from_best = np.mean(np.abs(scores - best_score))
            winner = np.argmax(scores)

            # Deposit pheromone: only the round's best ant, or every ant
            # weighted by its own score.
            if self.update_strategy == "best":
                self.afg.intensify(colony[winner].current_features,
                                   self.intensification_factor, 1,
                                   self.use_initials)
            else:
                for member_score, member in zip(scores, colony):
                    self.afg.intensify(member.current_features,
                                       self.intensification_factor,
                                       member_score, self.use_initials)

            if scores[winner] >= best_score:
                # Ties also count as progress and replace the stored features.
                stale_rounds = 0
                best_score = scores[winner]
                self.best_features = colony[winner].current_features
                continue

            stale_rounds += 1
            if stale_rounds > self.early_stopping:
                break

        self.classifier_ = NaiveBayes(encode_data=False, metric=self.metric)
        if self.final_selection != "BEST":
            # An ant traverses the graph deterministically to obtain the features
            final_ant = FinalAnt(ant_id=0,
                                 alpha=self.alpha,
                                 beta=beta,
                                 metric=self.metric,
                                 use_initials=self.use_initials,
                                 cache_loo=self.cache_loo,
                                 cache_heuristic=self.cache_heuristic,
                                 step=self.step)
            final_ant.run(X=X,
                          y=y,
                          graph=self.afg,
                          random_generator=random,
                          parallel=self.parallel)
            self.best_features = final_ant.current_features

        # Train model with final features
        constructed = [feature.transform(X) for feature in self.best_features]
        self.classifier_.fit(np.concatenate(constructed, axis=1), y)

        if self.save_features:
            # Save the features to a dict
            translate_features(features=self.best_features,
                               feature_encoder=self.feature_encoder_,
                               categories=self.categories_,
                               path=self.path,
                               filename=self.filename)
        return self
Ejemplo n.º 12
0
class ACFCS(OptimizationMixin, TransformerMixin, ClassifierMixin,
            BaseEstimator):
    """Ant-colony search over a feature graph, classified with Naive Bayes.

    ``fit`` builds an ``AntFeatureGraph`` (or ``AntFeatureGraphMI``) using the
    operators XOR, OR and AND, lets a colony of ``Ant`` objects traverse it
    for several iterations, and trains a ``NaiveBayes`` on the resulting
    feature set.

    Parameters
    ----------
    ants : int
        Number of ants created in every iteration.
    evaporation_rate : float
        Passed to the graph's pheromone evaporation after each iteration.
    intensification_factor : float
        Passed to ``graph.intensify`` when depositing pheromone.
    alpha : float
        Forwarded to every ant.
    beta : float
        Initial beta forwarded to the ants; it is multiplied by
        ``1 - beta_evaporation_rate`` after each iteration.
    beta_evaporation_rate : float
        Decay applied to beta per iteration.
    step : int
        Forwarded to every ant.
    iterations : int
        Maximum number of iterations.
    early_stopping : int
        The search stops once the number of consecutive iterations that do
        not match or beat the best score exceeds this value.
    update_strategy : {"best", "all"}
        ``"best"`` intensifies only the best ant of the iteration (with
        weight 1); ``"all"`` intensifies every ant with its own score.
    seed : int or None
        Seeds the graph and the global ``random`` module.
    parallel : bool
        Forwarded to ``Ant.run``.
    save_features, path, filename
        When ``save_features`` is true the final features are handed to
        ``translate_features`` together with ``path`` and ``filename``.
    verbose : int
        Truthy values enable a progress bar and the node-count printout.
    graph_strategy : {"full", "mutual_info"}
        ``"full"`` uses ``AntFeatureGraph``; ``"mutual_info"`` uses
        ``AntFeatureGraphMI`` with ``connections``.
    connections : int
        Forwarded to ``AntFeatureGraphMI``.
    max_errors : int
        Forwarded to ``Ant.run``.
    metric : str
        Forwarded to the ants and to ``NaiveBayes``.
    use_initials : bool
        Forwarded to the ants and to ``graph.intensify``.
    final_selection : str
        ``"BEST"`` keeps the best ant's features; any other value runs a
        ``FinalAnt`` over the graph and uses its features instead.
    encode_data : bool
        When true, ``X`` and ``y`` are encoded inside ``fit``.

    Raises
    ------
    ValueError
        If ``graph_strategy`` or ``update_strategy`` is not an allowed value.
    """

    def __init__(self,
                 ants=10,
                 evaporation_rate=0.05,
                 intensification_factor=0.05,
                 alpha=1.0,
                 beta=0.0,
                 beta_evaporation_rate=0.05,
                 step=1,
                 iterations=100,
                 early_stopping=20,
                 update_strategy="best",
                 seed=None,
                 parallel=False,
                 save_features=False,
                 path=None,
                 filename=None,
                 verbose=0,
                 graph_strategy="mutual_info",
                 connections=2,
                 max_errors=0,
                 metric="accuracy",
                 use_initials=False,
                 final_selection="ALL",
                 encode_data=True):
        self.step = step
        self.ants = ants
        self.evaporation_rate = evaporation_rate
        self.intensification_factor = intensification_factor
        self.alpha = alpha
        self.beta = beta
        self.beta_evaporation_rate = beta_evaporation_rate
        self.iterations = iterations
        self.early_stopping = early_stopping
        self.seed = seed
        self.parallel = parallel
        self.save_features = save_features
        self.path = path
        self.filename = filename
        self.verbose = verbose
        self.graph_strategy = graph_strategy
        self.connections = connections
        self.metric = metric
        self.update_strategy = update_strategy
        self.use_initials = use_initials
        self.final_selection = final_selection
        self.encode_data = encode_data
        self.max_errors = max_errors

        allowed_graph_strategy = ("full", "mutual_info")
        if self.graph_strategy not in allowed_graph_strategy:
            raise ValueError(
                "Unknown graph strategy type: %s, expected one of %s." %
                (self.graph_strategy, allowed_graph_strategy))

        allowed_update_strategy = ("all", "best")
        if self.update_strategy not in allowed_update_strategy:
            # The message now names the parameter that is actually wrong.
            raise ValueError(
                "Unknown update strategy type: %s, expected one of %s." %
                (self.update_strategy, allowed_update_strategy))

        self.reset_cache()

    def reset_cache(self):
        """Drop the caches that are shared with every ant."""
        self.cache_loo = dict()
        self.cache_heuristic = dict()

    def fit(self, X, y, init_graph=True):
        """Run the ant-colony search and train the final classifier.

        Parameters
        ----------
        X, y
            Training data and labels; encoded first when ``encode_data`` is
            true. Column names are kept in ``categories_`` for DataFrames.
        init_graph : bool
            When true a new graph is computed; otherwise the pheromones of
            the existing ``self.afg`` are reset and the graph is reused.

        Returns
        -------
        self
        """
        self.feature_encoder_ = CustomOrdinalFeatureEncoder()
        self.class_encoder_ = CustomLabelEncoder()

        self.categories_ = None
        if isinstance(X, pd.DataFrame):
            self.categories_ = X.columns
        if self.encode_data:
            X = self.feature_encoder_.fit_transform(X)
            y = self.class_encoder_.fit_transform(y)

        if init_graph:
            if self.graph_strategy == "full":
                # Full graph
                self.afg = AntFeatureGraph(seed=self.seed).compute_graph(
                    X, y, ("XOR", "OR", "AND"))
            else:
                # Pruned graph
                self.afg = AntFeatureGraphMI(
                    seed=self.seed,
                    connections=self.connections).compute_graph(
                        X, y, ("XOR", "OR", "AND"))
        else:
            self.afg.reset_pheromones()
        if self.verbose:
            print(f"Number of nodes: {len(self.afg.nodes)}")

        # The ants draw from the global ``random`` module, so seed it here.
        random.seed(self.seed)
        best_score = 0
        self.best_features = []
        iterations_without_improvement = 0
        iterator = tqdm(range(self.iterations)) if self.verbose else range(
            self.iterations)
        beta = self.beta
        distance_from_best = -1
        for _ in iterator:
            if self.verbose:
                iterator.set_postfix({
                    "best_score": best_score,
                    "n_features": len(self.best_features),
                    "p_matrix_c": len(self.afg.pheromone_construction),
                    "p_matrix_s": len(self.afg.pheromone_selection),
                    "distance_from_best": distance_from_best
                })

            # Ants of this iteration use the current beta; it decays after
            # they are created so the next iteration sees the smaller value.
            ants = [
                Ant(ant_id=i,
                    alpha=self.alpha,
                    beta=beta,
                    metric=self.metric,
                    use_initials=self.use_initials,
                    cache_loo=self.cache_loo,
                    cache_heuristic=self.cache_heuristic,
                    step=self.step) for i in range(self.ants)
            ]
            beta *= (1 - self.beta_evaporation_rate)

            results = np.array([
                ant.run(X=X,
                        y=y,
                        graph=self.afg,
                        random_generator=random,
                        parallel=self.parallel,
                        max_errors=self.max_errors) for ant in ants
            ])

            self.afg.update_pheromone_matrix_evaporation(self.evaporation_rate)
            distance_from_best = np.mean(np.abs(results - best_score))
            best_ant = np.argmax(results)

            # Pheromone deposit according to the chosen strategy.
            if self.update_strategy == "best":
                self.afg.intensify(ants[best_ant].current_features,
                                   self.intensification_factor, 1,
                                   self.use_initials)
            else:
                for ant_score, ant in zip(results, ants):
                    self.afg.intensify(ant.current_features,
                                       self.intensification_factor, ant_score,
                                       self.use_initials)

            if results[best_ant] >= best_score:
                # A tie also resets the counter and replaces the features.
                iterations_without_improvement = 0
                best_score = results[best_ant]
                self.best_features = ants[best_ant].current_features
            else:
                iterations_without_improvement += 1
                if iterations_without_improvement > self.early_stopping:
                    break

        self.classifier_ = NaiveBayes(encode_data=False, metric=self.metric)
        if self.final_selection != "BEST":
            # An ant traverses the graph deterministically to obtain the
            # features, replacing the best set found during the search.
            final_ant = FinalAnt(ant_id=0,
                                 alpha=self.alpha,
                                 beta=beta,
                                 metric=self.metric,
                                 use_initials=self.use_initials,
                                 cache_loo=self.cache_loo,
                                 cache_heuristic=self.cache_heuristic,
                                 step=self.step)
            final_ant.run(X=X,
                          y=y,
                          graph=self.afg,
                          random_generator=random,
                          parallel=self.parallel)
            self.best_features = final_ant.current_features

        # Train model with final features
        self.classifier_.fit(
            np.concatenate(
                [feature.transform(X) for feature in self.best_features],
                axis=1), y)

        if self.save_features:
            # Save the features to a dict
            translate_features(features=self.best_features,
                               feature_encoder=self.feature_encoder_,
                               categories=self.categories_,
                               path=self.path,
                               filename=self.filename)
        return self
Ejemplo n.º 13
0
class PazzaniWrapperNB(PazzaniWrapper):
    ''''Optimized version of Pazzani's wrapper for the Naive Bayes classifier.
        LOO cross validation
        Update, add, delete features
    '''
    def __init__(self, seed=None, strategy="BSEJ", verbose=0):
        super().__init__(seed=seed,
                         strategy=strategy,
                         verbose=verbose,
                         cv=None)

    def _generate_neighbors_bsej(self, current_columns, X):
        if X.shape[1] > 1:
            for column_to_drop in range(X.shape[1]):
                new_columns = current_columns.copy()
                del new_columns[column_to_drop]
                yield new_columns, column_to_drop, None, True  # Updated column list, columns to remove, columns to add, delete?
            for features in combinations(np.arange(X.shape[1]), 2):
                new_col_name = flatten([
                    current_columns[features[0]], current_columns[features[1]]
                ])
                new_columns = current_columns.copy()
                new_columns.append(tuple(new_col_name))
                columns_to_drop = sorted(features, reverse=True)
                del new_columns[columns_to_drop[0]]
                del new_columns[columns_to_drop[1]]

                combined_columns = combine_columns(X, list(features))
                yield new_columns, list(
                    columns_to_drop), combined_columns, False

    def fit_bsej(self, X, y):
        self.evaluate = memoize(_evaluate, attribute_to_cache="columns")
        current_best = X.copy()
        current_columns = deque(range(X.shape[1]))
        best_score = self.evaluate(self.classifier,
                                   current_best,
                                   y,
                                   columns=current_columns,
                                   fit=True)
        stop = False
        while not stop:
            update = False
            stop = True
            if self.verbose:
                print("Current Best: ", current_columns, " Score: ",
                      best_score)
            for new_columns, columns_to_delete, columns_to_add, delete in self._generate_neighbors_bsej(
                    current_columns, current_best):
                if delete:
                    action = "DELETE"
                    # Update classifier and get validation result
                    self.classifier.remove_feature(columns_to_delete)
                    neighbor = np.delete(current_best,
                                         columns_to_delete,
                                         axis=1)
                    score = self.evaluate(self.classifier,
                                          neighbor,
                                          y,
                                          columns=new_columns,
                                          fit=False)

                    # Restore the column for the next iteration
                    self.classifier.add_features(
                        current_best[:, columns_to_delete].reshape(-1, 1),
                        y,
                        index=[columns_to_delete])
                else:
                    action = "ADD"
                    self.classifier.add_features(columns_to_add, y)
                    self.classifier.remove_feature(columns_to_delete[0])
                    self.classifier.remove_feature(columns_to_delete[1])

                    neighbor = np.delete(current_best,
                                         columns_to_delete,
                                         axis=1)
                    neighbor = np.concatenate([neighbor, columns_to_add],
                                              axis=1)

                    score = self.evaluate(self.classifier,
                                          neighbor,
                                          y,
                                          columns=new_columns,
                                          fit=False)
                    if self.classifier.n_features_ == 1:
                        # We reverse it for insert order
                        self.classifier.add_features(
                            current_best[:, columns_to_delete], y)
                        self.classifier.remove_feature(0)
                    else:
                        self.classifier.remove_feature(neighbor.shape[1] - 1)
                        # We reverse it for insert order
                        self.classifier.add_features(
                            current_best[:, columns_to_delete],
                            y,
                            index=columns_to_delete)

                if self.verbose == 2:
                    print("\tNeighbor: ", new_columns, " Score: ", score)
                if score > best_score:
                    stop = False
                    best_columns = new_columns
                    best_action = action
                    best_score = score
                    best_columns_to_delete = columns_to_delete
                    update = True
                    if best_action == "ADD":
                        best_columns_to_add = columns_to_add
                    if score == 1.0:
                        stop = True
                        break
            if update:
                current_columns = best_columns
                if best_action == "DELETE":
                    current_best = np.delete(current_best,
                                             best_columns_to_delete,
                                             axis=1)
                    # Update best
                    self.classifier.remove_feature(best_columns_to_delete)
                else:
                    current_best = np.delete(current_best,
                                             best_columns_to_delete,
                                             axis=1)
                    current_best = np.concatenate(
                        [current_best, best_columns_to_add], axis=1)
                    # Update classifier
                    self.classifier.add_features(best_columns_to_add, y)
                    self.classifier.remove_feature(best_columns_to_delete[0])
                    self.classifier.remove_feature(best_columns_to_delete[1])

        if self.verbose:
            print("Final best: ", list(current_columns), " Score: ",
                  best_score)
        self.features_ = current_columns
        self.feature_transformer = lambda X: join_columns(
            X, columns=self.features_)
        model = self.classifier.fit(self.feature_transformer(X), y)
        return self

    def _generate_neighbors_fssj(self, current_columns, individual,
                                 original_data, available_columns):
        if available_columns:
            for index, col in enumerate(available_columns):
                new_columns = current_columns.copy()
                new_columns.append(col)
                new_available_columns = available_columns.copy()
                del new_available_columns[index]
                column_to_add = original_data[:, col].reshape(-1, 1)
                column_to_delete = None
                # New columns, Availables,ColumnToDelete,ColumnToAdd,Delete?
                yield new_columns, new_available_columns, column_to_delete, column_to_add, False
        if individual is not None and individual.shape[
                1] > 0 and available_columns:
            for features_index in product(np.arange(len(available_columns)),
                                          np.arange(len(current_columns))):
                features = available_columns[
                    features_index[0]], current_columns[features_index[1]]
                new_col_name = flatten([features[0], features[1]])

                new_available_columns = available_columns.copy()
                del new_available_columns[features_index[0]]

                new_columns = current_columns.copy()
                new_columns.append(tuple(new_col_name))
                del new_columns[features_index[1]]

                separated_columns = np.concatenate([
                    original_data[:, features[0]].reshape(-1, 1),
                    individual[:, features_index[1]].reshape(-1, 1)
                ],
                                                   axis=1)
                if isinstance(features[1], tuple):
                    features = list(features)
                    features[1] = list(features[1])
                    features = tuple(features)
                column_to_delete = features_index[1]
                combined_columns = combine_columns(separated_columns)
                column_to_add = combined_columns
                yield new_columns, new_available_columns, column_to_delete, column_to_add, True

    def fit_fssj(self, X, y):
        self.evaluate = memoize(_evaluate, attribute_to_cache="columns")
        current_best = None
        current_columns = deque()
        available_columns = list(range(X.shape[1]))
        best_score = -float("inf")
        stop = False
        while not stop:
            update = False
            stop = True
            # self.classifier.encode_data=True
            if self.verbose:
                print("Current Best: ", current_columns, " Score: ",
                      best_score, "Available columns: ", available_columns)
            for new_columns, new_available_columns, column_to_delete, column_to_add, delete in self._generate_neighbors_fssj(
                    current_columns=current_columns,
                    individual=current_best,
                    original_data=X,
                    available_columns=available_columns):
                if delete:
                    action = "JOIN"
                    # Update classifier and get validation result
                    self.classifier.add_features(column_to_add, y)
                    self.classifier.remove_feature(column_to_delete)

                    neighbor = np.concatenate([current_best, column_to_add],
                                              axis=1)
                    neighbor = np.delete(neighbor, column_to_delete, axis=1)
                    score = self.evaluate(self.classifier,
                                          neighbor,
                                          y,
                                          columns=new_columns,
                                          fit=False)

                    # Restore the column for the next iteration
                    if neighbor.shape[1] == 1:
                        self.classifier.fit(current_best, y)
                    else:
                        self.classifier.remove_feature(neighbor.shape[1] - 1)
                        self.classifier.add_features(
                            current_best[:, column_to_delete].reshape(-1, 1),
                            y,
                            index=[column_to_delete])

                else:
                    action = "ADD"
                    if current_best is None:
                        neighbor = column_to_add
                        self.classifier.fit(neighbor, y)
                    else:
                        neighbor = np.concatenate(
                            [current_best, column_to_add], axis=1)
                        self.classifier.add_features(column_to_add, y)

                    score = self.evaluate(self.classifier,
                                          neighbor,
                                          y,
                                          columns=new_columns,
                                          fit=False)

                    if current_best is None:
                        self.classifier = NaiveBayes(encode_data=True)
                    else:
                        self.classifier.remove_feature(neighbor.shape[1] - 1)

                if self.verbose == 2:
                    print("\tNeighbour: ", new_columns, " Score: ", score,
                          "Available columns: ", new_available_columns)

                if score > best_score:
                    stop = False
                    best_columns = new_columns
                    best_available_columns = new_available_columns
                    best_action = action
                    best_score = score
                    best_column_to_delete = column_to_delete
                    best_column_to_add = column_to_add
                    update = True
                    if score == 1.0:
                        stop = True
                        break
            if update:
                current_columns = best_columns
                available_columns = best_available_columns
                if best_action == "JOIN":
                    self.classifier.add_features(best_column_to_add, y)
                    self.classifier.remove_feature(best_column_to_delete)

                    current_best = np.concatenate(
                        [current_best, best_column_to_add], axis=1)
                    current_best = np.delete(current_best,
                                             best_column_to_delete,
                                             axis=1)
                else:
                    if current_best is None:
                        current_best = best_column_to_add
                        self.classifier.fit(current_best, y)
                    else:
                        current_best = np.concatenate(
                            [current_best, best_column_to_add], axis=1)
                        self.classifier.add_features(best_column_to_add, y)

        if self.verbose:
            print("Final best: ", list(current_columns), " Score: ",
                  best_score)
        self.features_ = current_columns
        self.feature_transformer = lambda X: join_columns(
            X, columns=self.features_)
        model = self.classifier.fit(self.feature_transformer(X), y)
        return self

    def evaluate(self, classifier, X, y, fit=True, columns=None):
        """Score ``classifier`` on ``(X, y)`` by delegating to ``_evaluate``.

        Parameters
        ----------
        classifier : object
            Classifier handed to ``_evaluate``.
        X, y : array-like
            Data and target handed to ``_evaluate``.
        fit : bool, default=True
            Forwarded to ``_evaluate``.
        columns : object, default=None
            Forwarded to ``_evaluate``.

        Returns
        -------
        Whatever ``_evaluate`` returns.
        """
        # The caller's ``fit`` and ``columns`` used to be dropped in favour of
        # the hard-coded defaults; forward them so the arguments take effect.
        return _evaluate(classifier, X, y, fit=fit, columns=columns)
Ejemplo n.º 14
0
    def fit_fssj(self, X, y):
        """Greedy search over column subsets using ADD and JOIN moves.

        Each pass of the outer loop scores every neighbour produced by
        ``self._generate_neighbors_fssj`` and keeps the one with the highest
        score, provided it is strictly better than the best score so far.
        The search ends when a pass brings no improvement or when a
        neighbour scores exactly 1.0. ``self.classifier`` is updated
        incrementally (``add_features`` / ``remove_feature``) while
        neighbours are tried and is restored after each trial.

        On exit the selected columns are stored in ``self.features_``,
        ``self.feature_transformer`` is set to a ``join_columns`` wrapper over
        them and ``self.classifier`` is refitted on the transformed ``X``.

        Parameters
        ----------
        X : array
            Input data; indexed as ``X.shape[1]`` columns.
        y : array
            Target passed to the classifier and to the evaluator.

        Returns
        -------
        self
        """
        # Shadow the ``evaluate`` method with a memoised ``_evaluate``.
        # NOTE(review): presumably the cache is keyed on the ``columns``
        # argument -- confirm against ``memoize``.
        self.evaluate = memoize(_evaluate, attribute_to_cache="columns")
        # Data matrix of the current best subset (None until the first move).
        current_best = None
        current_columns = deque()
        available_columns = list(range(X.shape[1]))
        best_score = -float("inf")
        stop = False
        while not stop:
            update = False
            # Assume convergence; an improving neighbour clears the flag.
            stop = True
            # self.classifier.encode_data=True
            if self.verbose:
                print("Current Best: ", current_columns, " Score: ",
                      best_score, "Available columns: ", available_columns)
            for new_columns, new_available_columns, column_to_delete, column_to_add, delete in self._generate_neighbors_fssj(
                    current_columns=current_columns,
                    individual=current_best,
                    original_data=X,
                    available_columns=available_columns):
                if delete:
                    # JOIN: the new column is appended and an existing one
                    # (``column_to_delete``) is removed.
                    action = "JOIN"
                    # Update classifier and get validation result
                    self.classifier.add_features(column_to_add, y)
                    self.classifier.remove_feature(column_to_delete)

                    neighbor = np.concatenate([current_best, column_to_add],
                                              axis=1)
                    neighbor = np.delete(neighbor, column_to_delete, axis=1)
                    score = self.evaluate(self.classifier,
                                          neighbor,
                                          y,
                                          columns=new_columns,
                                          fit=False)

                    # Restore the column for the next iteration
                    if neighbor.shape[1] == 1:
                        # Single-column neighbour: refit on the previous data
                        # instead of undoing the two incremental updates.
                        self.classifier.fit(current_best, y)
                    else:
                        # Drop the appended column (last position) and put the
                        # removed one back at its original index.
                        self.classifier.remove_feature(neighbor.shape[1] - 1)
                        self.classifier.add_features(
                            current_best[:, column_to_delete].reshape(-1, 1),
                            y,
                            index=[column_to_delete])

                else:
                    # ADD: the new column is appended to the current subset.
                    action = "ADD"
                    if current_best is None:
                        # Nothing selected yet: the neighbour is the column
                        # itself and the classifier is fitted from scratch.
                        neighbor = column_to_add
                        self.classifier.fit(neighbor, y)
                    else:
                        neighbor = np.concatenate(
                            [current_best, column_to_add], axis=1)
                        self.classifier.add_features(column_to_add, y)

                    score = self.evaluate(self.classifier,
                                          neighbor,
                                          y,
                                          columns=new_columns,
                                          fit=False)

                    # Undo the trial so the next neighbour starts from the
                    # same classifier state.
                    if current_best is None:
                        # Replaces the trial model with a new NaiveBayes.
                        self.classifier = NaiveBayes(encode_data=True)
                    else:
                        self.classifier.remove_feature(neighbor.shape[1] - 1)

                if self.verbose == 2:
                    print("\tNeighbour: ", new_columns, " Score: ", score,
                          "Available columns: ", new_available_columns)

                # Only a strict improvement replaces the best neighbour.
                if score > best_score:
                    stop = False
                    best_columns = new_columns
                    best_available_columns = new_available_columns
                    best_action = action
                    best_score = score
                    best_column_to_delete = column_to_delete
                    best_column_to_add = column_to_add
                    update = True
                    if score == 1.0:
                        # Perfect score: the move is still applied below, then
                        # the outer loop ends because ``stop`` is True.
                        stop = True
                        break
            if update:
                # Commit the best move of this pass to both the data matrix
                # and the classifier.
                current_columns = best_columns
                available_columns = best_available_columns
                if best_action == "JOIN":
                    self.classifier.add_features(best_column_to_add, y)
                    self.classifier.remove_feature(best_column_to_delete)

                    current_best = np.concatenate(
                        [current_best, best_column_to_add], axis=1)
                    current_best = np.delete(current_best,
                                             best_column_to_delete,
                                             axis=1)
                else:
                    if current_best is None:
                        current_best = best_column_to_add
                        self.classifier.fit(current_best, y)
                    else:
                        current_best = np.concatenate(
                            [current_best, best_column_to_add], axis=1)
                        self.classifier.add_features(best_column_to_add, y)

        if self.verbose:
            print("Final best: ", list(current_columns), " Score: ",
                  best_score)
        self.features_ = current_columns
        # The lambda reads ``self.features_`` at call time, not at definition.
        self.feature_transformer = lambda X: join_columns(
            X, columns=self.features_)
        # NOTE(review): ``model`` is never read; the call matters only for the
        # final refit of ``self.classifier`` on the selected columns.
        model = self.classifier.fit(self.feature_transformer(X), y)
        return self
Ejemplo n.º 15
0
    def fit(self, X, y):
        """Build the ranking of candidate features and run the subset search.

        Steps, in order:

        1. Convert ``y`` / ``X`` to arrays and, when ``self.encode_data`` is
           set, encode them with freshly fitted encoders.
        2. Reset the stored evaluation results.
        3. Generate the candidate feature constructors: from the graph when
           ``self.use_graph`` is set, from the ``self.prune`` best column
           pairs when pruning is enabled, otherwise from the full data.
        4. Append one ``DummyFeatureConstructor`` per original column and rank
           every candidate by symmetrical uncertainty against ``y``
           (``self.rank`` holds the indices in descending order).
        5. Optionally store the result of a backward search over the original
           columns, then call ``self.filter_features``.

        Returns
        -------
        self
        """
        # ---- Input parsing ----
        if isinstance(y, pd.DataFrame):
            y = y.to_numpy()
        if self.encode_data:
            self.feature_encoder_ = CustomOrdinalFeatureEncoder(
                n_intervals=self.n_intervals)
            self.class_encoder_ = CustomLabelEncoder()
            X = self.feature_encoder_.fit_transform(X)
            y = self.class_encoder_.fit_transform(y)
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        check_X_y(X, y)

        # Results stored by a previous fit must not leak into this one.
        self.reset_evaluation()

        n_columns = X.shape[1]

        # ---- Candidate generation ----
        if self.use_graph:
            # Minimum graph; its rank provides the candidates.
            builder = AntFeatureGraphMI(seed=None, connections=1)
            graph = builder.compute_graph(X, y, ("AND", "OR", "XOR"))
            constructors = graph.get_rank()
        elif self.prune is not None:
            # Keep only the pairs (including each column with itself) that
            # maximise SU(X_i X_j, Y).
            pairs = list(combinations(list(range(n_columns), ), 2))
            pairs = pairs + [(i, i) for i in range(n_columns)]
            pair_scores = [
                symmetrical_uncertainty_two_variables(X[:, i], X[:, j], y)
                for i, j in pairs
            ]
            best_first = np.argsort(pair_scores)[::-1]

            constructors = []
            for position in best_first[:self.prune]:
                i, j = pairs[position]
                if i != j:
                    # NOTE(review): the constructors are built from the
                    # two-column slice, so their feature indices may refer to
                    # positions 0/1 rather than i/j -- confirm against
                    # ``construct_features``.
                    constructors.extend(
                        construct_features(X[:, [i, j]],
                                           operators=self.operators,
                                           same_feature=False))
                    continue
                # Same column twice: OR over every pair of its distinct values.
                from tfg.feature_construction import create_feature
                constructors.extend([
                    create_feature("OR", [(i, n), (i, m)])
                    for n, m in combinations(np.unique(X[:, i]), 2)
                ])
        else:
            # No pruning: every feature that can be constructed.
            constructors = construct_features(X, operators=self.operators)
        self.all_feature_constructors = constructors

        if self.verbose:
            print(
                f"Total number of constructed features: {len(self.all_feature_constructors)}"
            )

        # The original columns compete in the ranking as well.
        self.all_feature_constructors.extend(
            [DummyFeatureConstructor(j) for j in range(n_columns)])

        # ---- Ranking by symmetrical uncertainty ----
        self.symmetrical_uncertainty_rank = [
            symmetrical_uncertainty(f1=constructor.transform(X).flatten(),
                                    f2=y)
            for constructor in self.all_feature_constructors
        ]
        # Indices of the candidates, best first.
        self.rank = np.argsort(self.symmetrical_uncertainty_rank)[::-1]

        # ---- Optional backward search over the original columns ----
        if self.use_initials:
            classifier = NaiveBayes(encode_data=False,
                                    n_intervals=self.n_intervals,
                                    metric=self.metric)
            classifier.fit(X, y)
            original_features = [
                DummyFeatureConstructor(j) for j in range(n_columns)
            ]
            # Kept on the instance so later executions can reuse it.
            self.initial_backward_features = backward_search(
                X, y, original_features, classifier)

        # ---- Feature subset selection from the rank ----
        self.filter_features(X, y)
        return self
Ejemplo n.º 16
0
class GeneticProgrammingFlexibleLogic(OptimizationMixin, TransformerMixin,
                                      ClassifierMixin, BaseEstimator):
    """GeneticProgramming for Feature Construction and Selection.


    Parameters
    ----------

    seed : int or None
        Seed to guarantee reproducibility

    individuals : int
        Number of individuals per population

    generations : int
        Number of generations 

    mutation_probability : float
        Probability for each individual of being mutated

    selection : {rank,proportionate}
        Selection strategy

    mutation : {simple,complex}
        Mutation strategy

    combine : {truncate,elitism}
        Population combination strategy

    n_intervals : int
        Number of intervals for the discretization of continuous variables

    mixed : bool
        Mix heuristic and wrapper evaluation

    mixed_percentage : float
        Percentage of total iterations to do heuristic evaluation

    metric : {accuracy,f1-score}
        Target metric for the optimization process
    
    flexible_logic: bool
        Allow different individual sizes in the generation
    
    encode_data : bool, default=True
        Encode data when data is not encoded by default with an OrdinalEncoder
    
    verbose : bool or int {0,1}, default=False
        Display process progress


    Attributes
    ----------
    classifier_ : NaiveBayes
        Base classifier used for prediction

    best_features : array-like of Feature
        Array of selected Feature used for transforming new data
    """
    def simple_evaluate(self, individual, X, y):
        """Wrapper evaluation of an individual.

        The original-feature constructors (``individual[0]``) and the
        constructed ones (``individual[1]``) are applied to ``X`` and the
        result is scored with the leave-one-out cross-validation of a new
        ``NaiveBayes`` using ``self.metric``.
        """
        scorer = NaiveBayes(encode_data=False, metric=self.metric)
        constructors = individual[0] + individual[1]
        transformed = transform_features(constructors, X)
        return scorer.leave_one_out_cross_val(transformed, y, fit=True)

    def simple_evaluate_heuristic(self, individual, X, y):
        """Heuristic evaluation of an individual.

        Every constructor in the first two components of ``individual`` is
        applied to ``X`` and the resulting columns are scored against ``y``
        by ``compute_sufs_non_incremental``.
        """
        constructors = chain.from_iterable(individual[:2])
        columns = [constructor.transform(X) for constructor in constructors]
        return compute_sufs_non_incremental(features=columns, y=y)

    def fitness(self, population, X, y):
        """Return ``(individual, score)`` pairs for the whole population.

        The score comes from ``self.evaluate(individual, X, y)``; the order of
        ``population`` is preserved.
        """
        return [(candidate, self.evaluate(candidate, X, y))
                for candidate in population]

    def generate_population(self):
        """Create ``self.individuals`` random individuals.

        An individual is a tuple ``(original, constructed, included)``:

        * ``original``: ``DummyFeatureConstructor`` objects for a random
          sample of the original features,
        * ``constructed``: random logical features built with
          ``create_feature``,
        * ``included``: set with the indices of the features in ``original``.

        With ``self.flexible_logic`` the number of constructed features is
        drawn between 1 and ``self.size``; otherwise it is exactly
        ``self.size``.

        Returns
        -------
        list of tuple
        """
        operators = ('OR', 'XOR', 'AND')
        # ``self.size`` may be stored as a float (it comes from ``np.ceil``);
        # ``range`` needs a real integer and ``random.randint`` rejects
        # non-integer arguments on recent Python versions.
        max_constructed = int(self.size)

        population = []
        for _ in range(self.individuals):
            individual = ([], [], set())
            if self.flexible_logic:
                n_constructed = random.randint(1, max_constructed)
            else:
                n_constructed = max_constructed

            for _ in range(n_constructed):
                first_feature = random.randint(0, self.n_features - 1)
                second_feature = random.randint(0, self.n_features - 1)
                # A feature paired with itself is always combined with OR;
                # otherwise the operator is drawn at random.
                if first_feature == second_feature:
                    op = 'OR'
                else:
                    op = random.choice(operators)
                # The list is evaluated left to right, so the first operand's
                # value is drawn before the second one.
                operands = [
                    (first_feature,
                     random.randint(0,
                                    self.unique_values[first_feature] - 1)),
                    (second_feature,
                     random.randint(0,
                                    self.unique_values[second_feature] - 1)),
                ]
                individual[1].append(
                    create_feature(operator=op, operands=operands))

            # Random subset (possibly empty) of the original features.
            n_og_features = random.randint(0, self.n_features - 1)
            for f in random.sample(list(range(self.n_features)),
                                   n_og_features):
                individual[0].append(DummyFeatureConstructor(feature_index=f))
                individual[2].add(f)
            population.append(individual)
        return population

    def mutate_complex(self, population, **kwargs):
        """Mutate the individuals of ``population`` in place.

        Every individual goes through up to two independent mutations, each
        triggered with probability ``self.mutation_probability``:

        1. Constructed features (``individual[1]``): a random subset is
           selected and each selected feature is either fully re-randomised
           or, with ``self.flexible_logic``, re-randomised / followed by the
           addition of a new random feature / deleted.
        2. Original features (``individual[0]`` and the index set
           ``individual[2]``): one feature is added, replaced or removed.

        An individual left with no feature at all receives one random
        original feature.

        Parameters
        ----------
        population : list of tuple
            Individuals ``(original, constructed, included)``.
        **kwargs
            Accepted and ignored.

        Returns
        -------
        list of tuple
            The same individual objects, in the same order.
        """
        new_population = []
        for individual in population:
            # ---- 1. Mutation of the constructed features ----
            if random.random() < self.mutation_probability:
                chromosomes_index = []
                if self.flexible_logic:
                    if len(individual[1]) > 0:
                        # Random non-empty subset of positions to mutate.
                        chromosomes_index = random.sample(
                            list(range(len(individual[1]))),
                            random.randint(1, len(individual[1])))
                    else:
                        # Nothing to mutate: add one random feature and skip
                        # every other mutation for this individual.
                        op = random.choice(('OR', 'XOR', 'AND'))
                        operands = []
                        for _ in range(2):
                            feature_index = random.randint(
                                0, self.n_features - 1)
                            value = random.randint(
                                0, self.unique_values[feature_index] - 1)
                            operands.append((feature_index, value))
                        individual[1].append(
                            create_feature(operator=op, operands=operands))
                        new_population.append(individual)
                        continue

                else:
                    chromosomes_index = random.sample(
                        list(range(len(individual[1]))),
                        random.randint(1, len(individual[1])))

                # Iterate by position: ``chromosomes_index`` is rebuilt (same
                # length) whenever a feature is deleted below.
                for i in range(len(chromosomes_index)):
                    index = chromosomes_index[i]
                    if not self.flexible_logic:
                        # Fixed logic: re-randomise operator and operands.
                        feature = individual[1][index]
                        feature.op = random.choice(('OR', 'XOR', 'AND'))
                        for operand in feature.operands:
                            operand.feature_index = random.randint(
                                0, self.n_features - 1)
                            operand.value = random.randint(
                                0,
                                self.unique_values[operand.feature_index] - 1)
                    else:
                        a = random.random()
                        if a < 0.33:
                            # Re-randomise operator and operands in place.
                            feature = individual[1][index]
                            feature.op = random.choice(('OR', 'XOR', 'AND'))
                            for operand in feature.operands:
                                operand.feature_index = random.randint(
                                    0, self.n_features - 1)
                                operand.value = random.randint(
                                    0,
                                    self.unique_values[operand.feature_index] -
                                    1)
                        elif a < 0.66:
                            # Append a new random feature; the selected one is
                            # left untouched.
                            op = random.choice(('OR', 'XOR', 'AND'))
                            operands = []
                            for _ in range(2):
                                feature_index = random.randint(
                                    0, self.n_features - 1)
                                value = random.randint(
                                    0, self.unique_values[feature_index] - 1)
                                operands.append((feature_index, value))
                            individual[1].append(
                                create_feature(operator=op, operands=operands))

                        else:
                            # Delete the selected feature and shift the
                            # positions that came after it.
                            del individual[1][index]
                            chromosomes_index = [
                                j - 1 if j > index else j
                                for j in chromosomes_index
                            ]

            # ---- 2. Mutation of the original-feature selection ----
            if random.random() < self.mutation_probability:
                a = random.random()
                og_features = individual[0]
                included_features = individual[2]
                if (a < 0.33 and len(og_features) < self.n_features
                    ) or len(og_features) == 0:
                    # Add a feature that is not included yet (always taken
                    # when the selection is empty).
                    selected = random.choice(
                        tuple(
                            set(list(range(0, self.n_features))) -
                            included_features))
                    included_features.add(selected)
                    og_features.append(DummyFeatureConstructor(selected))
                elif a < 0.66 and len(og_features) < self.n_features and len(
                        og_features) > 0:
                    # Replace a random included feature by a not-included one.
                    selected = random.choice(
                        tuple(
                            set(list(range(0, self.n_features))) -
                            included_features))
                    index = random.randint(0, len(og_features) - 1)
                    feature = og_features[index].feature_index
                    og_features[index] = DummyFeatureConstructor(selected)
                    included_features.remove(feature)
                    included_features.add(selected)
                else:
                    # Remove a random included feature.
                    index = random.randint(0, len(og_features) - 1)
                    feature = og_features[index].feature_index
                    del og_features[index]
                    included_features.remove(feature)

            # An individual must keep at least one feature.
            if len(individual[0]) == 0 and len(individual[1]) == 0:
                og_features = individual[0]
                included_features = individual[2]
                selected = random.choice(
                    tuple(
                        set(list(range(0, self.n_features))) -
                        included_features))
                included_features.add(selected)
                og_features.append(DummyFeatureConstructor(selected))
            new_population.append(individual)
        return new_population

    def mutate_simple(self, population, **kwargs):
        """Mutate the individuals of ``population`` in place, in small steps.

        Every individual goes through up to two independent mutations, each
        triggered with probability ``self.mutation_probability``:

        1. Constructed features (``individual[1]``): a random subset is
           selected. Without ``self.flexible_logic`` each selected feature is
           fully re-randomised. With it, each selected feature gets one of:
           a single small change (operator, one full operand, or one operand
           value), the addition of a new random feature, or its deletion.
        2. Original features (``individual[0]`` and the index set
           ``individual[2]``): one feature is added, replaced or removed.

        An individual left with no feature at all receives one random
        original feature.

        Parameters
        ----------
        population : list of tuple
            Individuals ``(original, constructed, included)``.
        **kwargs
            Accepted and ignored.

        Returns
        -------
        list of tuple
            The same individual objects, in the same order.
        """
        new_population = []
        for individual in population:
            # ---- 1. Mutation of the constructed features ----
            if random.random() < self.mutation_probability:
                chromosomes_index = []
                if self.flexible_logic:
                    if len(individual[1]) > 0:
                        # Random non-empty subset of positions to mutate.
                        chromosomes_index = random.sample(
                            list(range(len(individual[1]))),
                            random.randint(1, len(individual[1])))
                    else:
                        # Nothing to mutate: add one random feature and skip
                        # every other mutation for this individual.
                        op = random.choice(('OR', 'XOR', 'AND'))
                        operands = []
                        for _ in range(2):
                            feature_index = random.randint(
                                0, self.n_features - 1)
                            value = random.randint(
                                0, self.unique_values[feature_index] - 1)
                            operands.append((feature_index, value))
                        individual[1].append(
                            create_feature(operator=op, operands=operands))
                        new_population.append(individual)
                        continue

                else:
                    chromosomes_index = random.sample(
                        list(range(len(individual[1]))),
                        random.randint(1, len(individual[1])))

                # Iterate by position: ``chromosomes_index`` is rebuilt (same
                # length) whenever a feature is deleted below.
                for i in range(len(chromosomes_index)):
                    index = chromosomes_index[i]
                    feature = individual[1][index]
                    if not self.flexible_logic:
                        # Fixed logic: re-randomise operator and operands.
                        feature.op = random.choice(('OR', 'XOR', 'AND'))
                        for operand in feature.operands:
                            operand.feature_index = random.randint(
                                0, self.n_features - 1)
                            operand.value = random.randint(
                                0,
                                self.unique_values[operand.feature_index] - 1)
                    else:
                        a = random.random()
                        if a < 0.33:
                            b = random.random()
                            if b < 0.2:
                                # Change operator
                                feature.op = random.choice(
                                    ('OR', 'XOR', 'AND'))
                            elif b < 0.6:
                                # Change full operand (first one for b < 0.4,
                                # second one otherwise): a new feature index
                                # AND a value valid for that feature. These
                                # branches used to redraw only the value,
                                # which made them duplicates of the
                                # "change value" branches below.
                                operand = feature.operands[0 if b < 0.4 else 1]
                                operand.feature_index = random.randint(
                                    0, self.n_features - 1)
                                operand.value = random.randint(
                                    0,
                                    self.unique_values[operand.feature_index] -
                                    1)
                            else:
                                # Change value only (first operand for
                                # b < 0.8, second one otherwise).
                                operand = feature.operands[0 if b < 0.8 else 1]
                                operand.value = random.randint(
                                    0,
                                    self.unique_values[operand.feature_index] -
                                    1)

                        elif a < 0.66:
                            # Add feature
                            op = random.choice(('OR', 'XOR', 'AND'))
                            operands = []
                            for _ in range(2):
                                feature_index = random.randint(
                                    0, self.n_features - 1)
                                value = random.randint(
                                    0, self.unique_values[feature_index] - 1)
                                operands.append((feature_index, value))
                            individual[1].append(
                                create_feature(operator=op, operands=operands))

                        else:
                            # Remove feature and shift the positions that came
                            # after it.
                            del individual[1][index]
                            chromosomes_index = [
                                j - 1 if j > index else j
                                for j in chromosomes_index
                            ]

            # ---- 2. Mutation of the original-feature selection ----
            if random.random() < self.mutation_probability:
                a = random.random()
                og_features = individual[0]
                included_features = individual[2]
                if (a < 0.33 and len(og_features) < self.n_features
                    ) or len(og_features) == 0:
                    # Add a feature that is not included yet (always taken
                    # when the selection is empty).
                    selected = random.choice(
                        tuple(
                            set(list(range(0, self.n_features))) -
                            included_features))
                    included_features.add(selected)
                    og_features.append(DummyFeatureConstructor(selected))
                elif a < 0.66 and len(og_features) < self.n_features and len(
                        og_features) > 0:
                    # Replace a random included feature by a not-included one.
                    selected = random.choice(
                        tuple(
                            set(list(range(0, self.n_features))) -
                            included_features))
                    index = random.randint(0, len(og_features) - 1)
                    feature = og_features[index].feature_index
                    og_features[index] = DummyFeatureConstructor(selected)
                    included_features.remove(feature)
                    included_features.add(selected)
                else:
                    # Remove a random included feature.
                    index = random.randint(0, len(og_features) - 1)
                    feature = og_features[index].feature_index
                    del og_features[index]
                    included_features.remove(feature)

            # An individual must keep at least one feature.
            if len(individual[0]) == 0 and len(individual[1]) == 0:
                og_features = individual[0]
                included_features = individual[2]
                selected = random.choice(
                    tuple(
                        set(list(range(0, self.n_features))) -
                        included_features))
                included_features.add(selected)
                og_features.append(DummyFeatureConstructor(selected))
            new_population.append(individual)
        return new_population

    def elitism(self, population1, population2):
        """Combine two scored populations keeping the best of the first one.

        The fittest ``(individual, fitness)`` pair of ``population1``
        overwrites the least fit pair of ``population2``. ``population2`` is
        modified in place and returned.
        """
        fittest_previous = max(population1, key=lambda entry: entry[1])
        weakest_position = min(range(len(population2)),
                               key=lambda position: population2[position][1])
        population2[weakest_position] = fittest_previous
        return population2

    def truncation(self, population1, population2):
        """Combine two scored populations keeping only the fittest pairs.

        Both populations are merged and sorted by descending fitness; the
        result has as many ``(individual, fitness)`` pairs as
        ``population1``. The inputs are not modified.
        """
        survivors = len(population1)
        merged = population1 + population2
        merged.sort(key=lambda entry: entry[1], reverse=True)
        return merged[:survivors]

    def select_population(self, population):
        """Fitness-proportionate (roulette-wheel) selection.

        Draws ``len(population)`` individuals with replacement, each with a
        probability proportional to its fitness, and returns copies of them
        (made with ``self.copy_individual``).

        Parameters
        ----------
        population : list of tuple
            ``(individual, fitness)`` pairs.

        Returns
        -------
        list
            Exactly ``len(population)`` copied individuals.
        """
        num_selected = len(population)
        if num_selected == 0:
            return []

        total_fitness = sum(fitness for _, fitness in population)
        if total_fitness == 0:
            # No fitness mass to share out (this used to raise
            # ZeroDivisionError): every individual is equally likely.
            weights = [1.0 / num_selected] * num_selected
        else:
            weights = [fitness / total_fitness for _, fitness in population]

        selected_individuals = []
        for _ in range(num_selected):
            r = random.random()
            cumulative_prob = 0.0
            # Fallback: rounding can leave the accumulated probability just
            # below ``r``. Without it no individual would be selected for
            # this draw and the population would silently shrink.
            chosen = population[-1][0]
            for (individual, _), weight in zip(population, weights):
                cumulative_prob += weight
                if r <= cumulative_prob:
                    chosen = individual
                    break
            selected_individuals.append(self.copy_individual(chosen))
        return selected_individuals

    def select_population_rank(self, population):
        """Rank-based selection.

        ``population`` is sorted in place by descending fitness. Then
        ``len(population)`` draws are made with replacement, the individual
        in position ``k`` (0-based) having weight ``n - k`` out of
        ``n * (n + 1) / 2``. Copies of the drawn individuals (made with
        ``self.copy_individual``) are returned.
        """
        population.sort(reverse=True, key=lambda entry: entry[1])
        count = len(population)
        rank_total = (count * (count + 1)) / 2

        picked = []
        for _ in range(count):
            accumulated = 0.0
            draw = random.random()
            for position, scored in enumerate(population):
                accumulated += (count - position) / rank_total
                if draw > accumulated:
                    continue
                picked.append(self.copy_individual(scored[0]))
                break
        return picked

    def copy_individual(self, individual):
        """Return a copy of ``individual``.

        Each element of the first two components is duplicated with its own
        ``copy()`` method, and so is the third component as a whole; the
        result is a new 3-tuple.
        """
        original_copies = [constructor.copy() for constructor in individual[0]]
        constructed_copies = [
            constructor.copy() for constructor in individual[1]
        ]
        included_copy = individual[2].copy()
        return (original_copies, constructed_copies, included_copy)

    def fit(self, X, y):
        """Run the genetic search and fit the final classifier.

        The data is optionally encoded, the search parameters derived from it
        (``n_features``, ``unique_values``, ``size``), both random generators
        seeded with ``self.seed`` and the algorithm executed. The feature
        constructors of the best individual are stored in
        ``self.best_features`` and a ``NaiveBayes`` fitted on the data they
        produce is stored in ``self.classifier_``.

        Parameters
        ----------
        X : array-like
            Training data. When it is a DataFrame its columns are kept in
            ``self.categories_``.
        y : array-like
            Target values.

        Returns
        -------
        self
        """
        self.feature_encoder_ = CustomOrdinalFeatureEncoder()
        self.class_encoder_ = CustomLabelEncoder()

        if isinstance(X, pd.DataFrame):
            self.categories_ = X.columns
        if self.encode_data:
            X = self.feature_encoder_.fit_transform(X)
            y = self.class_encoder_.fit_transform(y)

        self.n_features = X.shape[1]
        # Number of distinct values per feature, used to draw operand values.
        if self.encode_data:
            self.unique_values = [
                values.shape[0] for values in self.feature_encoder_.categories_
            ]
        else:
            self.unique_values = [
                np.unique(X[:, j]).shape[0] for j in range(X.shape[1])
            ]
        random.seed(self.seed)
        np.random.seed(self.seed)
        # ``np.ceil`` returns a float, but ``size`` is used as an argument of
        # ``range`` and ``random.randint`` when individuals are generated, so
        # it has to be stored as an integer.
        self.size = int(np.ceil(np.sqrt(X.shape[1])))

        # Start from new memoised evaluators so results stored for a previous
        # fit are not reused with different data.
        # NOTE(review): assumes the memoised results are not keyed on the
        # data -- confirm against ``memoize_genetic``.
        self.reset_evaluation()

        best_individual = self.execute_algorithm(X, y)
        self.best_features = best_individual
        self.classifier_ = NaiveBayes(encode_data=False, metric=self.metric)
        self.classifier_.fit(
            np.concatenate(
                [feature.transform(X) for feature in self.best_features],
                axis=1), y)
        return self

    def execute_algorithm(self, X, y):
        """Evolve a population for ``self.generations`` generations.

        Each generation applies selection, mutation (crossover is disabled),
        evaluation and combination with the previous population. In mixed
        mode the heuristic evaluator is used first; once the generation
        number exceeds ``int(self.generations * self.mixed_percentage)`` the
        wrapper evaluator takes over and the current population is scored
        again.

        Returns
        -------
        list
            Original plus constructed feature constructors of the fittest
            individual in the final population.
        """
        self.evaluate = (self.evaluate_heuristic
                         if self.mixed else self.evaluate_wrapper)
        scored_population = self.fitness(self.generate_population(), X, y)

        generations = range(self.generations)
        iterator = tqdm(generations,
                        leave=False) if self.verbose else generations
        for generation in iterator:
            if self.mixed:
                heuristic_limit = int(self.generations * self.mixed_percentage)
                if (generation > heuristic_limit
                        and self.evaluate == self.evaluate_heuristic):
                    self.evaluate = self.evaluate_wrapper
                    # Score the survivors with the new evaluator so they are
                    # compared fairly with the next generation.
                    survivors = [scored[0] for scored in scored_population]
                    scored_population = self.fitness(survivors, X, y)

            selected = self.selection(scored_population)
            # Crossover is disabled: the selected individuals go straight to
            # mutation.
            crossed = selected
            mutated = self.mutation(crossed, X=X, y=y)
            offspring = self.fitness(mutated, X, y)
            scored_population = self.combine(scored_population, offspring)

            # Progress-bar statistics
            if self.verbose:
                best, mean = get_max_mean(scored_population)
                statistics = {
                    "Generation": generation,
                    "hit_count": self.evaluate.hit_count,
                    "populationLength": len(scored_population),
                    "best fitness": best,
                    "mean fitness": mean,
                }
                iterator.set_postfix(statistics)

        fittest, _ = max(scored_population, key=lambda scored: scored[1])
        return fittest[0] + fittest[1]

    def reset_evaluation(self):
        """Create new memoised versions of both evaluation functions.

        ``self.evaluate_wrapper`` wraps ``simple_evaluate`` and
        ``self.evaluate_heuristic`` wraps ``simple_evaluate_heuristic``, both
        through ``memoize_genetic``.
        """
        wrapper_evaluator = memoize_genetic(self.simple_evaluate)
        heuristic_evaluator = memoize_genetic(self.simple_evaluate_heuristic)
        self.evaluate_wrapper = wrapper_evaluator
        self.evaluate_heuristic = heuristic_evaluator

    def set_params(self, **params):
        """Set estimator parameters and rebind the strategy callables.

        After the parent class has stored the parameters, the string values
        of ``selection``, ``combine`` and ``mutation`` (when given) are
        validated and replaced by the bound methods that implement them.

        Parameters
        ----------
        **params
            Estimator parameters.

        Returns
        -------
        self
            The estimator itself, as the scikit-learn ``set_params`` contract
            requires (callers chain or reassign the result).

        Raises
        ------
        ValueError
            If ``selection``, ``combine`` or ``mutation`` has an unknown
            value.
        """
        super().set_params(**params)
        if "selection" in params:
            allowed_selection = ("rank", "proportionate")
            if params["selection"] not in allowed_selection:
                raise ValueError(
                    "Unknown selection parameter expected one of : " +
                    str(allowed_selection))
            self.selection = (self.select_population_rank
                              if params["selection"] == "rank" else
                              self.select_population)
        if "combine" in params:
            allowed_combine = ("elitism", "truncate")
            if params["combine"] not in allowed_combine:
                # The message used to say "selection" for every parameter.
                raise ValueError(
                    "Unknown combine parameter expected one of : " +
                    str(allowed_combine))
            self.combine = (self.elitism if params["combine"] == "elitism"
                            else self.truncation)
        if "mutation" in params:
            allowed_mutation = ("complex", "simple")
            if params["mutation"] not in allowed_mutation:
                raise ValueError(
                    "Unknown mutation parameter expected one of : " +
                    str(allowed_mutation))
            self.mutation = (self.mutate_simple
                             if params["mutation"] == "simple" else
                             self.mutate_complex)
        return self

    def __init__(self,
                 seed=None,
                 individuals=1,
                 generations=40,
                 mutation_probability=0.2,
                 selection="rank",
                 mutation="simple",
                 combine="elitism",
                 n_intervals=5,
                 metric="accuracy",
                 flexible_logic=True,
                 verbose=False,
                 encode_data=True,
                 mixed=True,
                 mixed_percentage=0.5):
        self.mixed_percentage = mixed_percentage
        self.mixed = mixed
        self.encode_data = encode_data
        self.flexible_logic = flexible_logic
        self.verbose = verbose
        self.n_intervals = n_intervals
        self.metric = metric
        self.seed = seed
        self.individuals = individuals
        self.generations = generations
        self.mutation_probability = mutation_probability

        self.selection = selection
        self.combine = combine
        self.mutation = mutation

        allowed_selection = ('rank', 'proportionate')
        allowed_combine = ('elitism', 'truncate')
        allowed_mutation = ('complex', 'simple')

        if self.selection not in allowed_selection:
            raise ValueError(
                "Unknown selection type: %s, expected one of %s." %
                (self.selection, selection))
        if self.combine not in allowed_combine:
            raise ValueError("Unknown combine type: %s, expected one of %s." %
                             (self.combine, combine))
        if self.mutation not in allowed_mutation:
            raise ValueError(
                "Unknown selection type: %s, expected one of %s." %
                (self.mutation, mutation))

        self.selection = self.select_population_rank if "rank" in selection else self.select_population
        self.combine = self.elitism if "elit" in combine else self.truncation
        self.mutation = self.mutate_simple if "simple" in mutation else self.mutate_complex
        self.reset_evaluation()
Ejemplo n.º 17
0
class RankerLogicalFeatureConstructor(TransformerMixin, ClassifierMixin,
                                      BaseEstimator):
    """First proposal: Hybrid-Ranker Wrapper.

    Build a ranking based on Symmetrical Uncertainty (SU) of every possible logical feature of depth 1
    (1 operator, 2 operands), using the XOR, AND and OR operators. The steps are:
        - Find out combinations of values in database of every pair of features Xi, Xj:
            - Example:
                - Xi = [1,2,3,2]
                - Xj = ['a','b','c','a']
                Possible combinations:
                    [(1,'a'),(2,'b'),(3,'c'),(2,'a')]
        - Apply operator to every combination:
            - Example:
                - Xi = [1,2,3,2]
                - Xj = ['a','b','c','a']
                Possible combinations:
                    [(1,'a','AND'),(2,'b','AND'),(3,'c','AND'),(2,'a','AND'),
                    (1,'a','OR'),(2,'b','OR'),(3,'c','OR'),(2,'a','OR'),
                    (1,'a','XOR'),(2,'b','XOR'),(3,'c','XOR'),(2,'a','XOR')]
        - Add original variables to the list
        - Evaluate SU for every value in the list, and rank them
        - Go over the list following one of the two strategies proposed and evaluate
          the subset based on a leave-one-out cross-validation with the NaiveBayes classifier.

    Parameters
    ----------
    strategy : str {eager,skip}, default="eager"
        After the ranking is built, the eager strategy stops considering attributes once more
        than ``max_err`` consecutive iterations bring no improvement. The skip strategy discards
        the non-improving block and keeps going through the rank.

    block_size : int, default=10
        Number of features that are added in each iteration. Values below 1 are replaced by 1.

    encode_data : boolean, default=True
        Whether or not to encode the received data. If set to false the classifier
        expects data to be encoded with an ordinal encoder.

    n_intervals : int, default=5
        Forwarded to the feature encoder and to the NaiveBayes classifier.

    verbose : {boolean,int}, default=0
        If set to true it displays information of the remaining time
        and inside variables.

    operators : array-like, default = ("AND","OR","XOR")
        Operators used for the constructed features.

    max_features : int, default = inf
        Maximum number of features to include in the selected subset

    max_iterations : int, default = inf
        Maximum number of iterations in the wrapper step.

    metric : str, default="accuracy"
        Forwarded to the NaiveBayes classifier.

    use_initials: bool, default = False
        Force the set of initial features in the final solution. The set is trimmed with a backward elimination beforehand.

    max_err : int, default=0
        Number of consecutive non-improving iterations tolerated by the eager strategy before stopping.

    prune : int or None, default=None
        If given, only the ``prune`` best feature pairs (ranked by the SU of the pair with the class)
        are used to construct logical features.

    use_graph : bool, default = False
        Generate Ranking from features obtained from the pruned-graph of the ACO algorithm.
        (Experimentation not carried out)

    Attributes
    ----------
    feature_encoder_ : CustomOrdinalFeatureEncoder
        Encodes data in ordinal way with unseen values handling. Only set when encode_data is True.

    class_encoder_ : CustomLabelEncoder
        Encodes Data in ordinal way for the class. Only set when encode_data is True.

    all_feature_constructors: array-like
        List of FeatureConstructor objects with all the possible logical
        features, followed by one DummyFeatureConstructor per original feature.

    symmetrical_uncertainty_rank: array-like
        SU for every feature in all_feature_constructors

    rank : array-like
        Array of indexes corresponding to the sorted SU rank (in descending order).

    initial_backward_features : list
        Result of the backward search over the original features. Only set when use_initials is True.

    final_feature_constructors:
        Selected feature subset (list of constructors)

    classifier: NaiveBayes
        Classifier used in the wrapper and to perform predictions after fitting.

    """
    def __init__(self,
                 strategy="eager",
                 block_size=10,
                 encode_data=True,
                 n_intervals=5,
                 verbose=0,
                 operators=("AND", "OR", "XOR"),
                 max_features=float("inf"),
                 max_iterations=float("inf"),
                 metric="accuracy",
                 use_initials=False,
                 max_err=0,
                 prune=None,
                 use_graph=False):
        self.strategy = strategy
        # At least one feature has to be added per iteration
        self.block_size = max(block_size, 1)
        self.encode_data = encode_data
        self.verbose = verbose
        self.operators = operators
        self.max_features = max_features
        self.max_iterations = max_iterations
        self.n_intervals = n_intervals
        self.metric = metric
        self.max_err = max_err
        self.use_initials = use_initials
        self.prune = prune
        self.use_graph = use_graph

        allowed_strategies = ("eager", "skip")
        if self.strategy not in allowed_strategies:
            # The message used to say "operator" although the strategy is validated
            raise ValueError("Unknown strategy type: %s, expected one of %s." %
                             (self.strategy, allowed_strategies))

    def fit(self, X, y):
        """Build the SU rank of constructed features and select a subset.

        Parameters
        ----------
        X : array-like or pd.DataFrame
            Training data. Encoded first when ``encode_data`` is True.

        y : array-like or pd.DataFrame
            Class labels. Encoded first when ``encode_data`` is True.

        Returns
        -------
        self
        """
        # Parse input
        if isinstance(y, pd.DataFrame):
            y = y.to_numpy()
        if self.encode_data:
            self.feature_encoder_ = CustomOrdinalFeatureEncoder(
                n_intervals=self.n_intervals)
            self.class_encoder_ = CustomLabelEncoder()
            X = self.feature_encoder_.fit_transform(X)
            y = self.class_encoder_.fit_transform(y)

        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        # Validation only: the returned arrays are discarded
        check_X_y(X, y)

        # Reset the stored results for new fit
        self.reset_evaluation()

        # Generate rank
        if self.use_graph:
            # Construct the minimum graph and create rank
            # NOTE(review): the operators are hard-coded here instead of using
            # self.operators - confirm whether that is intended.
            graph = AntFeatureGraphMI(seed=None, connections=1).compute_graph(
                X, y, ("AND", "OR", "XOR"))
            self.all_feature_constructors = graph.get_rank()
        elif self.prune is not None:
            # Construct the rank with pruning by selecting pairs that maximise SU(X_iX_j,Y)
            # The (i, i) entries stand for a single feature paired with itself
            feature_combinations = list(
                combinations(list(range(X.shape[1])),
                             2)) + [(i, i) for i in range(X.shape[1])]
            rank_pairs = [
                symmetrical_uncertainty_two_variables(X[:, i], X[:, j], y)
                for i, j in feature_combinations
            ]
            rank_pairs_index = np.argsort(rank_pairs)[::-1]

            # Create the unsorted list
            self.all_feature_constructors = []
            for index in rank_pairs_index[:self.prune]:
                i, j = feature_combinations[index]
                if i == j:
                    # Same feature on both sides: OR every pair of its values
                    from tfg.feature_construction import create_feature
                    self.all_feature_constructors.extend([
                        create_feature("OR", [(i, n), (i, m)])
                        for n, m in combinations(np.unique(X[:, i]), 2)
                    ])
                else:
                    self.all_feature_constructors.extend(
                        construct_features(X[:, [i, j]],
                                           operators=self.operators,
                                           same_feature=False))
        else:
            # Create the unsorted list of all features
            self.all_feature_constructors = construct_features(
                X, operators=self.operators)
        if self.verbose:
            print(
                f"Total number of constructed features: {len(self.all_feature_constructors)}"
            )
        # The original features compete in the rank as well
        self.all_feature_constructors.extend(
            [DummyFeatureConstructor(j) for j in range(X.shape[1])])
        self.symmetrical_uncertainty_rank = []

        # Compute the SU of every candidate with the class
        for feature_constructor in self.all_feature_constructors:
            feature = feature_constructor.transform(X)
            su = symmetrical_uncertainty(f1=feature.flatten(), f2=y)
            self.symmetrical_uncertainty_rank.append(su)

        # Store the descending order index
        self.rank = np.argsort(self.symmetrical_uncertainty_rank)[::-1]

        # If the initial variables are forced into the solution, trim them
        # first with a backward search
        if self.use_initials:
            classifier = NaiveBayes(encode_data=False,
                                    n_intervals=self.n_intervals,
                                    metric=self.metric)
            classifier.fit(X, y)
            current_features = [
                DummyFeatureConstructor(j) for j in range(X.shape[1])
            ]

            # Store the backward result to reuse it for other executions
            self.initial_backward_features = backward_search(
                X, y, current_features, classifier)

        # Feature Subset Selection (FSS) from the rank
        self.filter_features(X, y)
        return self

    def predict(self, X):
        """Predict classes for X, decoded back when ``encode_data`` is True."""
        X, _ = self.transform(X)
        if self.encode_data:
            return self.class_encoder_.inverse_transform(
                self.classifier.predict(X))
        return self.classifier.predict(X)

    def reset_evaluation(self):
        """Replace the memoized leave-one-out evaluator with a fresh one."""
        # Reset the memoize evaluations
        self.evaluate_leave_one_out_cross_val = memoize(evaluate_leave_one_out)

    def predict_proba(self, X):
        """Return the classifier's ``predict_proba`` on the transformed X."""
        X, _ = self.transform(X)
        return self.classifier.predict_proba(X)

    def score(self, X, y):
        """Return the classifier's ``score`` on the transformed X and y."""
        X, y = self.transform(X, y)
        return self.classifier.score(X, y)

    def filter_features(self, X, y):
        """After the rank is built this performs the greedy wrapper search.

        Walks ``self.rank`` in blocks of ``block_size`` candidates. A block is
        kept when the leave-one-out score strictly improves; otherwise it is
        removed from the classifier again. The selected constructors are
        stored in ``self.final_feature_constructors``.

        Parameters
        ----------
        X : np.ndarray
            Encoded training data.

        y : np.ndarray
            Encoded class labels.

        Returns
        -------
        self
        """
        # NOTE(review): if this is sklearn's check_is_fitted it looks for
        # attributes with a trailing underscore, which fit() only sets when
        # encode_data is True - confirm the encode_data=False path.
        check_is_fitted(self)
        self.classifier = NaiveBayes(encode_data=False,
                                     n_intervals=self.n_intervals,
                                     metric=self.metric)
        # -np.inf instead of np.NINF: the alias was removed in NumPy 2.0
        current_score = -np.inf
        current_features = []
        current_data = None
        if self.use_initials:
            # Original Features have already been taken into account
            rank_iter = filter(
                lambda x: not isinstance(self.all_feature_constructors[x],
                                         DummyFeatureConstructor),
                iter(self.rank))

            # Deep copy to avoid issues when modifying the list
            current_features = deepcopy(self.initial_backward_features)
            current_data = np.concatenate(
                [f.transform(X) for f in current_features], axis=1)

            # Get initial LOO score
            current_score = self.evaluate_leave_one_out_cross_val(
                self.classifier, current_features, current_data, y, fit=True)
        else:
            # Iterator over the sorted list of indexes
            rank_iter = iter(self.rank)

        if self.verbose:
            progress_bar = tqdm(total=len(self.rank),
                                bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}')

        iteration = 0
        iterations_without_improvements = 0

        # Loop for including {block size} elements at a time
        # Rank is an iterator, so the for loop is not sequential!
        for feature_constructor_index in rank_iter:
            iteration += 1
            if self.verbose:
                progress_bar.set_postfix({
                    "n_features": len(current_features),
                    "score": current_score
                })
                progress_bar.update(1)
                progress_bar.refresh()

            # Add block size features
            new_X = [
                self.all_feature_constructors[feature_constructor_index].
                transform(X)
            ]
            selected_features = [
                self.all_feature_constructors[feature_constructor_index]
            ]
            for _ in range(self.block_size - 1):
                # A default value replaces the former bare ``except``, which
                # also hid errors raised while transforming a feature.
                index = next(rank_iter, None)
                if index is None:
                    # Block size does not divide the number of elements in the
                    # rank: evaluate the partial block, the outer loop then ends
                    break
                selected_features.append(self.all_feature_constructors[index])
                new_X.append(
                    self.all_feature_constructors[index].transform(X))
                if self.verbose:
                    progress_bar.update(1)
                    progress_bar.refresh()

            # Evaluate features
            new_X = np.concatenate(new_X, axis=1)
            if iteration == 1 and not self.use_initials:
                # The first block is always accepted: it fits the classifier
                current_data = new_X
                current_score = self.evaluate_leave_one_out_cross_val(
                    self.classifier,
                    selected_features,
                    current_data,
                    y,
                    fit=True)
                current_features = selected_features
                if self.max_iterations <= iteration or (
                        len(current_features) +
                        self.block_size) > self.max_features:
                    break
                continue
            data = np.concatenate([current_data, new_X], axis=1)
            self.classifier.add_features(new_X, y)
            # LOO evaluation
            score = self.evaluate_leave_one_out_cross_val(self.classifier,
                                                          current_features +
                                                          selected_features,
                                                          data,
                                                          y,
                                                          fit=False)
            if score > current_score:
                current_score = score
                current_data = data
                current_features.extend(selected_features)
                iterations_without_improvements = 0
            else:
                iterations_without_improvements += 1
                # Remove last added block, from the last column backwards
                for feature_index_to_remove in range(
                        data.shape[1], data.shape[1] - new_X.shape[1], -1):
                    self.classifier.remove_feature(feature_index_to_remove - 1)
                if self.strategy == "eager" and self.max_err < iterations_without_improvements:
                    # Stops as soon as the tolerated number of non-improving
                    # iterations is exceeded
                    break

            if self.max_iterations <= iteration or (
                    len(current_features) +
                    self.block_size) > self.max_features:
                break
        if self.verbose:
            progress_bar.close()
            print(
                f"\nFinal number of included features: {len(current_features)} - Final Score: {current_score}"
            )
        self.final_feature_constructors = current_features
        return self

    def transform(self, X, y=None):
        """Apply the selected feature constructors to X.

        Parameters
        ----------
        X : array-like or pd.DataFrame
            Data to transform. Encoded first when ``encode_data`` is True.

        y : array-like or pd.DataFrame, default=None
            Optional class labels, encoded when ``encode_data`` is True.

        Returns
        -------
        tuple
            ``(new_X, y)``: the column-wise concatenation of every selected
            constructed feature, and the (possibly encoded) labels or None.
        """
        check_is_fitted(self)
        if isinstance(y, pd.DataFrame):
            y = y.to_numpy()
        if self.encode_data:
            X = self.feature_encoder_.transform(X)
            if y is not None:
                y = self.class_encoder_.transform(y)
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        new_X = [
            feature_constructor.transform(X)
            for feature_constructor in self.final_feature_constructors
        ]
        return np.concatenate(new_X, axis=1), y
Ejemplo n.º 18
0
 def simple_evaluate(self, individual, X, y):
     """Score an individual with a fresh NaiveBayes leave-one-out evaluation.

     The two parts of ``individual`` (``individual[0]`` and ``individual[1]``)
     are joined with ``+``, passed together with ``X`` to
     ``transform_features``, and the result is evaluated against ``y`` with
     ``leave_one_out_cross_val(..., fit=True)``.
     """
     model = NaiveBayes(encode_data=False, metric=self.metric)
     evaluate = model.leave_one_out_cross_val
     feature_set = individual[0] + individual[1]
     transformed = transform_features(feature_set, X)
     return evaluate(transformed, y, fit=True)
Ejemplo n.º 19
0
    def explore(self, X, y, graph, random_generator, parallel, max_errors=0):
        """Walk the graph building a feature subset and return its LOO score.

        Search method that follows the following steps:
            1. The initial node is connected to all the others (roulette wheel selection is performed)
            2. There are 2 type of nodes (corresponding to an original feature (2.1) or corresponding to a value of a feature (2.2)):
                2.1. If the selected node is an original feature we add it to the selected subset and go to step 3.
                2.2. If the selected node is part of a logical feature then we select another node (the CONSTRUCTION step will not return full original features)
            3. Compute the score
                3.1. If it improves the previous one
                    3.1.1 Add the feature to the current subset
                    3.1.2 Update the score
                    3.1.3 Select another node (SELECTION step)
                    3.1.4 Go to step 2
                3.2. If not, the exploration ends

        The leave-one-out score is not computed after every added feature but
        only once ``self.step`` (``ceil(log2(n_features))``) steps have passed
        since the last evaluation.

        Note: Threading does not speed up the calculations as they are CPU bound and in python only I/O operations will benefit from this parallelism
              GPU improvement would reduce the time of the exploration.

        Parameters
        ----------
        X : np.ndarray
            Training data, indexed by column and passed to the feature constructors.

        y : np.ndarray
            Class labels.

        graph : object
            Graph providing ``get_original_ids``, ``get_initial_nodes`` and
            ``get_neighbours``.

        random_generator : object
            Forwarded to ``self.choose_next`` to pick the next node.

        parallel : bool
            If true, the heuristic of the CONSTRUCTION neighbours is computed
            through a ``ThreadPoolExecutor``.

        max_errors : int, default=0
            Number of consecutive non-improving evaluations tolerated before
            the exploration stops.

        Returns
        -------
        The leave-one-out score of ``self.current_features``, also stored in
        ``self.final_score``.
        """
        self.step = math.ceil(math.log2(X.shape[1]))
        self.current_features = []
        selected_nodes = set()
        constructed_nodes = set()
        classifier = NaiveBayes(encode_data=False, metric=self.metric)
        # -np.inf instead of np.NINF: the alias was removed in NumPy 2.0.
        # The value is overwritten with ``score`` when the loop starts.
        current_score = -np.inf
        score = 0
        if self.use_initials:
            # Start from the full set of original features
            self.current_features = [
                DummyFeatureConstructor(j) for j in range(X.shape[1])
            ]
            classifier.fit(X, y)
            current_transformed_features_numpy = np.concatenate(
                [f.transform(X) for f in self.current_features], axis=1)
            score = self.evaluate_loo(self.current_features, classifier,
                                      current_transformed_features_numpy, y)
            current_score = score
            selected_nodes.update(graph.get_original_ids())
        if len(self.current_features) == 0:
            current_transformed_features_numpy = None

        initial, pheromones, heuristics = graph.get_initial_nodes(
            selected_nodes)

        probabilities = self.compute_probability(pheromones, heuristics)
        index = self.choose_next(probabilities, random_generator)
        node_id, selected_node = initial[index]

        # SU variable contains the MIFS-SU for the selected variable
        current_su = 0
        su = heuristics[index]

        is_fitted = self.use_initials
        feature_constructor = None
        n_errors = 0
        number_steps = 1
        while True:
            current_score = score
            if selected_node[1] is None:
                # Original Feature
                feature_constructor = DummyFeatureConstructor(selected_node[0])
                selected_nodes.add(node_id)
            else:
                # Need to construct next feature and compute heuristic value for the feature to replace temporal su from half-var
                neighbours, pheromones = graph.get_neighbours(
                    selected_node, constructed_nodes, step="CONSTRUCTION")

                if len(neighbours) == 0:
                    break
                if self.beta != 0:
                    if parallel:
                        with concurrent.futures.ThreadPoolExecutor(
                        ) as executor:
                            futures = []
                            for neighbour in neighbours:
                                futures.append(
                                    executor.submit(
                                        self.compute_neighbour_sufs,
                                        neighbour=neighbour,
                                        transformed_features=
                                        current_transformed_features_numpy,
                                        constructors=self.current_features,
                                        selected_node=selected_node,
                                        current_su=current_su,
                                        X=X,
                                        y=y))
                            concurrent.futures.wait(
                                futures,
                                timeout=None,
                                return_when=concurrent.futures.ALL_COMPLETED)
                            su = [future.result() for future in futures]
                    else:
                        su = [
                            self.compute_neighbour_sufs(
                                neighbour=neighbour,
                                transformed_features=
                                current_transformed_features_numpy,
                                selected_node=selected_node,
                                constructors=self.current_features,
                                current_su=current_su,
                                X=X,
                                y=y) for neighbour in neighbours
                        ]
                else:
                    # Avoid unnecessary evaluation
                    su = np.ones(len(neighbours))

                probabilities = self.compute_probability(
                    pheromones, np.array(su))
                index = self.choose_next(probabilities, random_generator)

                su = su[index]
                feature_constructor = create_feature(
                    neighbours[index][2],
                    [selected_node, neighbours[index][1]])
                # Remember the (node, node, operator) triple already built
                constructed_nodes.add(
                    frozenset(
                        (node_id, neighbours[index][0], neighbours[index][2])))
                node_id, selected_node = neighbours[index][:2]

            # Assess new feature
            transformed_feature = feature_constructor.transform(X)
            if is_fitted:
                classifier.add_features(transformed_feature, y)
            else:
                classifier.fit(transformed_feature, y)
                is_fitted = True
            if current_transformed_features_numpy is None:
                current_transformed_features_numpy = transformed_feature
            else:
                current_transformed_features_numpy = append_column_to_numpy(
                    current_transformed_features_numpy, transformed_feature)
            if number_steps >= self.step:
                # Enough steps since the last evaluation: compute the LOO score
                score = self.evaluate_loo(
                    self.current_features + [feature_constructor], classifier,
                    current_transformed_features_numpy, y)
                if score <= current_score:
                    if n_errors >= max_errors:
                        # The column of the rejected feature is dropped after
                        # the loop
                        break
                    else:
                        n_errors += 1
                else:
                    n_errors = 0
                number_steps = 0
            else:
                number_steps += 1
            current_su = su
            self.current_features.append(feature_constructor)
            current_score = score
            # Select next
            neighbours, pheromones = graph.get_neighbours(selected_node,
                                                          selected_nodes,
                                                          step="SELECTION")

            # Compute heuristic
            su = []
            if len(neighbours) == 0:
                break
            if self.beta != 0:
                for neighbour, pheromone in zip(neighbours, pheromones):
                    if neighbour[1][1] is None:
                        # Original variable
                        su.append(
                            self.compute_sufs_cached(
                                current_su,
                                current_transformed_features_numpy,
                                X[:, neighbour[1][0]],
                                self.current_features,
                                DummyFeatureConstructor(neighbour[1][0]),
                                y,
                                minimum=0))
                    else:
                        # This is a temporal variable that will not be finally selected but only used to calculate the heuristic
                        su.append(
                            self.compute_sufs_cached(
                                current_su,
                                current_transformed_features_numpy,
                                X[:, neighbour[1][0]] == neighbour[1][1],
                                self.current_features,
                                FeatureOperand(feature_index=neighbour[1][0],
                                               value=neighbour[1][1]),
                                y,
                                minimum=0))

            else:
                su = np.ones(len(neighbours))
            probabilities = self.compute_probability(pheromones, np.array(su))
            index = self.choose_next(probabilities, random_generator)

            su = su[index]
            node_id, selected_node = neighbours[index][:2]
        # A feature rejected by the last evaluation left one extra column
        # NOTE(review): current_transformed_features_numpy is still None if the
        # loop broke before any feature was assessed - confirm that cannot happen.
        if current_transformed_features_numpy.shape[1] > len(
                self.current_features):
            current_transformed_features_numpy = np.delete(
                current_transformed_features_numpy, -1, axis=1)
        self.final_score = self.evaluate_loo(
            self.current_features, classifier,
            current_transformed_features_numpy, y)

        return self.final_score