Ejemplo n.º 1
0
def feature_selection_k(k, X, y, model_name=None, dataset_name=None):
    """Fit a feature selector keeping k features and report timing and error.

    Parameters
    ----------
    k : int
        Number of features to select.
    X, y : array-like
        Feature matrix and target vector.
    model_name : str
        "RFE" (recursive feature elimination) or "FSFS"
        (forward sequential feature selection).
    dataset_name : str, optional
        Label echoed back in the result dict.

    Returns
    -------
    (dict, selector)
        Metrics dict with keys dataset/model/k/runtime/reconstruction_error,
        plus the fitted selector.

    Raises
    ------
    ValueError
        If model_name is not one of the supported selectors.
    """
    if model_name == "RFE":
        model = RFE(n_features_to_select=k, estimator=DecisionTreeClassifier())
    elif model_name == "FSFS":
        model = SequentialFeatureSelector(n_features_to_select=k,
                                          estimator=DecisionTreeClassifier(),
                                          direction="forward")
    else:
        # BUG FIX: an unknown name previously fell through and crashed with
        # UnboundLocalError at fit_transform; fail fast with a clear message.
        raise ValueError(f"unknown model_name: {model_name!r}")

    # CPU time of the selection itself, excluding data loading.
    start = time.process_time()
    X_trans = model.fit_transform(X, y)
    end = time.process_time()
    runtime = end - start
    # Reconstruction error: map the reduced data back to the original space
    # (dropped features come back as zeros) and take the mean squared gap.
    X_re = model.inverse_transform(X_trans)
    error = ((X_re - X)**2).mean().mean()
    return {
        "dataset": dataset_name,
        "model": model_name,
        "k": k,
        "runtime": runtime,
        "reconstruction_error": error
    }, model
Ejemplo n.º 2
0
def rfe(dataset, n_components, save_to_file=False):
    """Run RFE with a linear SVR on the chosen dataset.

    Returns the mean absolute kurtosis of the selected features and the
    per-sample reconstruction error after inverse-transforming.
    """
    # Choose the dataset; anything other than 'creditcard' falls back
    # to the cancer data.
    if dataset == 'creditcard':
        X, y = load_data.load_creditcard_data()
    else:
        X, y = load_data.load_cancer_data()

    selector = RFE(SVR(kernel="linear"),
                   n_features_to_select=n_components,
                   step=1).fit(X, y)

    X_fitted = selector.transform(X)

    # Mean absolute kurtosis across the retained features.
    kurt = pd.DataFrame(X_fitted).kurt(axis=0).abs().mean()

    # Frobenius-norm reconstruction error, normalised by sample count.
    X_inverse = selector.inverse_transform(X_fitted)
    reconstruction_error = np.linalg.norm(X - X_inverse) / X.shape[0]

    if save_to_file:
        reduced = pd.DataFrame(X_fitted)
        reduced['label'] = y.values
        reduced.to_csv(OUTPUT_DIR + os.sep + dataset + '.csv')

    return kurt, reconstruction_error
Ejemplo n.º 3
0
def myFS(data, act_labels, output_folder, experiment_name, avg='binary'):
    """Sweep RFE component counts, plot recall and reconstruction error,
    then fit a final selector with the best-voted k.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature matrix (`.values` is used for the reconstruction MSE).
    act_labels : array-like
        Target labels.
    output_folder, experiment_name : str
        Plot destination directory and filename prefix.
    avg : str
        Averaging mode forwarded to recall_score.

    Returns
    -------
    (fs, final_time)
        The final fitted RFE selector and its wall-clock fit time in seconds.
    """

    # Split data, act_labels into train, test sets
    X_train, X_test, y_train, y_test = train_test_split(data,
                                                        act_labels,
                                                        test_size=0.1,
                                                        random_state=13)

    # Calculate test recall when running multiple iterations for k = 1 to num_features
    num_features = data.shape[1]
    # NOTE(review): hard cap of 25 components overrides the real feature
    # count above — presumably to bound runtime; confirm this is intended.
    num_features = 25
    values = list(range(1, num_features + 1))
    rn = np.random.RandomState(13)
    random_seeds = list(rn.randint(1, 1000000, 1))
    recalls = []
    for r in random_seeds:
        recall_temp = []
        for k in values:
            estimator = DecisionTreeClassifier(random_state=r)
            fs = RFE(estimator, n_features_to_select=k,
                     step=1).fit(X_train, y_train)
            y_pred = fs.predict(X_test)
            recall_temp.append(recall_score(y_test, y_pred, average=avg))
        recalls.append(recall_temp)

    avg_recall = np.mean(np.array(recalls), axis=0)
    recall_std = np.std(np.array(recalls), axis=0)

    # Plot the average recall for each k and include the error bars
    plt.errorbar(values, avg_recall, recall_std)
    plt.xticks(ticks=values, labels=values)
    plt.xlabel('# Components')
    plt.ylabel('Recall Score')
    plt.title('Average Recall Score for K Components Over 20 Iterations')
    plt.savefig(output_folder + '/' + experiment_name +
                '_fs_component_recall_score.png')
    plt.close()
    plt.figure()

    # Plot the reconstruction errors
    errors = []
    for r in random_seeds:
        mses = []
        for k in values:
            estimator = DecisionTreeClassifier(random_state=r)
            fs = RFE(estimator, n_features_to_select=k,
                     step=1).fit(X_train, y_train)
            trans_data = fs.transform(X_train)
            rec_data = fs.inverse_transform(trans_data)
            mses.append(MSE(rec_data, X_train.values))
        # BUG FIX: this append used to sit inside the inner k-loop, adding
        # the same (still-growing) list once per k; append once per seed so
        # the mean/std over seeds are computed from correctly shaped data.
        errors.append(mses)
    avg_errors = np.mean(np.array(errors), axis=0)
    std_errors = np.std(np.array(errors), axis=0)

    plt.errorbar(values, avg_errors, std_errors)
    # BUG FIX: tick positions previously started at 0 while the data start
    # at x=1, shifting every label by one; use the same x values for both.
    plt.xticks(ticks=values, labels=values)
    plt.xlabel('# Components')
    plt.ylabel('Reconstruction Error')
    plt.title('Average Reconstruction Error for K Components')
    plt.savefig(output_folder + '/' + experiment_name +
                '_fs_component_reconstruction_error.png')
    plt.close()
    plt.figure()

    # Run one more time with the optimal k value found above: each of the
    # three argmax criteria casts one vote for a k index.
    votes = [
        np.argmax(avg_recall),
        np.argmax(avg_recall + recall_std),
        np.argmax(avg_recall - recall_std)
    ]
    results = np.zeros(num_features)
    for v in votes:
        results[v] = results[v] + 1
    # BUG FIX: argmax returns a 0-based index into `values`, but values
    # start at k=1 — map the index back to the actual component count
    # (the old code could even request 0 features, which RFE rejects).
    k = values[np.argmax(results)]
    estimator = DecisionTreeClassifier(random_state=13)
    start_time = time.time()
    fs = RFE(estimator, n_features_to_select=k, step=1).fit(X_train, y_train)
    end_time = time.time()
    final_time = end_time - start_time

    return fs, final_time
print('\n')
# Split the dataframe: all but the last column are features, last is target.
X = df.iloc[:, 0:-1].values
y = df.iloc[:, -1].values
regr = linear_model.LinearRegression()
estimator = linear_model.LinearRegression()
# Feature ranking with recursive feature elimination.
# BUG FIX: the count must be passed as the keyword n_features_to_select —
# the positional form was deprecated and removed in scikit-learn 1.2,
# where it raises TypeError.
selector = RFE(estimator, n_features_to_select=4)
selector.fit(X, y)
print(selector.n_features_)
print('\n')
print(selector.support_)
print('\n')
print(selector.ranking_)
print('\n')
p = selector.transform(X)
q = selector.inverse_transform(p)
# Pair each column name with its RFE rank, then sort so the best-ranked
# (rank 1, i.e. selected) features come first.
l = []
for i in range(len(df.columns) - 1):
    l.append((selector.ranking_[i], df.columns[i]))
print(l)
print('\n')
l.sort()
print(l)
print('\n')
for i in range(4):
    print(l[i][1])
print('\n')
#Using SelectKBest
print("Using SelectKBest:")
import numpy
from sklearn.feature_selection import SelectKBest