def experiment(dataset_directory):
    """Run the feature-selection experiment on one dataset and save the results.

    For every ``k`` in 5, 10, ..., 45 and every selection metric (chi2,
    random, infogain) a selector is built from the training split, the
    training and test data are reduced to the selected features, and the
    reduced data is scored with ``selection.train_test_eval`` on both the
    training and the test split.  Afterwards two baselines using all the
    features are scored: the default model (recorded as ``logreg``) and
    ``model='svm'``.

    Side effects:
      * progress messages and metrics are printed to standard output;
      * one ``features_<metric>_<k>.csv`` file per (metric, k) pair is
        written into ``dataset_directory``;
      * every score is written to ``feature_selection_experiment.csv`` in
        the same directory.

    Args:
        dataset_directory: location of the dataset, handed to
            ``loading.load_dataset``.  It must support joining with ``/``
            (i.e. be a path object), because the output file names are
            built that way.

    Returns:
        None.
    """
    # The validation split is loaded but not used by this experiment.
    train, _validation, test = loading.load_dataset(dataset_directory)
    features = train.feature_names

    values_of_k = range(5, 50, 5)

    metrics = [
        selection.metric_chi2,
        selection.metric_random,
        selection.metric_infogain
    ]

    results = []

    def evaluate_and_record(metric_label, kfeatures, train_data, test_data,
                            **eval_options):
        """Score train and test data, print the metrics and store one row.

        ``eval_options`` is forwarded unchanged to
        ``selection.train_test_eval`` (e.g. ``model='svm'``).
        """
        train_acc, train_f1, train_pr_auc = selection.train_test_eval(
            train_data, train.target, train_data, train.target,
            **eval_options)
        test_acc, test_f1, test_pr_auc = selection.train_test_eval(
            train_data, train.target, test_data, test.target,
            **eval_options)
        selection.print_metrics(train_acc, train_f1, train_pr_auc,
                                test_acc, test_f1, test_pr_auc)
        results.append(dict(
            kfeatures=kfeatures,
            metric=metric_label,
            train_accuracy=train_acc,
            train_f1=train_f1,
            train_pr_auc=train_pr_auc,
            test_accuracy=test_acc,
            test_f1=test_f1,
            test_pr_auc=test_pr_auc,
        ))

    for kfeatures in values_of_k:
        for metric in metrics:
            metric_name = metric.__name__
            print("")
            print("Testing k=%d, metric=%s" % (kfeatures, metric_name))

            selector = selection.feature_selector(metric, kfeatures,
                                                  train.data, train.target)
            selected_indices = selection.get_selected_feature_indices(selector)

            evaluate_and_record(
                metric_name, kfeatures,
                selection.filter_features(selector, train.data),
                selection.filter_features(selector, test.data))

            # Format the file name first and only then join it to the
            # directory: '/' and '%' have the same precedence, so without
            # this split the '%' would be applied to the joined path rather
            # than to the file-name template.
            file_name = "features_%s_%d.csv" % (metric_name, kfeatures)
            output_name = dataset_directory / file_name
            with open(output_name, 'wb') as out:
                selection.list_selected(features, selected_indices, out=out)
                print("Features saved to %s" % output_name)

    print("")
    print("Using all the features (Logistic Regression):")
    evaluate_and_record('logreg', len(features), train.data, test.data)

    print("")
    print("Using all the features (SVM):")
    evaluate_and_record('svm', len(features), train.data, test.data,
                        model='svm')

    experiment_stats = dataset_directory / "feature_selection_experiment.csv"

    # Binary mode: the Python 2 csv module expects a file opened with 'b'.
    with open(experiment_stats, 'wb') as stats:
        writer = csv.DictWriter(stats, fieldnames=(
            'metric', 'kfeatures',
            'train_accuracy', 'train_f1', 'train_pr_auc',
            'test_accuracy', 'test_f1', 'test_pr_auc'))
        writer.writeheader()
        writer.writerows(results)

    print("Saved results in %s" % experiment_stats)
# Example #2
# 0
from loading import load_dataset
import selection as s

# Demo script: load the 'test_data' dataset, evaluate with every feature,
# then repeat the evaluation with the 2 features chosen by each metric.
print("")
print("Loading a test dataset")
train, validation, test = load_dataset('test_data')

print("")
print("With all features...")
acc, f1, auc = s.train_test_eval(train.data, train.target, test.data, test.target)
s.print_metrics(acc, f1, auc)

features = train.feature_names
kfeatures = 2

# Each entry pairs the heading printed before a run with the metric it uses.
for heading, metric in (
        ("With Chi-squared...", s.metric_chi2),
        ("With random...", s.metric_random),
        ("With infogain...", s.metric_infogain)):
    print("")
    print(heading)
    # Whatever select_and_eval returns is passed on to print_selected
    # together with the feature names.
    selected = s.select_and_eval(metric, kfeatures, train.data, train.target,
                                 test.data, test.target)
    s.print_selected(features, selected)
def _evaluate_split_pair(train_data, train_target, test_data, test_target,
                         **model_options):
    """Score the training data itself and a second data set, and print both.

    Calls ``selection.train_test_eval`` twice with ``train_data`` /
    ``train_target`` as the training arguments: first evaluating on the
    training data, then on ``test_data`` / ``test_target``.  The six scores
    are printed with ``selection.print_metrics`` and returned as a dict keyed
    by the column names of the results CSV.  ``model_options`` is forwarded
    unchanged to ``selection.train_test_eval``.
    """
    train_acc, train_f1, train_pr_auc = selection.train_test_eval(
        train_data, train_target, train_data, train_target, **model_options)
    test_acc, test_f1, test_pr_auc = selection.train_test_eval(
        train_data, train_target, test_data, test_target, **model_options)
    selection.print_metrics(train_acc, train_f1, train_pr_auc,
                            test_acc, test_f1, test_pr_auc)
    return dict(
        train_accuracy=train_acc,
        train_f1=train_f1,
        train_pr_auc=train_pr_auc,
        test_accuracy=test_acc,
        test_f1=test_f1,
        test_pr_auc=test_pr_auc,
    )


def experiment(dataset_directory):
    """Compare feature-selection metrics on a dataset and write CSV reports.

    The dataset is loaded with ``loading.load_dataset``.  For each number of
    features ``k`` in ``range(5, 50, 5)`` and each of the chi2, random and
    infogain metrics, the selected features are evaluated on the training and
    test splits and their names are written to
    ``features_<metric>_<k>.csv``.  Two runs with all the features follow
    (default model, recorded as ``logreg``; and ``model='svm'``).  Finally
    all collected scores are written to
    ``feature_selection_experiment.csv``.  Progress is printed to standard
    output along the way.

    Args:
        dataset_directory: dataset location; must be a path object that
            supports the ``/`` join operator, since both output paths are
            derived from it that way.

    Returns:
        None.
    """
    # Only the train and test splits are used; validation is ignored here.
    train, _validation, test = loading.load_dataset(dataset_directory)
    features = train.feature_names

    values_of_k = range(5, 50, 5)

    metrics = [
        selection.metric_chi2, selection.metric_random,
        selection.metric_infogain
    ]

    results = []

    for kfeatures in values_of_k:
        for metric in metrics:
            metric_name = metric.__name__
            print("")
            print("Testing k=%d, metric=%s" % (kfeatures, metric_name))

            selector = selection.feature_selector(metric, kfeatures,
                                                  train.data, train.target)
            selected_indices = selection.get_selected_feature_indices(selector)

            train_data_selected = selection.filter_features(
                selector, train.data)
            test_data_selected = selection.filter_features(selector, test.data)

            scores = _evaluate_split_pair(train_data_selected, train.target,
                                          test_data_selected, test.target)
            results.append(
                dict(scores, kfeatures=kfeatures, metric=metric_name))

            # The parentheses matter: '/' and '%' bind equally and group
            # left-to-right, so the unparenthesised form would apply the
            # string formatting to the joined path instead of to the
            # file-name template.
            output_name = dataset_directory / (
                "features_%s_%d.csv" % (metric_name, kfeatures))
            with open(output_name, 'wb') as out:
                selection.list_selected(features, selected_indices, out=out)
                print("Features saved to %s" % output_name)

    # Baselines that skip feature selection: (row label, banner, options
    # forwarded to selection.train_test_eval).
    baselines = (
        ('logreg', "Using all the features (Logistic Regression):", {}),
        ('svm', "Using all the features (SVM):", {'model': 'svm'}),
    )
    for baseline_name, banner, model_options in baselines:
        print("")
        print(banner)
        scores = _evaluate_split_pair(train.data, train.target,
                                      test.data, test.target,
                                      **model_options)
        results.append(
            dict(scores, kfeatures=len(features), metric=baseline_name))

    experiment_stats = dataset_directory / "feature_selection_experiment.csv"

    # 'wb' because the Python 2 csv module writes to binary-mode files.
    with open(experiment_stats, 'wb') as stats:
        writer = csv.DictWriter(stats,
                                fieldnames=('metric', 'kfeatures',
                                            'train_accuracy', 'train_f1',
                                            'train_pr_auc', 'test_accuracy',
                                            'test_f1', 'test_pr_auc'))
        writer.writeheader()
        writer.writerows(results)

    print("Saved results in %s" % experiment_stats)