Ejemplo n.º 1
0
# Per-k results: mean cross-validation score on the subs == 1 samples and
# accuracy on the subs == 0 samples.
cv_scores = []
scores_validation = []

# The original author describes `anova_svc` as a composite estimator: a
# pipeline of feature selection followed by an SVC. A parameter of a pipeline
# step is addressed as '<step name>__<parameter name>', so the parameter 'k'
# of the step called 'anova' is tuned under the key 'anova__k'.
# n_jobs=-1 is passed because it can make the grid search go much faster.
k_grid = {'anova__k': k_range}
grid = GridSearchCV(anova_svc, param_grid=k_grid, n_jobs=-1)

# Cross-validate the grid search itself and report the mean score next to the
# chance level of 1 / n_conditions.
nested_cv_scores = cross_val_score(grid, X, y)
classification_accuracy = np.mean(nested_cv_scores)
chance_level = 1. / n_conditions
print("Classification accuracy: %.4f / Chance level: %f" %
      (classification_accuracy, chance_level))

# Samples with subs == 1 are used for cross-validation and fitting, samples
# with subs == 0 are only predicted on.
is_train = subs == 1
is_held_out = subs == 0

for k in k_range:
    # presumably `feature_selection` is the 'anova' step of `anova_svc`;
    # verify against the pipeline definition.
    feature_selection.k = k

    fold_scores = cross_val_score(anova_svc, X[is_train], y[is_train])
    cv_scores.append(np.mean(fold_scores))
    print("CV score: %.4f" % cv_scores[-1])

    anova_svc.fit(X[is_train], y[is_train])
    y_pred = anova_svc.predict(X[is_held_out])
    held_out_accuracy = np.mean(y_pred == y[is_held_out])
    scores_validation.append(held_out_accuracy)
    print("score validation: %.4f" % scores_validation[-1])


# ---STEP 5---
# Original author's note: flipping the matrix back into an image. The lines
# below only read the SVC coefficients and print them.
coef = svc.coef_
print(coef)
Ejemplo n.º 2
0
# Pipeline with a feature-selection step named 'anova' followed by a
# classifier step named 'svc'.
anova_svc = Pipeline([('anova', feature_selection), ('svc', svc)])

### Cross validation ##########################################################

# Fit on the complete data set and predict on that same data.
anova_svc.fit(X, y)
y_pred = anova_svc.predict(X)

from sklearn.cross_validation import LeaveOneLabelOut, cross_val_score
# NOTE(review): `cv` is built here but is not passed to cross_val_score in the
# loop below -- confirm whether cv=cv was intended.
cv = LeaveOneLabelOut(session[session < 10])

# Numbers of selected features to compare.
k_range = [10, 15, 30, 50, 150, 300, 500, 1000, 1500, 3000, 5000]
cv_scores = []
scores_validation = []

for k in k_range:
    # `feature_selection` is the 'anova' step of the pipeline, so changing its
    # `k` changes how many features the pipeline keeps.
    feature_selection.k = k
    cv_scores.append(np.mean(
        cross_val_score(anova_svc, X[session < 10], y[session < 10])))
    # "%s" formatting emits the same text as the former Python 2 statement
    # `print "CV score", value` and is also valid on Python 3.
    print("CV score %s" % (cv_scores[-1],))

    # Fit on sessions below 10 and measure accuracy on session 10.
    anova_svc.fit(X[session < 10], y[session < 10])
    y_pred = anova_svc.predict(X[session == 10])
    scores_validation.append(np.mean(y_pred == y[session == 10]))
    print("score validation %s" % (scores_validation[-1],))


from matplotlib import pyplot as plt
# Plot both score curves against the position in k_range, labelling the ticks
# with the k values themselves.
plt.figure(figsize=(6, 4))
plt.plot(cv_scores, label='Cross validation scores')
plt.plot(scores_validation, label='Left-out validation data scores')
plt.xticks(np.arange(len(k_range)), k_range)
# Baseline: score of the already-fitted `clf` on the test split.
score = clf.score(test_X, test_Y)
# The "%s" forms below emit the same text as the former Python 2 print
# statements and are also valid on Python 3. A bare `print` does nothing on
# Python 3, so the blank separator line is written with print("").
print("decision tree baseline  %s" % (score,))
print("")

## try again with feature selections
## kbest
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.pipeline import Pipeline

# NOTE(review): f_regression is used as the score function in front of a
# classifier; f_classif is the usual choice for class labels -- confirm.
anova_filter = SelectKBest(f_regression, k=5)
clf = tree.DecisionTreeClassifier(max_depth=6)
# The second step is named 'svc' although it holds the decision tree.
anova_clf = Pipeline([('anova', anova_filter), ('svc', clf)])

# Refit the pipeline for every k from 2 up to the number of columns in the
# first training row, and report the test score for each.
for x in range(2, len(features_train[0]) + 1, 1):
    anova_filter.k = x
    pipeline = anova_clf.fit(features_train, labels_train)
    print("feat kbest : %s  score:  %s"
          % (x, pipeline.score(test_X, test_Y)))
print("")

##
## variance threshold
# Pipeline of a VarianceThreshold filter followed by a depth-6 decision tree.
# The step names 'anova' and 'svc' are kept as written even though the steps
# hold a variance filter and a tree.
from sklearn.feature_selection import VarianceThreshold
anova_filter = VarianceThreshold()
clf = tree.DecisionTreeClassifier(max_depth = 6)
anova_clf = Pipeline([('anova', anova_filter), ('svc', clf)])

# With n = len(features_train[0]), x runs from 0 to n and the threshold is
# x / (5 * n), i.e. n + 1 evenly spaced values from 0 to 1/5. The pipeline is
# refitted for each threshold.
# NOTE(review): the fitted pipeline is not scored or printed in the visible
# lines; the loop body may continue beyond this excerpt.
for x in range(0,len(features_train[0])+1,1):
	variance = x/float(len(features_train[0])*5 )
	anova_filter.threshold = variance
	pipeline = anova_clf.fit(features_train, labels_train)
Ejemplo n.º 4
0
    # Load the training data for (data[0], data[1]) and the test data for
    # (data[0], data[1] + 1).
    X_train, y_train = get_data(data[0], data[1])
    X_test, y_test = get_data(data[0], data[1]+1)

    # Number of columns of the training matrix.
    feature_len = np.size(X_train, 1)

    # The selector is fitted once with k="all"; the loop below only changes
    # its `k` before calling transform(). Warnings raised while fitting are
    # silenced.
    selectK = SelectKBest(f_classif, k="all")
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        selectK.fit(X_train, y_train)

    # k goes from 500 in steps of 100 and always ends with the full feature
    # count. list(...) keeps the concatenation valid where range() does not
    # return a list (Python 3).
    for k in list(range(500, feature_len, 100)) + [feature_len]:
        # Same text as the former Python 2 statement `print "k = ", k`.
        print("k =  %s" % (k,))
        sys.stdout.flush()
        selectK.k = k

        X_train_Sel = selectK.transform(X_train)
        data_train = xgb.DMatrix(X_train_Sel, label=y_train)

        # Wall-clock training time in seconds, rounded to 2 decimals.
        start = time.time()
        bst = xgb.train(param, data_train, num_round)
        train_time = round(time.time() - start, 2)

        X_test_Sel = selectK.transform(X_test)
        data_test = xgb.DMatrix(X_test_Sel, label=y_test)

        # Wall-clock prediction time in seconds, rounded to 2 decimals.
        start = time.time()
        prob = bst.predict(data_test)
        test_time = round(time.time() - start, 2)
Ejemplo n.º 5
0
        ("svm", LinearSVC()),
    ])
    # More than 20 is too much
    params = {"select__k": list(range(2, 20))}

    # Run 2 jobs at the same time, also print the progress into console
    # Here I use StratifiedKFold with 10 folds as CV for searching
    searcher = GridSearchCV(estimator,
                            params,
                            scoring="f1",
                            n_jobs=2,
                            cv=StratifiedKFold(labels, 10),
                            verbose=1)
    searcher.fit(features, labels)

    selector.k = searcher.best_params_["select__k"]

else:
    # The result I got is 9
    selector.k = 9

# Fit the selector and keep only the selected columns of `features`.
features = selector.fit_transform(features, labels)

# Names of the selected features, picked with the selector's support mask.
# The first entry of all_features is "poi", which isn't a feature, so it is
# dropped before indexing.
candidate_names = np.array(all_features[1:])
support_mask = selector.get_support()
selected_features = candidate_names[support_mask]

sys.stdout.write("Done\n")

# Announce the next stage and flush, since the message has no trailing
# newline.
sys.stdout.write("Generating final dataset...   ")
sys.stdout.flush()
Ejemplo n.º 6
0
# Stratified k-fold evaluation of `clf` on the k best features chosen by
# SelectKBest, for k = 10, 20, ..., 100.
k_values = np.arange(10, 101, 10)

# random_state=42 was dropped: shuffle is left at its default (off), so the
# seed had no effect, and recent scikit-learn releases reject a random_state
# when shuffling is off.
# NOTE(review): pass shuffle=True, random_state=42 instead if shuffled folds
# were intended.
skf = StratifiedKFold(n_splits=n_splits)

# perfs[i] accumulates the accuracy obtained with k_values[i] over all folds.
perfs = np.zeros(len(k_values))
# Folds are numbered from 1 so the counter can be printed directly.
for fold_index, (train_index, test_index) in enumerate(skf.split(x, y),
                                                       start=1):
    print("fold:", fold_index)

    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # The selector is fitted once per fold on the training part; inside the
    # inner loop only `k` is changed before calling transform().
    selector = SelectKBest(score_func=score_func)
    selector.fit(x_train, y_train)

    for i, k in enumerate(k_values):
        selector.k = k
        x_train_selected = selector.transform(x_train)
        x_test_selected = selector.transform(x_test)

        clf.fit(x_train_selected, y_train)
        y_pred = clf.predict(x_test_selected)

        accu = accuracy_score(y_test, y_pred)
        print("selected features:", k, "accu:", accu)
        perfs[i] += accu

print("n_splits:", n_splits)
print(FILTER_METHOD, DATASET, FINAL_CLASSIFIER)
# Average over the number of folds actually run (was a hard-coded 5).
perfs /= n_splits