def do_all(subfile="EEGbears.csv"):
    """Train per-event classifiers for subjects 1-12 and write a submission CSV.

    For each subject the training data is obtained from
    ``preprocessing.preprocess``, one ``SKLearnClf`` is fitted per event
    column of the training labels, and the data returned for ``train=False``
    is scored with ``predict_proba``. The binned probabilities are passed
    through ``preprocessing.labels_to_events`` and the results for all
    subjects are concatenated into a single table.

    Args:
        subfile: Path of the CSV file to write.

    Returns:
        The ``pandas.DataFrame`` that was written to ``subfile``; it is indexed
        by the concatenated ids and has one column per event name.
    """
    event_names = [
        "HandStart",
        "FirstDigitTouch",
        "BothStartLoadPhase",
        "LiftOff",
        "Replace",
        "BothReleased",
    ]
    all_ids = []
    all_predictions = []

    for subject in range(1, 13):
        (
            features_train,
            labels_train,
            nevents,
            _,
            _,
            ica,
            FTtstep,
            _,
        ) = preprocessing.preprocess(subject=subject)

        # One independent binary classifier per event: events can overlap in
        # time, so a single multi-class model could not predict combinations.
        classifiers = [SKLearnClf() for _ in range(nevents)]
        for column, clf in enumerate(classifiers):
            clf.fit(features_train, labels_train[:, column])

        # Prepare the test data, reusing the ica object returned for the
        # training data.
        features_test, _, _, _, ntesttimes, _, _, ids = preprocessing.preprocess(
            subject=subject, train=False, ica=ica
        )
        all_ids.append(ids)

        # Probability of the positive class for every time bin and event.
        binned_probs = np.zeros((features_test.shape[0], nevents))
        for column, clf in enumerate(classifiers):
            binned_probs[:, column] = clf.predict_proba(features_test)[:, 1]

        all_predictions.append(
            preprocessing.labels_to_events(binned_probs, FTtstep, ntesttimes)
        )
        print("Finished subject " + str(subject) + ".")

    # Assemble the submission table and write it to disk.
    submission = pd.DataFrame(
        data=np.concatenate(all_predictions),
        index=np.concatenate(all_ids),
        columns=event_names,
    )
    submission.to_csv(subfile, index_label="id", float_format="%.3f")
    return submission
def crossvalidation(subject=1):
    """Fit per-event classifiers on series 1-6 and evaluate them on series 7-8.

    The function prints the preprocessing and training durations, the
    classifier ``score`` values on the training and held-out sets, and the
    ROC results returned by ``ROCcurve`` both for the binned predictions and
    for the output of ``preprocessing.labels_to_events``.

    Timing uses ``time.perf_counter`` because ``time.clock`` was removed in
    Python 3.8; both printed durations are elapsed intervals.

    Args:
        subject: Subject number forwarded to ``preprocessing.preprocess``.

    Returns:
        A tuple ``(predevents_cv, events_cv, features_cv, classifiers,
        mean_binned_roc, mean_real_roc)`` where the last two entries are the
        means of the values returned by ``ROCcurve`` for the binned and the
        ``labels_to_events`` predictions respectively.
    """
    start = time.perf_counter()
    (
        features_train,
        labels_train,
        nevents,
        _,
        _,
        ica,
        FTtstep,
        _,
    ) = preprocessing.preprocess(subject=subject, series=range(1, 7))
    prep_done = time.perf_counter()
    print("Preprocessing took " + str(prep_done - start) + " seconds.")

    # One independent binary classifier per event: events can overlap in
    # time, so a single multi-class model could not predict combinations.
    classifiers = [SKLearnClf() for _ in range(nevents)]
    for event, clf in enumerate(classifiers):
        clf.fit(features_train, labels_train[:, event])
    traintime = time.perf_counter() - prep_done
    print("Trained the classifiers in " + str(traintime) + " seconds.")

    # Held-out series, preprocessed with the ica object from the training call.
    features_cv, labels_cv, _, events_cv, ncvtimes, _, _, _ = preprocessing.preprocess(
        subject=subject, train=True, series=range(7, 9), ica=ica
    )
    # The original author notes events_cv arrives as an object array, so it
    # is cast to int before scoring.
    events_cv = events_cv.astype(int)

    # Naive per-event score on the training set.
    trscores = np.array(
        [
            clf.score(features_train, labels_train[:, event])
            for event, clf in enumerate(classifiers)
        ],
        dtype=float,
    )
    print("Scores on training set in binned time: " + str(trscores))

    # Naive per-event score on the held-out set.
    testscores = np.array(
        [
            clf.score(features_cv, labels_cv[:, event])
            for event, clf in enumerate(classifiers)
        ],
        dtype=float,
    )
    print("Scores on CV set in binned time: " + str(testscores))

    # ROC curves for the held-out set in binned time; one column per event.
    predlabels_cv = np.transpose(
        [clf.predict_proba(features_cv)[:, 1] for clf in classifiers]
    )
    rocscoresbinned = ROCcurve(predlabels_cv, labels_cv)
    print("For binned time...")
    print("Areas under ROC curves:")
    print(rocscoresbinned)
    print("Average ROC score:" + str(np.mean(rocscoresbinned)))

    # ROC curves for the held-out set after converting binned labels to events.
    predevents_cv = preprocessing.labels_to_events(predlabels_cv, FTtstep, ncvtimes)
    rocscoresreal = ROCcurve(predevents_cv, events_cv)
    print("For real time...")
    print("Areas under ROC curves:")
    print(rocscoresreal)
    print("Average ROC score:" + str(np.mean(rocscoresreal)))

    return (
        predevents_cv,
        events_cv,
        features_cv,
        classifiers,
        np.mean(rocscoresbinned),
        np.mean(rocscoresreal),
    )