# Script fragment (Python 2 syntax: print statements).
# Loads the training features of one subject, drops feature columns that
# contain too many NaNs and prints a short summary of the resulting data.
from sklearn.cross_validation import StratifiedKFold, LeavePLabelOut, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import xgboost as xgb
from matplotlib import pyplot as plt
# NOTE(review): `np` is used below but is never imported explicitly in this
# fragment; presumably it is re-exported by the star import -- confirm.
from utils import *
from spp_00_load_data import load_features

# Subject id and feature group(s) handed to load_features.
nsubject = 1
# feat_select = ['stat']
feat_select = ['sp_entropy']
# feat_select = ['mfj']

# Previous loading path, kept for reference:
# filename_tr = 'sp2016_feat_train_{0}_stat_20160915'.format(nsubject)
# Xt, y, aFeatNames, aFiles_tr, plabels, data_q = get_from_10_min(filename_tr)
X, y, aFeatNames, aFiles_tr, plabels, data_q = load_features(
    'train', nsubject, feat_select)

# Keep only the columns of X that have fewer than 50 NaN entries.
# NOTE(review): aFeatNames is not filtered with the same mask, so the names
# printed below may no longer line up with the columns of X.
ind = np.sum(np.isnan(X), axis=0) < 50
X = X[:, ind]

print 'Subject: ', nsubject
print 'Original dataset'
print X.shape
print y.shape
print list(aFeatNames)

# Flatten the labels to one dimension.
y = y.ravel()

# Commented-out XGBoost parameter grid (continues past this fragment):
# clf = xgb.XGBClassifier()
# parameters = {
#     'max_depth': range(2, 31, 5),
#     'gamma': [0, 2],
# sall = ['stat', 'spectral', 'sp_entropy', 'mfj', 'corr'] # feat_select = [sall, sall, sall] feat_select = [['stat'], ['stat'], ['stat']] REMOVE_COVARIATE_SHIFT = True pp = PreprocessPipeline(remove_outliers=True, standardize=True) d_data_train = dict() d_data_test = dict() for i in range(0, 3): nsubject = i + 1 XTRAIN, ytrain, aFeatNames_tr, aFiles_tr, plabels_tr, data_q_tr = load_features( 'train', nsubject, feat_select[i]) XTEST, ytest, aFeatNames_ts, aFiles_ts, plabels_ts, data_q_ts = load_features( 'test', nsubject, feat_select[i]) pp.fit(XTRAIN, XTEST, drop_nan=True) print 'Subject: ', nsubject print 'Original dataset' print XTRAIN.shape print ytrain.shape XTRAIN, ytrain, plabels_tr = drop_data_quality_thr(XTRAIN, ytrain, plabels_tr, data_q_tr, 10) XTRAIN = pp.transform(XTRAIN)
# Script fragment (Python 2 syntax: print statements).
# Loads train and test features for one subject, passes them through
# preprocess_pipeline and then filters the training rows with
# drop_data_quality_thr.
import numpy as np
from matplotlib import pyplot as plt
import sys
from utils import *
from spp_00_load_data import load_features

# Subject id and feature group(s) handed to load_features.
nsubject = 1
# feat_select = ['stat']
# feat_select = ['spectral']
# feat_select = ['sp_entropy']
feat_select = ['mfj']
# feat_select = ['spectral', 'sp_entropy']

XTRAIN, ytrain, aFeatNames, aFiles_tr, plabels, data_q = load_features(
    'train', nsubject, feat_select)
# Only the matrix, labels and feature names of the test set are kept under
# meaningful names; the remaining return values go to dummy variables.
XTEST, ytest, aFeatNames_ts, dummy4, dummy5, dummy3 = load_features(
    'test', nsubject, feat_select)

# preprocess_pipeline returns new versions of the train data, the test matrix
# and the accompanying names, labels and quality values.
XTRAIN, ytrain, XTEST, aFeatNames, plabels, data_q = \
    preprocess_pipeline(XTRAIN, ytrain, XTEST, aFeatNames, plabels, data_q, verbose=True)

# NOTE(review): printed after preprocess_pipeline has already run, so the
# 'Original dataset' shapes are those of the preprocessed data.
print 'Original dataset'
print 'TRAIN:', XTRAIN.shape
print 'ytrain', ytrain.shape

# Threshold passed to drop_data_quality_thr together with data_q.
thr = 10
XTRAIN, ytrain, plabels = drop_data_quality_thr(XTRAIN, ytrain, plabels, data_q, thr)
print '\nRemoved data quality with treshold: ', thr
# Script fragment (Python 2 syntax: print statements).
# Loads the training features of one subject, uses a copy of them as the
# "test" set, drops feature columns with too many NaNs and then calls
# drop_nan on both sets.
from python.utils_learning import OutliersWinsorization
from sklearn.linear_model import lasso_stability_path
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from spp_00_load_data import load_features
# NOTE(review): `np` and `drop_nan` are used below without an explicit import;
# presumably they come from this star import -- confirm.
from utils import *

nsubject = 2
# NOTE(review): not referenced in the visible part of the script.
bappend_test = False
# feat_select = ['stat']
feat_select = ['mfj', 'sp_entropy']
# feat_select = ['spectral', 'sp_entropy']

X_train, y_train, aFeatNames, aFiles_tr, p, data_q = load_features(
    'train', nsubject, feat_select)
# Loading the real test set is disabled; the "test" variables below are copies
# of (or aliases to) the training data.
# X_test, y_test, dummy1, aFiles_ts, dummy2, dummy3 = load_features('test', nsubject, feat_select)
X_test = X_train.copy()
y_test = y_train.copy()
aFiles_ts = aFiles_tr

# Keep only the columns with fewer than 50 NaN entries in the training matrix,
# and apply the same column mask to the feature names and the test matrix.
ind = np.sum(np.isnan(X_train), axis=0) < 50
X_train = X_train[:, ind]
aFeatNames = [s for i, s in enumerate(aFeatNames) if ind[i] == True]
X_test = X_test[:, ind]

# The bare string below is an expression statement used as a section marker.
''' drop nans '''
# drop_nan takes three inputs; a copy of the labels fills the third slot and
# the third return value is discarded into a dummy variable.
X_train, y_train, dummy4 = drop_nan(X_train, y_train, y_train.copy())
X_test, y_test, dummy5 = drop_nan(X_test, y_test, y_test.copy())

print 'Subject: ', nsubject
def load_features_and_preprocess(nsubject, feat_select, settings, verbose=True): qthr = settings.qthr remove_covariates = settings.remove_covariate_shift remove_outliers = settings.remove_outliers standardize = settings.standardize drop_nan = settings.drop_nan # XTRAIN, ytrain, aFeatNames_tr, aFiles_tr, plabels_tr, data_q_tr = load_features('train', nsubject, feat_select) # XTEST, ytest, aFeatNames_ts, aFiles_ts, plabels_ts, data_q_ts = load_features('test', nsubject, feat_select) data_tr = load_features('train', nsubject, feat_select) data_ts = load_features('test', nsubject, feat_select) XTRAIN, ytrain, plabels_tr, data_q_tr = data_tr['X'], data_tr[ 'y'], data_tr['plabels'], data_tr['data_q'] XTEST, ytest, plabels_ts = data_ts['X'], data_ts['y'], data_ts['plabels'] aFeatNames_tr = data_tr['aFeatNames'] aFeatNames_ts = data_ts['aFeatNames'] # data['X'] = X # data['y'] = y # data['aFeatNames'] = afeatnames # data['aFiles'] = aFiles # data['plabels'] = plabels # data['plabels_10min'] = p_labels_10min # data['data_q'] = data_q if verbose: print '############ Subject: ', nsubject, ' ########### ' print ' -- Features: ', '_'.join(feat_select) pp = PreprocessPipeline(remove_outliers=remove_outliers, standardize=standardize) pp.fit(XTRAIN, XTEST) if verbose: print ' -- Original dataset' print 'TRAIN:', XTRAIN.shape print 'ytrain', ytrain.shape if drop_nan: XTRAIN, ytrain, plabels_tr, ind_nan_tr = drop_data_quality_thr( XTRAIN, ytrain, plabels_tr, data_q_tr, qthr) ind_nan_tr = ind_nan_tr[~ind_nan_tr] else: ind_nan_tr = np.any(np.isnan(XTRAIN), axis=1) ind_nan_ts = np.any(np.isnan(XTEST), axis=1) XTEST[ind_nan_ts] = 0 ytest = ytest.ravel() plabels_tr = plabels_tr.ravel() plabels_ts = plabels_ts.ravel() # XTEST, ytest, plabels_ts, ind_nan_ts = drop_data_quality_thr(XTEST, ytest, plabels_ts, data_q_ts, qthr) if verbose: print ' -- Removed data quality with treshold: ', qthr print 'TRAIN :', XTRAIN.shape print 'ytrain:', ytrain.shape print 'XTEST :', XTEST.shape print 
'ytest:', ytest.shape XTRAIN = pp.transform(XTRAIN) XTEST = pp.transform(XTEST) if remove_covariates: l_feat_remove = load_removed_features(nsubject, feat_select) # l_feat_remove_all = load_removed_features(nsubject, ['stat_spectral_sp_entropy_mfj_corr']) # l_feat_remove += l_feat_remove_all XTRAIN, aFeatNames_tr, ind_remove = remove_features_by_name( XTRAIN, aFeatNames_tr, l_feat_remove) XTEST, aFeatNames_ts, ind_remove = remove_features_by_name( XTEST, aFeatNames_ts, l_feat_remove) if verbose: print '-- Removed features with covariate shift: ' print 'TRAIN :', XTRAIN.shape print 'XTEST :', XTEST.shape data_tr['X'] = XTRAIN data_tr['y'] = ytrain data_tr['aFeatNames'] = aFeatNames_tr data_tr['plabels'] = plabels_tr data_tr['ind_nan'] = ind_nan_tr data_ts['X'] = XTEST data_ts['y'] = ytest data_ts['plabels'] = plabels_ts data_ts['aFeatNames'] = aFeatNames_ts data_ts['ind_nan'] = ind_nan_ts # data_tr = [XTRAIN, ytrain, aFeatNames_tr, aFiles_tr, plabels_tr, data_q_tr, ind_nan_tr] # data_ts = [XTEST, ytest, aFeatNames_ts, aFiles_ts, plabels_ts, data_q_ts, ind_nan_ts] return data_tr, data_ts