def compare_classifiers(X, y, ests, scoring, trials, inner_splits, outer_splits=None, randcv_budget=20):
    """Run repeated (nested) cross-validation for every candidate estimator.

    Parameters
    ----------
    X, y : training data and labels, forwarded unchanged to ``nested_cv``.
    ests : iterable of (label, steps, p_grid) triples — a display label,
        imbalanced-learn pipeline steps, and a hyper-parameter grid.
    scoring : scoring specification forwarded to ``nested_cv``.
    trials : number of independent CV repetitions per estimator.
    inner_splits, outer_splits : fold counts for the nested CV loops.
    randcv_budget : iteration budget for randomized search inside ``nested_cv``.

    Returns
    -------
    list of (label, scores, fitted_estimators) triples, one per entry in
    ``ests``; ``scores`` is a float array of length ``trials`` and
    ``fitted_estimators`` an object array of the same length.
    """
    results = []
    for label, steps, p_grid in ests:
        scores = np.empty(trials, dtype=float)
        fitted = np.empty(trials, dtype=object)
        # TODO: RepeatedKFold / RepeatedStratifiedKFold could replace this
        # manual repetition loop.
        for i in range(trials):
            # Fresh pipeline per trial so no fitted state leaks across runs.
            candidate = imb_pipe.Pipeline(steps)
            scores[i], fitted[i] = nested_cv(
                X, y, candidate, p_grid, scoring,
                inner_splits, outer_splits, randcv_budget)
        results.append((label, scores, fitted))
    return results
def no_resample(classifier):
    """Cross-validated predictions with scaling only — no class resampling.

    ``classifier`` is a (name, estimator) step tuple appended to the
    pipeline. Returns ``(y, y_pred, counts)``: true labels, out-of-fold
    predictions from ``cross_val_predict``, and a Counter of the (raw,
    unresampled) class distribution.
    """
    print('*** NO RESAMPLE ***')
    steps = [
        ('scaler', preprocessing.StandardScaler()),
        classifier,
    ]
    model = pipeline.Pipeline(steps)
    X, y = prepare_data()
    predictions = model_selection.cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
    class_counts = Counter(y)
    return y, predictions, class_counts
def undersample(classifier):
    """Cross-validated predictions with random undersampling of majority classes.

    ``classifier`` is a (name, estimator) step tuple appended to the
    pipeline. Returns ``(y, y_pred, counts)``: true labels, out-of-fold
    predictions, and a Counter of the class distribution *after*
    undersampling (for reporting only — the pipeline refits its own
    sampler per fold).
    """
    print('*** UNDERSAMPLE ***')
    # NOTE(review): a Pipeline containing a sampler step must be the
    # imbalanced-learn Pipeline — confirm `pipeline` aliases imblearn here.
    pipe = pipeline.Pipeline([
        ('scaler', preprocessing.StandardScaler()),
        ('resample', under_sampling.RandomUnderSampler()),
        classifier,
    ])
    X, y = prepare_data()
    y_pred = model_selection.cross_val_predict(pipe, X, y, cv=cv, n_jobs=-1)
    # Fix: fit_sample() was deprecated in imbalanced-learn 0.4 and removed
    # in 0.6 — fit_resample() is the supported API. [1] returns the
    # resampled labels.
    c = Counter(under_sampling.RandomUnderSampler().fit_resample(X, y)[1])
    return y, y_pred, c
def load_rutgers_with_quantiles():
    """Collect every per-trace CSV from the Rutgers with-quantiles dataset."""
    from glob import glob
    # NOTE(review): recursive=True has no effect — the pattern contains no '**'.
    files = glob('../../featureGenerator/datasets/dataset-2-rutgers-wifi' + '/with-quantiles/*.csv', recursive=True)
    # 'df' is a file path here, not a DataFrame — the parser presumably
    # reads the CSV itself; verify against parse_rutgers_with_quantiles.
    traces = [parse_rutgers_with_quantiles(df) for df in files]
    return traces

# 10-fold stratified CV; shuffle=True with no random_state → a different
# split on every run.
cv = model_selection.StratifiedKFold(n_splits=10, shuffle=True)

# Logistic-regression pipeline: standardize, oversample minority classes, fit.
pipe_logreg = pipeline.Pipeline([
    ('scaler', preprocessing.StandardScaler()),
    ('resample', over_sampling.RandomOverSampler()),
    ('clf', linear_model.LogisticRegression()),
])

# Decision-tree pipeline sharing the same preprocessing steps.
pipe_dtree = pipeline.Pipeline([
    ('scaler', preprocessing.StandardScaler()),
    ('resample', over_sampling.RandomOverSampler()),
    ('clf', tree.DecisionTreeClassifier()),
])

# Disk-cached data preparation.
# NOTE(review): the function body continues beyond this chunk (truncated view).
@memory.cache
def prepare_data():
    dataset = load_rutgers_with_quantiles()
    print('Rutgers loaded ...')
# NOTE(review): truncated tail of a PRR-to-label mapping function — its
# opening branches are outside this chunk. Visible here: PRR <= 0.1 -> 'bad',
# everything in between -> 'interm.'.
elif prr <= 0.1:
    return 'bad'
else:
    return 'interm.'

features = ['rssi', 'rssi_std', 'rssi_avg']

# 10-fold stratified CV; shuffle=True with no random_state → non-reproducible splits.
cv = model_selection.StratifiedKFold(n_splits=10, shuffle=True)

# Alternative classifier kept for quick switching:
#classifier = ('logreg', linear_model.LogisticRegression(solver='lbfgs', multi_class='ovr'))
classifier = ('dtree', tree.DecisionTreeClassifier())

pipe = pipeline.Pipeline([
    ('scaler', preprocessing.StandardScaler()),
    ('resample', over_sampling.RandomOverSampler()),
    classifier,
])

# Disk-cached experiment: zero-pad missing RSSI samples before interpolation.
# NOTE(review): the function body continues beyond this chunk (truncated view).
@memory.cache
def constant(pipe):
    print('*** ZERO PADDING interpolation ***')
    dataset = []
    for df in load_rutgers():
        # Mark unreceived packets' RSSI as missing so the interpolator fills them.
        df.loc[df['received'] == 0, 'rssi'] = np.nan
        dataset.append(df)
    print('Rutgers loaded ...')
    dataset = CustomInterpolation(source='rssi', strategy='constant', constant=0).fit_transform(dataset)
# NOTE(review): truncated tail of a link-quality labelling function — the
# preceding branches are outside this chunk.
else:
    return 'interm.'

# Disk-cached loader; materializes the generator once per cache miss.
@memory.cache
def load_rutgers():
    return list(get_traces())

features = ['rssi', 'rssi_std', 'rssi_avg']

# 10-fold stratified CV; shuffle=True with no random_state → non-reproducible splits.
cv = model_selection.StratifiedKFold(n_splits=10, shuffle=True)

pipe = pipeline.Pipeline([
    ('scaler', preprocessing.StandardScaler()),
    ('resample', over_sampling.RandomOverSampler()),
    #('clf', tree.DecisionTreeClassifier(max_depth=3)),
    # NOTE(review): BUG — 'ovr' is not a valid LogisticRegression solver
    # (it is a multi_class option); this raises ValueError at fit time.
    # Likely intended: solver='lbfgs', multi_class='ovr'.
    ('linear', linear_model.LogisticRegression(solver='ovr')),
])

# Disk-cached experiment over PRR / history window sizes.
# NOTE(review): the function body continues beyond this chunk (truncated view).
@memory.cache
def different_window_sizes(W_PRR, W_HISTORY):
    print(f'*** PRR={W_PRR}, HISTORY={W_HISTORY} ***')
    dataset = load_rutgers()
    print('Rutgers loaded ...')
    # Zero-fill missing RSSI values before feature extraction.
    dataset = CustomInterpolation(source='rssi', strategy='constant', constant=0).fit_transform(dataset)
    print('Interpolation applied ...')
# Union of the four engineered-feature pipelines (defined earlier in the file)
# into a single feature-extraction transformer.
features_pipeline = ppl.make_union(engineered_feature_pipeline1,
                                   engineered_feature_pipeline2,
                                   engineered_feature_pipeline3,
                                   engineered_feature_pipeline4)

# Oversample minority classes; fixed seed for reproducibility.
sampling_pipeline = imbppl.make_pipeline(
    over_sampling.RandomOverSampler(random_state=9565))

# Multinomial L2 logistic regression; lbfgs supports the multinomial objective.
model_pipeline = imbppl.make_pipeline(
    LogisticRegression(multi_class='multinomial', penalty='l2',
                       random_state=9546, solver="lbfgs"))

# Full pipeline: features -> resampling -> classifier.
pipe = imbppl.Pipeline([('prep', features_pipeline),
                        ('sample', sampling_pipeline),
                        ('clf', model_pipeline)])

y = d_in.hand
X = d_in.loc[:, 's1':'c5']  # label-based column slice; produces a copy

# Stratified 80/20 split. NOTE(review): the smallest class ends up with
# < 5 observations in the training set — hence the need for resampling.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, stratify=y, random_state=9565)

# Fit/transform the feature step on its own (outside the full pipeline),
# presumably for inspection of the engineered features — verify with caller.
X_tr_feat = features_pipeline.fit_transform(X_train, y_train)
# Every feature #[x[0] + x[1] for x in it.product(['rssi', 'rssi_avg', 'rssi_std'], ['^-4', '^-3', '^-2', '', '^-1', '^2', '^3', '^4'])] ] @memory.cache def load_rutgers(): return list(get_traces()) cv = model_selection.StratifiedKFold(n_splits=10, shuffle=True) pipe_logreg = pipeline.Pipeline([ ('scaler', preprocessing.StandardScaler()), ('resample', over_sampling.RandomOverSampler()), ('logistic', linear_model.LogisticRegression(solver='lbfgs', max_iter=1e3, multi_class='ovr')), ]) pipe_dtree = pipeline.Pipeline([ ('scaler', preprocessing.StandardScaler()), ('resample', over_sampling.RandomOverSampler()), ('DTree', tree.DecisionTreeClassifier()), ]) @memory.cache def prepare_data(): dataset = load_rutgers() print('Rutgers loaded ...')