def _calculate_cv_error(base_clf, best_rate, X, y, is_y_noise, clean_type, max_nb_feats, major_oob_label):
    """Return the mean validation error over stratified cross-validation folds.

    For every fold the training partition is passed through
    ``NoiseDetectionEnsemble._clean_noisy_data``, re-sampled with
    ``DataHelper.adapt_rate`` using ``best_rate``, and used to fit a fresh
    501-tree ``RF``. The fitted forest is scored on the untouched validation
    partition with ``MetricsHelper.calculate_error_score``.

    Args:
        base_clf: Not used by this function.
        best_rate: Rate forwarded to ``DataHelper.adapt_rate``.
        X: Feature rows; sliced per fold with ``DataHelper.select_rows``.
        y: Labels; also used to stratify the folds.
        is_y_noise: Per-row noise flags, sliced alongside the training rows.
        clean_type: Cleaning strategy forwarded to ``_clean_noisy_data``.
        max_nb_feats: Not used by this function.
        major_oob_label: Forwarded unchanged to ``_clean_noisy_data``.

    Returns:
        ``mean`` of the per-fold error scores.
    """
    def fold_error(fit_idxs, holdout_idxs):
        # Slice the training partition of this fold (features, labels, flags).
        fold_X = DataHelper.select_rows(X, fit_idxs, copy=False)
        fold_y = DataHelper.select_rows(y, fit_idxs, copy=False)
        fold_noise = DataHelper.select_rows(is_y_noise, fit_idxs, copy=False)

        cleaned = NoiseDetectionEnsemble._clean_noisy_data(
            fold_X, fold_y, fold_noise, clean_type, major_oob_label
        )
        # Only the first two items of the cleaning result are consumed here;
        # the adapted rate returned by adapt_rate is not needed.
        fit_X, fit_y, _ = DataHelper.adapt_rate(cleaned[0], cleaned[1], best_rate)

        forest = RF(501, n_jobs=-1, max_features="sqrt")
        forest.fit(fit_X, fit_y)

        # The validation partition is scored as-is (no cleaning, no resampling).
        holdout_X = DataHelper.select_rows(X, holdout_idxs, copy=False)
        holdout_y = DataHelper.select_rows(y, holdout_idxs, copy=False)
        return MetricsHelper.calculate_error_score(holdout_y, forest.predict(holdout_X))

    splitter = StratifiedKFold(n_splits=NoiseDetectionEnsemble.k_folds, shuffle=True)
    # Only positional indices are needed from the splitter, so a plain range
    # stands in for the feature matrix.
    fold_errors = [
        fold_error(fit_idxs, holdout_idxs)
        for fit_idxs, holdout_idxs in splitter.split(X=range(len(y)), y=y)
    ]
    return mean(fold_errors)
def choose_algorithm(clf, clean_type, train_X, noisy_train_y, noisy_idxs, max_nb_feats):
    """Select the training data and classifier for the requested cleaning mode.

    Three modes are handled:

    * ``clean_type is None``: no filtering; the inputs and ``clf`` are
      returned untouched.
    * ``clean_type == "maj"``: ``MajorityFiltering.run`` filters the data and
      ``MajorityFiltering.get_ensemble()`` supplies the classifier.
    * anything else: ``NoiseDetectionEnsemble.run`` picks a rate, threshold
      and cleaned data, which is then re-sampled with
      ``DataHelper.adapt_rate``; the classifier is a fresh 501-tree ``RF``.

    Args:
        clf: Base classifier; returned as-is when no cleaning is requested and
            forwarded to ``NoiseDetectionEnsemble.run`` otherwise.
        clean_type: ``None``, ``"maj"``, or a value understood by
            ``NoiseDetectionEnsemble.run``.
        train_X: Training features; must expose ``.index`` in the result
            (the code calls ``chosen_X.index.unique()``).
        noisy_train_y: Training labels, possibly containing label noise.
        noisy_idxs: Indices passed to ``MetricsHelper.calculate_true_filter``
            together with the index of the retained labels.
        max_nb_feats: Forwarded to ``NoiseDetectionEnsemble.run``.

    Returns:
        A list ``[chosen_rate, chosen_threshold, chosen_X, chosen_y,
        chosen_clf, true_filtered_ratio, false_filtered_ratio]``. Rate and
        threshold are ``nan`` unless the ensemble mode is used. Both ratios
        are divided by ``len(train_X)``.
    """
    chosen_rate = nan
    chosen_threshold = nan
    true_filtered = 0

    # Identity check: ``== None`` can be hijacked by a custom ``__eq__`` or
    # evaluated elementwise by array-like values.
    if clean_type is None:
        chosen_X = train_X
        chosen_y = noisy_train_y
        chosen_clf = clf
    elif clean_type == "maj":
        chosen_X, chosen_y = MajorityFiltering.run(train_X, noisy_train_y)
        chosen_clf = MajorityFiltering.get_ensemble()
        true_filtered = MetricsHelper.calculate_true_filter(chosen_y.index, noisy_idxs)
    else:
        algorithm_data = NoiseDetectionEnsemble.run(
            clf, clean_type, train_X, noisy_train_y, max_nb_feats
        )
        chosen_rate = algorithm_data[0]
        chosen_threshold = algorithm_data[1]
        # The adapted rate returned by adapt_rate is not reported to callers;
        # the rate selected by the ensemble is what goes into the result.
        chosen_X, chosen_y, _ = DataHelper.adapt_rate(
            algorithm_data[2], algorithm_data[3], chosen_rate
        )
        chosen_clf = RF(n_estimators=501, max_features="sqrt", n_jobs=-1)
        true_filtered = MetricsHelper.calculate_true_filter(chosen_y.index, noisy_idxs)

    n_train = len(train_X)
    # Distinct index labels are counted, so repeated labels in the retained
    # data are counted once when computing how many rows were removed.
    tot_filtered = n_train - len(chosen_X.index.unique())
    false_filtered = tot_filtered - true_filtered

    return [
        chosen_rate,
        chosen_threshold,
        chosen_X,
        chosen_y,
        chosen_clf,
        true_filtered / n_train,
        false_filtered / n_train,
    ]
def _first_stage(base_clf, X, y, max_nb_feats):
    """Search the configured sampling rates for the lowest out-of-bag error.

    For each rate in ``NoiseDetectionEnsemble.sampling_rates`` the data is
    re-sampled with ``DataHelper.adapt_rate``, an ensemble is built through
    ``NoiseDetectionEnsemble.get_ensemble`` and fitted, and its error is
    computed as ``(1 - oob_score_) * 100``.

    Args:
        base_clf: Base classifier forwarded to ``get_ensemble``.
        X: Training features.
        y: Training labels.
        max_nb_feats: Forwarded to ``get_ensemble``.

    Returns:
        Tuple ``(best_ensemble, ideal_rate, min_error)``. If no rate beats the
        initial infinite error (for example when there are no rates), the
        first two items are ``None`` and the error is ``float64(INF)``.
    """
    lowest_error = float64(INF)
    winning_rate = None
    winning_ensemble = None

    for candidate_rate in NoiseDetectionEnsemble.sampling_rates:
        resampled_X, resampled_y, effective_rate = DataHelper.adapt_rate(X, y, candidate_rate)

        # NOTE(review): the literal True presumably switches on OOB scoring,
        # since oob_score_ is read below; confirm against get_ensemble.
        candidate = NoiseDetectionEnsemble.get_ensemble(
            base_clf, True, effective_rate, max_nb_feats
        )
        candidate.fit(resampled_X, resampled_y)

        oob_error = (1 - candidate.oob_score_) * 100

        # A candidate must beat the current best by more than EPS to replace
        # it, so near-ties keep the earlier rate.
        if not (oob_error < lowest_error - NoiseDetectionEnsemble.EPS):
            continue

        lowest_error = oob_error
        winning_rate = candidate_rate
        winning_ensemble = candidate

    return (winning_ensemble, winning_rate, lowest_error)