コード例 #1
0
	def _calculate_cv_error(base_clf, best_rate, X, y, is_y_noise, clean_type,
							max_nb_feats, major_oob_label):
		"""Return the mean validation error over stratified CV folds.

		For every fold produced by ``StratifiedKFold`` (with
		``NoiseDetectionEnsemble.k_folds`` shuffled splits), the training
		rows are cleaned through ``NoiseDetectionEnsemble._clean_noisy_data``,
		passed to ``DataHelper.adapt_rate`` together with ``best_rate``, and
		used to fit an ``RF`` ensemble built with 501 as its first argument.
		The fold error is whatever ``MetricsHelper.calculate_error_score``
		reports for the held-out rows.

		Args:
			base_clf: Accepted for interface compatibility; not used here.
			best_rate: Rate forwarded to ``DataHelper.adapt_rate``.
			X: Feature rows, indexable through ``DataHelper.select_rows``.
			y: Labels; also drives the stratification of the folds.
			is_y_noise: Per-row noise flags, sliced alongside ``X`` and ``y``.
			clean_type: Cleaning strategy forwarded to ``_clean_noisy_data``.
			max_nb_feats: Accepted for interface compatibility; not used here.
			major_oob_label: Forwarded to ``_clean_noisy_data``.

		Returns:
			``mean`` of the per-fold error scores.
		"""
		splitter = StratifiedKFold(shuffle=True,
								n_splits=NoiseDetectionEnsemble.k_folds)
		fold_errors = []

		# The splitter only needs row positions, not the feature values.
		row_positions = range(len(y))

		for fit_idxs, holdout_idxs in splitter.split(X=row_positions, y=y):
			# Training-side slices of features, labels and noise flags.
			fold_X = DataHelper.select_rows(X, fit_idxs, copy=False)
			fold_y = DataHelper.select_rows(y, fit_idxs, copy=False)
			fold_noise = DataHelper.select_rows(is_y_noise, fit_idxs,
											copy=False)

			cleaned = NoiseDetectionEnsemble._clean_noisy_data(
				fold_X, fold_y, fold_noise, clean_type, major_oob_label)

			# adapt_rate yields three values; the adapted rate itself is
			# not needed for the validation fit.
			fit_X, fit_y, _ = DataHelper.adapt_rate(cleaned[0], cleaned[1],
													best_rate)

			forest = RF(501, n_jobs=-1, max_features="sqrt")
			forest.fit(fit_X, fit_y)

			holdout_X = DataHelper.select_rows(X, holdout_idxs, copy=False)
			holdout_y = DataHelper.select_rows(y, holdout_idxs, copy=False)

			fold_errors.append(
				MetricsHelper.calculate_error_score(holdout_y,
													forest.predict(holdout_X)))

		return mean(fold_errors)
コード例 #2
0
	def choose_algorithm(clf, clean_type, train_X, noisy_train_y,
						noisy_idxs, max_nb_feats):
		"""Select training data and classifier for the given cleaning strategy.

		Three strategies are handled:

		* ``clean_type is None`` -- no cleaning: the input data and ``clf``
		  are returned untouched.
		* ``clean_type == "maj"`` -- the data is filtered with
		  ``MajorityFiltering.run`` and the classifier comes from
		  ``MajorityFiltering.get_ensemble``.
		* anything else -- ``NoiseDetectionEnsemble.run`` supplies the rate,
		  threshold and cleaned data, which is then passed through
		  ``DataHelper.adapt_rate``; the classifier is a fresh ``RF`` with
		  501 estimators.

		Args:
			clf: Classifier returned as-is when no cleaning is requested and
				forwarded to ``NoiseDetectionEnsemble.run`` otherwise.
			clean_type: ``None``, ``"maj"``, or a value understood by
				``NoiseDetectionEnsemble.run``.
			train_X: Training features; must support ``len``.
			noisy_train_y: Training labels (possibly noisy).
			noisy_idxs: Indices known to be noisy, forwarded to
				``MetricsHelper.calculate_true_filter``.
			max_nb_feats: Forwarded to ``NoiseDetectionEnsemble.run``.

		Returns:
			A list ``[rate, threshold, X, y, clf, true_frac, false_frac]``.
			``rate`` and ``threshold`` stay ``nan`` unless the ensemble
			strategy ran. ``true_frac`` and ``false_frac`` are the true and
			false filtered counts divided by ``len(train_X)``.
		"""
		chosen_rate = nan
		chosen_threshold = nan
		true_filtered = 0

		# Identity test: ``== None`` would dispatch to a user-defined __eq__
		# and is the non-idiomatic way to compare against the singleton.
		if clean_type is None:
			chosen_X = train_X
			chosen_y = noisy_train_y
			chosen_clf = clf

		elif clean_type == "maj":
			chosen_X, chosen_y = MajorityFiltering.run(train_X,
													   noisy_train_y)
			chosen_clf = MajorityFiltering.get_ensemble()
			true_filtered = MetricsHelper.calculate_true_filter(chosen_y.index,
															noisy_idxs)
		else:
			algorithm_data = NoiseDetectionEnsemble.run(clf, clean_type,
												train_X,
												noisy_train_y,
												max_nb_feats)
			chosen_rate = algorithm_data[0]
			chosen_threshold = algorithm_data[1]

			# adapt_rate returns three values; only the adapted data is
			# kept, the reported rate stays the one chosen by the ensemble.
			chosen_X, chosen_y, _ = DataHelper.adapt_rate(algorithm_data[2],
														algorithm_data[3],
														chosen_rate)

			chosen_clf = RF(n_estimators=501, max_features="sqrt", n_jobs=-1)

			true_filtered = MetricsHelper.calculate_true_filter(chosen_y.index,
															noisy_idxs)

		# Rows removed = original rows minus the distinct index labels that
		# survive in the chosen data.
		tot_filtered = len(train_X) - len(chosen_X.index.unique())
		false_filtered = tot_filtered - true_filtered

		return [chosen_rate, chosen_threshold, chosen_X, chosen_y, chosen_clf,
				true_filtered/len(train_X), false_filtered/len(train_X)]
コード例 #3
0
	def _first_stage(base_clf, X, y, max_nb_feats):
		"""Find the sampling rate whose ensemble has the lowest OOB error.

		Each rate in ``NoiseDetectionEnsemble.sampling_rates`` is tried in
		order: the data is passed through ``DataHelper.adapt_rate``, an
		ensemble is built with ``NoiseDetectionEnsemble.get_ensemble`` and
		fitted, and its error is taken as ``(1 - oob_score_) * 100``. A
		candidate replaces the current best only when it improves on it by
		more than ``NoiseDetectionEnsemble.EPS``.

		Args:
			base_clf: Forwarded to ``NoiseDetectionEnsemble.get_ensemble``.
			X: Feature data passed to ``DataHelper.adapt_rate``.
			y: Labels passed to ``DataHelper.adapt_rate``.
			max_nb_feats: Forwarded to ``NoiseDetectionEnsemble.get_ensemble``.

		Returns:
			Tuple ``(best_ensemble, ideal_rate, min_error)``. When no
			candidate beats the initial infinite error, the first two
			entries are ``None``.
		"""
		best_ensemble, ideal_rate = None, None
		min_error = float64(INF)

		for rate in NoiseDetectionEnsemble.sampling_rates:
			sampled_X, sampled_y, adapted_rate = DataHelper.adapt_rate(X, y,
																	rate)

			candidate = NoiseDetectionEnsemble.get_ensemble(
				base_clf, True, adapted_rate, max_nb_feats)
			candidate.fit(sampled_X, sampled_y)

			# Out-of-bag error expressed as a percentage.
			oob_error = (1 - candidate.oob_score_) * 100

			# Written as a negated "<" so a NaN error is skipped rather
			# than accepted.
			if not (oob_error < min_error - NoiseDetectionEnsemble.EPS):
				continue

			min_error, ideal_rate, best_ensemble = oob_error, rate, candidate

		return (best_ensemble, ideal_rate, min_error)