def test_fetch_error():
    with raises(ValueError, match='is not a dataset available.'):
        fetch_datasets(filter_data=tuple(['rnd']))
    with raises(ValueError, match='dataset with the ID='):
        fetch_datasets(filter_data=tuple([-1]))
    with raises(ValueError, match='dataset with the ID='):
        fetch_datasets(filter_data=tuple([100]))
    with raises(ValueError, match='value in the tuple'):
        fetch_datasets(filter_data=tuple([1.00]))
def __extract_binarized_imbalanced_datasets():
    for dataset_name, dataset_values in fetch_datasets().items():
        write_dataset_to_csv("./binarized-datasets/" + dataset_name + ".csv",
                             dataset_values)


# if __name__ == '__main__':
#     __labelize_dataset("E:/python-workspace/resampler/binarized-datasets/"
#                        "2_Class_Data_February_Cleaned_with_custom_header.csv",
#                        "E:/python-workspace/resampler/binarized-datasets/custom_ds.csv")
# __extract_binarized_imbalanced_datasets()
def load_dataset(data_name):
    load_data = fetch_datasets(verbose=True)[data_name]
    print(load_data.data.shape)
    print(Counter(load_data.target))
    X = pd.DataFrame(load_data.data)
    y = pd.DataFrame(load_data.target, columns=['Label'])
    return X, y
def load_datasets(dataset, names): if dataset == "wilt": X, y, cl_names = load_wilt() elif dataset == "adult": X, y, cl_names = load_adult() elif dataset == "diabetes": X, y, cl_names = load_diabetes() elif dataset == "phoneme": X, y, cl_names = load_phoneme() elif dataset == "mushroom": X, y, cl_names = load_mushroom() elif dataset == "electricity": X, y, cl_names = load_electricity() elif dataset == "speeddating": X, y, cl_names = load_speed_dating() elif dataset == "credit": X, y, cl_names = load_credit() elif dataset == "eeg_eye": X, y, cl_names = load_eeg_eye() elif dataset == "spam": X, y, cl_names = load_spam() elif dataset == "skin": X, y, cl_names = load_skin() elif dataset == "bank": X, y, cl_names = load_bank() elif dataset == "kdd": X, y, cl_names = load_kdd() elif dataset == "landsatM": X, y, cl_names = load_mat_data(dataset) elif dataset == "musk2": X, y, cl_names = load_mat_data(dataset) elif dataset == "spliceM": X, y, cl_names = load_mat_data(dataset) elif dataset == "semeion_orig": X, y, cl_names = load_mat_data(dataset) elif dataset == "waveformM": X, y, cl_names = load_mat_data(dataset) else: from imblearn import datasets data = datasets.fetch_datasets()[dataset] cl_names = [ "feature_" + str(i) for i in range(0, data['data'].shape[1]) ] X = data['data'] y = data['target'] y[y != 1] = 0 names.add(dataset) output = [] output.append(X.shape[0]) output.append(X.shape[1]) output.append(float(format(len(abs(y[y != 1])) / sum(y[y == 1]), '.2f'))) return output
def get_dataset(dataset): if dataset == "wilt": X, y, cl_names = load_wilt() elif dataset == "adult": X, y, cl_names = load_adult() elif dataset == "diabetes": X, y, cl_names = load_diabetes() elif dataset == "phoneme": X, y, cl_names = load_phoneme() elif dataset == "mushroom": X, y, cl_names = load_mushroom() elif dataset == "electricity": X, y, cl_names = load_electricity() elif dataset == "speeddating": X, y, cl_names = load_speed_dating() elif dataset == "credit": X, y, cl_names = load_credit() elif dataset == "eeg_eye": X, y, cl_names = load_eeg_eye() elif dataset == "spam": X, y, cl_names = load_spam() elif dataset == "skin": X, y, cl_names = load_skin() elif dataset == "bank": X, y, cl_names = load_bank() elif dataset == "kdd": X, y, cl_names = load_kdd() elif dataset == "landsatM": X, y, cl_names = load_mat_data(dataset) elif dataset == "musk2": X, y, cl_names = load_mat_data(dataset) elif dataset == "spliceM": X, y, cl_names = load_mat_data(dataset) elif dataset == "semeion_orig": X, y, cl_names = load_mat_data(dataset) elif dataset == "waveformM": X, y, cl_names = load_mat_data(dataset) elif dataset not in ['bloob', 'circle', 'moon']: from imblearn import datasets data = datasets.fetch_datasets()[dataset] cl_names = [ "feature_" + str(i) for i in range(0, data['data'].shape[1]) ] X = data['data'] y = data['target'] y[y != 1] = 0 return X, y, cl_names
def test_documentation_example():
    """Test basic code example shown in documentation"""
    from imblearn.datasets import fetch_datasets
    datasets = fetch_datasets(filter_data=['oil'])
    X, y = datasets['oil']['data'], datasets['oil']['target']

    labels, counts = np.unique(y, return_counts=True)
    assert counts[0] > counts[1]

    kmeans_smote = KMeansSMOTE(
        kmeans_args={'n_clusters': 100},
        smote_args={'k_neighbors': 10}
    )
    X_resampled, y_resampled = kmeans_smote.fit_sample(X, y)

    labels, counts = np.unique(y_resampled, return_counts=True)
    assert counts[0] == counts[1]
def print_examples():
    ts = fetch_datasets()['thyroid_sick']
    print(ts.data.shape)
    target_classes = sorted(Counter(ts.target).items())
    print(sorted(Counter(ts.target).items()))
    ds = load_ds('../datasets/binarized-datasets/thyroid_sic_2.data')

    labels = ['Target classes']
    healthy, sick = ([len(list(filter(lambda x: x[-1] == 0, ds)))],
                     [len(list(filter(lambda x: x[-1] == 1, ds)))])
    # healthy = [target_classes[0][1]]
    # sick = [target_classes[1][1]]

    x = np.arange(len(labels))  # the label locations
    width = 0.35  # the width of the bars

    fig, ax = plt.subplots()
    rects1 = ax.bar(x - width / 2, healthy, width, label='Healthy')
    rects2 = ax.bar(x + width / 2, sick, width, label='Sick')

    # Add some text for labels, title and custom x-axis tick labels, etc.
    ax.set_ylabel('Number of samples')
    ax.set_xticks(x)
    ax.set_xticklabels(labels)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')
    ax.legend()

    def autolabel(rects):
        """Attach a text label above each bar in *rects*, displaying its height."""
        for rect in rects:
            height = rect.get_height()
            ax.annotate('{}'.format(height),
                        xy=(rect.get_x() + rect.get_width() / 2, height),
                        xytext=(0, 3),  # 3 points vertical offset
                        textcoords="offset points",
                        ha='center', va='bottom')

    autolabel(rects1)
    autolabel(rects2)
    fig.tight_layout()
    plt.show()
def trial(name, sampling_strategy, k_neighbors, n_jobs):
    setup = f'''
    from imblearn.datasets import fetch_datasets
    from imblearn.over_sampling import SMOTE

    sampling_strategy = '{sampling_strategy}'
    k_neighbors = {k_neighbors}
    n_jobs = {n_jobs}

    dataset = fetch_datasets()['{name}']
    X, y = dataset.data, dataset.target
    smote = SMOTE(sampling_strategy, k_neighbors=k_neighbors, n_jobs=n_jobs, random_state=0)
    '''
    setup = textwrap.dedent(setup).strip()
    t = timeit('smote.fit_resample(X, y)', setup=setup, number=100)

    dataset = fetch_datasets()[name]
    X, y = dataset.data, dataset.target
    smote = SMOTE(sampling_strategy, k_neighbors=k_neighbors, n_jobs=n_jobs,
                  random_state=0)
    X_resampled, y_resampled = smote.fit_resample(X, y)
    idx = -len(X)
    X_new, y_new = X_resampled[idx:], y_resampled[idx:]
    return X_new, y_new, t
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import geometric_mean_score
from matplotlib import pyplot as plt
import seaborn as sns
from seaborn import scatterplot
from numpy import where
from collections import Counter
import numpy as np

# get_ipython().run_line_magic('matplotlib', 'inline')

st.title("Skripsiku")

name = 'pen_digits'
dataset = fetch_datasets()[name]
X = dataset.data
y = dataset.target
df = pd.concat([pd.DataFrame(X), pd.DataFrame(y)], axis=1)

st.subheader('Dataset Name :')
st.write(name)
st.write(df)

cv = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
for train_index, test_index in cv.split(X, y):
    # st.write("Train: \n", train_index, "\nValidation:\n", test_index)
    X_train, X_test = X[train_index], X[test_index]
def fetch(*args, **kwargs):
    return fetch_datasets(*args, download_if_missing=True, **kwargs)
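# Hedged usage sketch (hypothetical call, not from the original source): the wrapper above only
# forces download_if_missing=True; other fetch_datasets keywords such as filter_data still pass
# through **kwargs.
from collections import Counter

oil = fetch(filter_data=['oil'])['oil']
print(oil.data.shape, Counter(oil.target))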
def templet(sampler_name, sample_ratio): """ 模板方法 :param sampler_name: 采样算法名 :param sample_ratio: 采样比例 :return: """ dataset = fetch_datasets()['satimage'] X = dataset.data y = dataset.target # 起始时间 start_time = time.time() cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True) # cv = RepeatedStratifiedKFold(n_repeats=5, n_splits=10, random_state=42) for train, test in cv.split(X, y): # 预处理 scaler = preprocessing.MinMaxScaler().fit(X[train]) X_train_minmax = scaler.transform(X[train]) X_test_minmax = scaler.transform(X[test]) sb = None if sampler_name == 'CART': sb = DummySampler() elif sampler_name == 'SMOTE': sb = SMOTE(N=sample_ratio, k_neighbors=5, random_state=42) elif sampler_name == 'Border1': sb = BorderSMOTE(N=sample_ratio, m_neighbors=9, k_neighbors=5, random_state=42, kind='borderline1') elif sampler_name == 'Border2': sb = BorderSMOTE(N=sample_ratio, m_neighbors=9, k_neighbors=5, random_state=42, kind='borderline2') elif sampler_name == 'ADASYN': sb = ADASYN(bata=sample_ratio, k_neighbors=5, random_state=42) elif sampler_name == 'Safe-level': sb = SafeLevelSMOTE(N=sample_ratio, k_neighbors=5, random_state=42) else: pass X_res, y_res = sb.fit_sample(X_train_minmax, y[train]) # 采样 model = tree.DecisionTreeClassifier(max_depth=8, min_samples_split=10, random_state=42) model.fit(X_res, y_res) predict = model.predict(X_test_minmax) probability = model.predict_proba(X_test_minmax)[:, 1] precision = metrics.precision_score(y[test], predict) recall = metrics.recall_score(y[test], predict) if precision == 0: f1 = 0 else: f1 = 2 * (precision * recall) / (precision + recall) auc = metrics.roc_auc_score(y[test], probability) gmean = geometric_mean_score(y[test], predict) # write2dic fill_dic('precision', sampler_name, sample_ratio, precision) fill_dic('recall', sampler_name, sample_ratio, recall) fill_dic('f1', sampler_name, sample_ratio, f1) fill_dic('auc', sampler_name, sample_ratio, auc) fill_dic('gmean', sampler_name, sample_ratio, gmean) print('%s %.1f building id transforming took %fs!' % (sampler_name, sample_ratio, time.time() - start_time))
        ax.text(j, i, format(cm[i, j], fmt),
                horizontalalignment="center",
                color="white" if cm[i, j] > thresh else "black")

    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')


###############################################################################
# Load an imbalanced dataset
###############################################################################
# We will load the UCI SatImage dataset which has an imbalanced ratio of 9.3:1
# (number of majority samples for each minority sample). The data are then
# split into training and testing.

satimage = fetch_datasets()['satimage']
X, y = satimage.data, satimage.target
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    random_state=0)

###############################################################################
# Classification using a single decision tree
###############################################################################
# We train a decision tree classifier which will be used as a baseline for the
# rest of this example.

###############################################################################
# The results are reported in terms of balanced accuracy and geometric mean,
# which are metrics widely used in the literature to validate models trained
# on an imbalanced set.
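# Hedged illustration (not part of the example above): the two metrics mentioned in the comment
# can be computed as follows once a classifier has produced predictions; y_pred is assumed to
# come from the baseline decision tree described above.
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import geometric_mean_score

print("Balanced accuracy: {:.3f}".format(balanced_accuracy_score(y_test, y_pred)))
print("Geometric mean: {:.3f}".format(geometric_mean_score(y_test, y_pred)))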
    print(f"Total train time: {clf_copy._total_train_time}")
    print(f"Total fit time: {clf_copy._fit_time}")
    print(f"Total iterations: {clf_copy._iter_count}")
    print(f"Final Sampling Probability: {clf_copy._sampling_proba}")

    # Save results
    try:
        results_df = pd.concat(
            [pd.read_csv(output_path, index_col=0), pd.DataFrame(results)])
    except FileNotFoundError:
        results_df = pd.DataFrame(results)
    results_df.to_csv(output_path)


if __name__ == "__main__":
    database = fetch_datasets()
    for dataset_name in DATASETS:
        for estimator_ in ESTIMATORS:
            for cost_scaling_ in COST_SCALINGS:
                try:
                    run_experiment(dataset=database[dataset_name],
                                   estimator=estimator_,
                                   cost_scaling=cost_scaling_,
                                   output_path=OUTPUT_PATH,
                                   verbose=True)
                    print(
                        f"Completed experiment on {dataset_name} dataset with {type(estimator_)} model "
                        + f"and cost scaling {cost_scaling_}")
                except BlockingIOError:
                    print(
                        f"Error running experiment on {dataset_name} dataset with {type(estimator_)} model "
cut_perc = 0.1
cut_intervals = (512, 256, 128, 64)
continuation = True

# NIAs to be used in the experiment
evos = [
    GreyWolfOptimizer, SelfAdaptiveDifferentialEvolution, GeneticAlgorithm,
    EvolutionStrategyMpL, ParticleSwarmAlgorithm
]

# Datasets
dataset_names = [
    'libras_move', 'spectrometer', 'optical_digits', 'oil', 'ozone_level',
    'arrhythmia', 'us_crime', 'yeast_ml8'
]
datasets = fetch_datasets(verbose=True)

with open('./results/results_all5.csv', 'w') as f:
    print(
        'Algorithm,Dataset,Fold,Accuracy,Fscore,TrainingTime,NoFeatures,Solution'
    )
    print(
        'Algorithm,Dataset,Fold,Accuracy,Fscore,TrainingTime,NoFeatures,Solution',
        file=f)
    # For each dataset
    for dataset_name in dataset_names:
        dataset = datasets[dataset_name]
        scaler = MinMaxScaler()
        # Scale it
        dataset.data = scaler.fit_transform(dataset.data)
        skf = StratifiedKFold(n_splits=10,
def model(boosting_name, data_name, classifier_name, cv_name, mode): """ 模板方法 :param boosting_name: 集成学习的方法 :param data_name: 数据集名称 :param classifier_name: 使用的基分类器 :param cv_name: 交叉验证模式 :param mode: 采样模式 :return: """ # 加载数据 if data_name in fetch_datasets().keys(): dataset = fetch_datasets()[data_name] X = dataset.data y = dataset.target print(Counter(y)) else: # 加载自定义数据 df = pd.read_csv('../imbalanced_data/%s.csv' % data_name, header=None) array = df.values.astype(float) X = array[:, 0:array.shape[1] - 1] y = array[:, -1] print(Counter(y)) base = None if classifier_name == 'CART': base = tree.DecisionTreeClassifier(max_depth=8, random_state=42, min_samples_split=10) elif classifier_name == 'svm': base = svm.SVC() else: pass # 起始时间 start_time = time.time() cv = None if cv_name == 'StratifiedKFold': cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True) elif cv_name == 'RepeatedStratifiedKFold': cv = RepeatedStratifiedKFold(n_repeats=10, n_splits=10, random_state=42) else: pass mean_tpr = 0.0 mean_fpr = np.linspace(0, 1, 100) # 插值点(保证每一折的fpr和tpr相同) aucs = [] for train, test in cv.split(X, y): # 预处理 scaler = preprocessing.MinMaxScaler().fit(X[train]) X_train_minmax = scaler.transform(X[train]) X_test_minmax = scaler.transform(X[test]) classifier = None if boosting_name == 'CART': classifier = base elif boosting_name == 'Bagging': classifier = BaggingClassifier(base_estimator=base, n_estimators=40) elif boosting_name == 'BalancedBagging': classifier = BalancedBaggingClassifier(base_estimator=base, ratio='auto', replacement=True, random_state=42) elif boosting_name == 'Adaboost': classifier = AdaBoostClassifier(base_estimator=base, n_estimators=40) elif boosting_name == 'Random Forest': classifier = RandomForestClassifier(max_depth=8, min_samples_split=10, n_estimators=40, random_state=42) elif boosting_name == 'EasyEnsemble': model_under(boosting_name, X_train_minmax, y[train], X_test_minmax, y[test]) continue elif boosting_name == 'BalanceCascade': model_under(boosting_name, X_train_minmax, y[train], X_test_minmax, y[test]) continue elif boosting_name == 'SMOTEBoost': classifier = SMOTEBoost(rate=100, n_estimators=40, weak_estimator=base, random_state=42, class_dist=False) elif boosting_name == 'RUSBoost': classifier = RUSBoost(ratio=50, n_estimators=40, weak_estimator=base, random_state=42, class_dist=False) else: pass classifier.fit(X_train_minmax, y[train]) # 采样 predict = classifier.predict(X_test_minmax) probability = classifier.predict_proba(X_test_minmax)[:, 1] # 指标计算 precision = metrics.precision_score(y[test], predict) recall = metrics.recall_score(y[test], predict) if precision == 0: f1 = 0 else: f1 = 2 * (precision * recall) / (precision + recall) auc = metrics.roc_auc_score(y[test], probability) gmean = geometric_mean_score(y[test], predict) accuracy = metrics.accuracy_score(y[test], predict) # -------------step6.计算每一折的ROC曲线和PR曲线上的点 ------------- fpr, tpr, thresholds = metrics.roc_curve(y[test], probability) # 对mean_tpr在mean_fpr处进行插值,通过scipy包调用interp()函数 mean_tpr += interp(mean_fpr, fpr, tpr) mean_tpr[0] = 0.0 # 为什么? 
roc_auc = metrics.auc(fpr, tpr) aucs.append(roc_auc) # write2dic fill_dic('precision', boosting_name, precision) fill_dic('recall', boosting_name, recall) fill_dic('f1', boosting_name, f1) fill_dic('auc', boosting_name, auc) fill_dic('gmean', boosting_name, gmean) if boosting_name != 'EasyEnsemble' and boosting_name != 'BalanceCascade': # 将frp和tpr写入文件 # 在mean_fpr100个点,每个点处插值插值多次取平均 mean_tpr /= cv.get_n_splits() # 坐标最后一个点为(1,1) mean_tpr[-1] = 1.0 # 计算平均AUC值 mean_auc = metrics.auc(mean_fpr, mean_tpr) # 将平均fpr和tpr拼接起来存入文件 filename = './ROC/{data_name}/{mode}/{base_classifier}/{sampler}.csv'. \ format(data_name=data_name, mode=mode, base_classifier=classifier_name, sampler=boosting_name) # 将文件路径分割出来 file_dir = os.path.split(filename)[0] # 判断文件路径是否存在,如果不存在,则创建,此处是创建多级目录 if not os.path.isdir(file_dir): os.makedirs(file_dir) # # 然后再判断文件是否存在,如果不存在,则创建 # if not os.path.exists(filename): # os.system(r'touch %s' % filename) # 将结果拼合起来 all = np.c_[mean_fpr, mean_tpr] np.savetxt(filename, all, delimiter=',', fmt='%f') print('%s building id transforming took %fs!' % (boosting_name, time.time() - start_time))
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
import numpy as np


def print_results(headline, true_value, pred):
    print(headline)
    print("accuracy: {}".format(accuracy_score(true_value, pred)))
    print("precision: {}".format(precision_score(true_value, pred)))
    print("recall: {}".format(recall_score(true_value, pred)))
    print("f1: {}".format(f1_score(true_value, pred)))


data = fetch_datasets()['wine_quality']

X_train, X_test, y_train, y_test = train_test_split(data['data'], data['target'],
                                                    random_state=2)

# build normal model
pipeline = make_pipeline(RandomForestClassifier(random_state=42))
model = pipeline.fit(X_train, y_train)
prediction = model.predict(X_test)

# build model with SMOTE
smote_pipeline = make_pipeline_imb(SMOTE(random_state=4),
                                   RandomForestClassifier(random_state=42))
smote_model = smote_pipeline.fit(X_train, y_train)
smote_prediction = smote_model.predict(X_test)
def test(): dic = {'recall': {'CART': [], 'SMOTE': [], 'Border1': [], 'Border2': [], 'ADASYN': [], 'Safe-level': []}, 'precision': {'CART': [], 'SMOTE': [], 'Border1': [], 'Border2': [], 'ADASYN': [], 'Safe-level': []}, 'f1': {'CART': [], 'SMOTE': [], 'Border1': [], 'Border2': [], 'ADASYN': [], 'Safe-level': []}, 'auc': {'CART': [], 'SMOTE': [], 'Border1': [], 'Border2': [], 'ADASYN': [], 'Safe-level': []}, 'gmean': {'CART': [], 'SMOTE': [], 'Border1': [], 'Border2': [], 'ADASYN': [], 'Safe-level': []}} results = prettytable.PrettyTable(["Classifier", "Precision", 'Recall', 'AUC', 'F-measure', 'G-mean']) # 加载数据 dataset = fetch_datasets()['satimage'] X = dataset.data y = dataset.target print(Counter(y)) # 随机种子,保证每次实验结果相同 np.random.seed(42) # -------------------------------------------CART---------------------------------------------------- # 起始时间 start_time = time.time() # 交叉验证CART cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True) # cv = RepeatedStratifiedKFold(n_repeats=5, n_splits=10, random_state=42) for train, test in cv.split(X, y): # initialize CART cart = tree.DecisionTreeClassifier(max_depth=8, min_samples_split=10, random_state=42) # 归一化 scaler = preprocessing.MinMaxScaler().fit(X[train]) X_train_minmax = scaler.transform(X[train]) X_test_minmax = scaler.transform(X[test]) # 训练 cart.fit(X_train_minmax, y[train]) # 测试 predict = cart.predict(X_test_minmax) probability = cart.predict_proba(X_test_minmax) cart_auc = metrics.roc_auc_score(y[test], probability[:, 1]) cart_precision = metrics.precision_score(y[test], predict) cart_recall = metrics.recall_score(y[test], predict) if cart_precision == 0: cart_f1 = 0 else: cart_f1 = 2 * (cart_precision * cart_recall) / (cart_precision + cart_recall) cart_gmean = geometric_mean_score(y[test], predict) dic['precision']['CART'].append(cart_precision) dic['recall']['CART'].append(cart_recall) dic['f1']['CART'].append(cart_f1) dic['auc']['CART'].append(cart_auc) dic['gmean']['CART'].append(cart_gmean) print('CART building id transforming took %fs!' % (time.time() - start_time)) # ---------------------------------------------------SMOTE---------------------------------------------------------- # 起始时间 start_time = time.time() # 交叉验证 cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True) # cv = RepeatedStratifiedKFold(n_repeats=10, n_splits=10, random_state=42) for train, test in cv.split(X, y): # preprocess scaler = preprocessing.MinMaxScaler().fit(X[train]) X_train_minmax = scaler.transform(X[train]) X_test_minmax = scaler.transform(X[test]) # initialize sampler sb = SMOTE(N=100, k_neighbors=5, random_state=42) # sampling X_res, y_res = sb.fit_sample(X_train_minmax, y[train]) # initialize classifier model = tree.DecisionTreeClassifier(max_depth=8, min_samples_split=10, random_state=42) # model = svm.SVC(class_weight={1: 20}) model.fit(X_res, y_res) predict = model.predict(X_test_minmax) probability = model.predict_proba(X_test_minmax)[:, 1] precision = metrics.precision_score(y[test], predict) recall = metrics.recall_score(y[test], predict) if precision == 0: f1 = 0 else: f1 = 2 * (precision * recall) / (precision + recall) auc = metrics.roc_auc_score(y[test], probability) gmean = geometric_mean_score(y[test], predict) dic['precision']['SMOTE'].append(precision) dic['recall']['SMOTE'].append(recall) dic['f1']['SMOTE'].append(f1) dic['auc']['SMOTE'].append(auc) dic['gmean']['SMOTE'].append(gmean) print('SMOTE building id transforming took %fs!' 
% (time.time() - start_time)) # ---------------------------------------------Borderline-SMOTE1---------------------------------------------------- # 起始时间 start_time = time.time() cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True) # cv = RepeatedStratifiedKFold(n_repeats=5, n_splits=10, random_state=42) for train, test in cv.split(X, y): # 预处理 scaler = preprocessing.MinMaxScaler().fit(X[train]) X_train_minmax = scaler.transform(X[train]) X_test_minmax = scaler.transform(X[test]) # 初始化采样器 sb = BorderSMOTE(N=100, m_neighbors=30, k_neighbors=5, random_state=42, kind='borderline1') # 采样 X_res, y_res = sb.fit_sample(X_train_minmax, y[train]) model = tree.DecisionTreeClassifier(max_depth=8, min_samples_split=10, random_state=42) model.fit(X_res, y_res) predict = model.predict(X_test_minmax) probability = model.predict_proba(X_test_minmax)[:, 1] precision = metrics.precision_score(y[test], predict) recall = metrics.recall_score(y[test], predict) if precision == 0: f1 = 0 else: f1 = 2 * (precision * recall) / (precision + recall) auc = metrics.roc_auc_score(y[test], probability) gmean = geometric_mean_score(y[test], predict) dic['precision']['Border1'].append(precision) dic['recall']['Border1'].append(recall) dic['f1']['Border1'].append(f1) dic['auc']['Border1'].append(auc) dic['gmean']['Border1'].append(gmean) print('BorderSmote1 building id transforming took %fs!' % (time.time() - start_time)) # ---------------------------------------------Borderline-SMOTE2---------------------------------------------------- # 起始时间 start_time = time.time() cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True) # cv = RepeatedStratifiedKFold(n_repeats=5, n_splits=10, random_state=42) for train, test in cv.split(X, y): # 预处理 scaler = preprocessing.MinMaxScaler().fit(X[train]) X_train_minmax = scaler.transform(X[train]) X_test_minmax = scaler.transform(X[test]) # 初始化采样器 sb = BorderSMOTE(N=100, m_neighbors=30, k_neighbors=5, random_state=42, kind='borderline2') # 采样 X_res, y_res = sb.fit_sample(X_train_minmax, y[train]) model = tree.DecisionTreeClassifier(max_depth=8, min_samples_split=10, random_state=42) model.fit(X_res, y_res) predict = model.predict(X_test_minmax) probability = model.predict_proba(X_test_minmax)[:, 1] precision = metrics.precision_score(y[test], predict) recall = metrics.recall_score(y[test], predict) if precision == 0: f1 = 0 else: f1 = 2 * (precision * recall) / (precision + recall) auc = metrics.roc_auc_score(y[test], probability) gmean = geometric_mean_score(y[test], predict) dic['precision']['Border2'].append(precision) dic['recall']['Border2'].append(recall) dic['f1']['Border2'].append(f1) dic['auc']['Border2'].append(auc) dic['gmean']['Border2'].append(gmean) print('BorderSmote2 building id transforming took %fs!' 
% (time.time() - start_time)) # ---------------------------------------------ADASYN--------------------------------------------------------------- # 起始时间 start_time = time.time() cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True) # cv = RepeatedStratifiedKFold(n_repeats=5, n_splits=10, random_state=42) for train, test in cv.split(X, y): # 预处理 scaler = preprocessing.MinMaxScaler().fit(X[train]) X_train_minmax = scaler.transform(X[train]) X_test_minmax = scaler.transform(X[test]) # 训练 sb = ADASYN(bata=0.1, k_neighbors=5, random_state=42) # 预测 X_res, y_res = sb.fit_sample(X_train_minmax, y[train]) model = tree.DecisionTreeClassifier(max_depth=8, min_samples_split=10, random_state=42) model.fit(X_res, y_res) predict = model.predict(X_test_minmax) probability = model.predict_proba(X_test_minmax)[:, 1] precision = metrics.precision_score(y[test], predict) recall = metrics.recall_score(y[test], predict) if precision == 0: f1 = 0 else: f1 = 2 * (precision * recall) / (precision + recall) auc = metrics.roc_auc_score(y[test], probability) gmean = geometric_mean_score(y[test], predict) dic['precision']['ADASYN'].append(precision) dic['recall']['ADASYN'].append(recall) dic['f1']['ADASYN'].append(f1) dic['auc']['ADASYN'].append(auc) dic['gmean']['ADASYN'].append(gmean) print('ADASYN building id transforming took %fs!' % (time.time() - start_time)) # ------------------------------------------------Safe-Level-SMOTE---------------------------------------------- cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True) # cv = RepeatedStratifiedKFold(n_repeats=5, n_splits=10, random_state=42) for train, test in cv.split(X, y): # 预处理 scaler = preprocessing.MinMaxScaler().fit(X[train]) X_train_minmax = scaler.transform(X[train]) X_test_minmax = scaler.transform(X[test]) # 训练 sb = SafeLevelSMOTE(N=100, k_neighbors=5, random_state=42) # 预测 X_res, y_res = sb.fit_sample(X_train_minmax, y[train]) model = tree.DecisionTreeClassifier(max_depth=8, min_samples_split=10, random_state=42) model.fit(X_res, y_res) predict = model.predict(X_test_minmax) probability = model.predict_proba(X_test_minmax)[:, 1] precision = metrics.precision_score(y[test], predict) recall = metrics.recall_score(y[test], predict) if precision == 0: f1 = 0 else: f1 = 2 * (precision * recall) / (precision + recall) auc = metrics.roc_auc_score(y[test], probability) gmean = geometric_mean_score(y[test], predict) dic['precision']['Safe-level'].append(precision) dic['recall']['Safe-level'].append(recall) dic['f1']['Safe-level'].append(f1) dic['auc']['Safe-level'].append(auc) dic['gmean']['Safe-level'].append(gmean) print('Safe-level building id transforming took %fs!' 
% (time.time() - start_time)) # display results.add_row(['CART', np.mean(np.array(dic['precision']['CART'])), np.mean(np.array(dic['recall']['CART'])), np.mean(np.array(dic['auc']['CART'])), np.mean(np.array(dic['f1']['CART'])), np.mean(np.array(dic['gmean']['CART']))]) results.add_row(['SMOTE', np.mean(np.array(dic['precision']['SMOTE'])), np.mean(np.array(dic['recall']['SMOTE'])), np.mean(np.array(dic['auc']['SMOTE'])), np.mean(np.array(dic['f1']['SMOTE'])), np.mean(np.array(dic['gmean']['SMOTE']))]) results.add_row(['Border1', np.mean(np.array(dic['precision']['Border1'])), np.mean(np.array(dic['recall']['Border1'])), np.mean(np.array(dic['auc']['Border1'])), np.mean(np.array(dic['f1']['Border1'])), np.mean(np.array(dic['gmean']['Border1']))]) results.add_row(['Border2', np.mean(np.array(dic['precision']['Border2'])), np.mean(np.array(dic['recall']['Border2'])), np.mean(np.array(dic['auc']['Border2'])), np.mean(np.array(dic['f1']['Border2'])), np.mean(np.array(dic['gmean']['Border2']))]) results.add_row(['ADASYN', np.mean(np.array(dic['precision']['ADASYN'])), np.mean(np.array(dic['recall']['ADASYN'])), np.mean(np.array(dic['auc']['ADASYN'])), np.mean(np.array(dic['f1']['ADASYN'])), np.mean(np.array(dic['gmean']['ADASYN']))]) results.add_row(['Safe-level', np.mean(np.array(dic['precision']['Safe-level'])), np.mean(np.array(dic['recall']['Safe-level'])), np.mean(np.array(dic['auc']['Safe-level'])), np.mean(np.array(dic['f1']['Safe-level'])), np.mean(np.array(dic['gmean']['Safe-level']))]) print(results)
def test_fetch_error(filter_data, err_msg):
    with pytest.raises(ValueError, match=err_msg):
        fetch_datasets(filter_data=filter_data)
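# Hedged sketch (assumed, not copied from the original test module): the fragment above is a
# pytest-parametrized test whose decorator is not shown. Reusing the error cases exercised by
# the older test_fetch_error at the top of this section, the full test could look like this.
import pytest
from imblearn.datasets import fetch_datasets


@pytest.mark.parametrize(
    "filter_data, err_msg",
    [
        (('rnd',), 'is not a dataset available.'),
        ((-1,), 'dataset with the ID='),
        ((100,), 'dataset with the ID='),
        ((1.00,), 'value in the tuple'),
    ],
)
def test_fetch_error(filter_data, err_msg):
    with pytest.raises(ValueError, match=err_msg):
        fetch_datasets(filter_data=filter_data)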
from imblearn.datasets import fetch_datasets
from sklearn.model_selection import train_test_split, cross_val_score, KFold
import numpy

datasets = [
    'ecoli', 'optical_digits', 'satimage', 'pen_digits', 'abalone',
    'sick_euthyroid', 'spectrometer', 'car_eval_34', 'isolet', 'us_crime',
    'yeast_ml8', 'scene', 'libras_move', 'thyroid_sick', 'coil_2000',
    'arrhythmia', 'solar_flare_m0', 'oil', 'car_eval_4', 'wine_quality',
    'letter_img', 'yeast_me2', 'webpage', 'ozone_level', 'mammography',
    'protein_homo', 'abalone_19'
]

for dataset in datasets:
    object = fetch_datasets(data_home='./data/')[dataset]
    X, y = object.data, object.target
    train_X, test_X, train_y, test_y = train_test_split(X, y)  # splits 75%/25% by default
    numpy.savez('./data/zendo_stable/' + dataset + '.npz',
                train_X=train_X, test_X=test_X,
                train_y=train_y, test_y=test_y)
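# Hedged companion sketch (not part of the original script): reading one of the saved splits
# back from disk; 'ecoli' is one of the dataset names listed above.
import numpy

splits = numpy.load('./data/zendo_stable/ecoli.npz')
train_X, train_y = splits['train_X'], splits['train_y']
test_X, test_y = splits['test_X'], splits['test_y']
print(train_X.shape, test_X.shape)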
            horizontalalignment="center",
            color="white" if cm[i, j] > thresh else "black",
        )

    ax.set_ylabel("True label")
    ax.set_xlabel("Predicted label")


###############################################################################
# Load an imbalanced dataset
###############################################################################
# We will load the UCI SatImage dataset which has an imbalanced ratio of 9.3:1
# (number of majority samples for each minority sample). The data are then
# split into training and testing.

satimage = fetch_datasets()["satimage"]
X, y = satimage.data, satimage.target
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

###############################################################################
# Classification using a single decision tree
###############################################################################
# We train a decision tree classifier which will be used as a baseline for the
# rest of this example.

###############################################################################
# The results are reported in terms of balanced accuracy and geometric mean,
# which are metrics widely used in the literature to validate models trained on
def run_eval(dataset, base_learners, methods): if dataset == "wilt": X, y, cl_names = load_wilt() elif dataset == "adult": X, y, cl_names = load_adult() elif dataset == "diabetes": X, y, cl_names = load_diabetes() elif dataset == "phoneme": X, y, cl_names = load_phoneme() elif dataset == "mushroom": X, y, cl_names = load_mushroom() elif dataset == "electricity": X, y, cl_names = load_electricity() elif dataset == "speeddating": X, y, cl_names = load_speed_dating() elif dataset == "credit": X, y, cl_names = load_credit() elif dataset == "eeg_eye": X, y, cl_names = load_eeg_eye() elif dataset == "spam": X, y, cl_names = load_spam() elif dataset == "skin": X, y, cl_names = load_skin() elif dataset == "bank": X, y, cl_names = load_bank() elif dataset == "kdd": X, y, cl_names = load_kdd() elif dataset == "landsatM": X, y, cl_names = load_mat_data(dataset) elif dataset == "musk2": X, y, cl_names = load_mat_data(dataset) elif dataset == "spliceM": X, y, cl_names = load_mat_data(dataset) elif dataset == "semeion_orig": X, y, cl_names = load_mat_data(dataset) elif dataset == "waveformM": X, y, cl_names = load_mat_data(dataset) else: from imblearn import datasets data = datasets.fetch_datasets()[dataset] cl_names = [ "feature_" + str(i) for i in range(0, data['data'].shape[1]) ] X = data['data'] y = data['target'] y[y != 1] = 0 processes = [] for method in methods: p = Process(target=train_classifier, args=(X, y, base_learners, method, cl_names)) # Passing the list p.start() processes.append(p) for p in processes: p.join() N = len(methods) ind = numpy.arange(N) # the x locations for the groups width = 0.35 # the width of the bars: can also be len(x) sequence raw_data = dict() for method in methods: with open('temp_features/' + method, 'rb') as filehandle: # read the data as binary data stream model = pickle.load(filehandle) # print (method, model.feature_importances_) raw_data[method] = model.feature_importances_ f_num = len(model.feature_importances_) index = ["Feature " + str(k) for k in range(1, f_num + 1)] # index = ["Atrribute 1","Atrribute 2","Atrribute 3","Atrribute 4","Atrribute 5","Atrribute 6"] df = pd.DataFrame(raw_data, index=index) df = df.transpose() ax = df.plot.bar(stacked=True, alpha=0.75, rot=25) ax.set_ylabel("Feature importance") ax.set_xlabel("Methods") ax.legend(loc='center left', bbox_to_anchor=(0.1, 01.07), ncol=3) # here is the magic ax.figure.savefig('Images/features/' + dataset + '.png', bbox_inches='tight', dpi=200)
    pass


if __name__ == '__main__':
    import time
    import prettytable
    from collections import Counter
    from sklearn import tree
    from sklearn import metrics
    from sklearn import preprocessing
    from imblearn.datasets import fetch_datasets
    from imblearn.metrics import geometric_mean_score
    from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold

    start_time = time.time()
    dataset = fetch_datasets()['oil']
    X = dataset.data
    y = dataset.target
    # print(Counter(y))
    cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
    # cv = RepeatedStratifiedKFold(n_repeats=5, n_splits=10, random_state=42)
    dic = {'recall': [], 'precision': [], 'f1': [], 'auc': [], 'gmean': []}
    results = prettytable.PrettyTable(
        ["Classifier", "Precision", 'Recall', 'F-measure', 'AUC', 'G-mean'])
    for train, test in cv.split(X, y):
        # preprocessing
        scaler = preprocessing.MinMaxScaler().fit(X[train])
        X_train_minmax = scaler.transform(X[train])
        X_test_minmax = scaler.transform(X[test])
        # training
        sb = BorderSMOTE(N=100, m_neighbors=9, k_neighbors=5, random_state=42,
                         kind='borderline1')
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


ozone = fetch_datasets()['ozone_level']
X, y = ozone.data, ozone.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

bagging = BaggingClassifier(random_state=0)
balanced_bagging = BalancedBaggingClassifier(random_state=0)

print('Class distribution of the training set: {}'.format(Counter(y_train)))

bagging.fit(X_train, y_train)
balanced_bagging.fit(X_train, y_train)

print('Class distribution of the test set: {}'.format(Counter(y_test)))

print('Classification results using a bagging classifier on imbalanced data')
y_pred_bagging = bagging.predict(X_test)
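# Hedged continuation sketch (assumed, not taken from the original example): printing the
# imbalanced-learn classification report for both classifiers once predictions are available.
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_test, y_pred_bagging))

print('Classification results using a balanced bagging classifier on imbalanced data')
y_pred_balanced_bagging = balanced_bagging.predict(X_test)
print(classification_report_imbalanced(y_test, y_pred_balanced_bagging))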
def obtain_data(dataset_name):
    dataset = fetch_datasets()[dataset_name]
    return dataset.data, dataset.target
def run_eval(dataset, folds, iterations, baseL, methods): if dataset == "wilt": X, y, cl_names = load_wilt() elif dataset == "adult": X, y, cl_names = load_adult() elif dataset == "diabetes": X, y, cl_names = load_diabetes() elif dataset == "phoneme": X, y, cl_names = load_phoneme() elif dataset == "mushroom": X, y, cl_names = load_mushroom() elif dataset == "electricity": X, y, cl_names = load_electricity() elif dataset == "speeddating": X, y, cl_names = load_speed_dating() elif dataset == "credit": X, y, cl_names = load_credit() elif dataset == "eeg_eye": X, y, cl_names = load_eeg_eye() elif dataset == "spam": X, y, cl_names = load_spam() elif dataset == "skin": X, y, cl_names = load_skin() elif dataset == "bank": X, y, cl_names = load_bank() elif dataset == "kdd": X, y, cl_names = load_kdd() elif dataset == "landsatM": X, y, cl_names = load_mat_data(dataset) elif dataset == "musk2": X, y, cl_names = load_mat_data(dataset) elif dataset == "spliceM": X, y, cl_names = load_mat_data(dataset) elif dataset == "semeion_orig": X, y, cl_names = load_mat_data(dataset) elif dataset == "rain_aus": X, y, cl_names = load_rain_aus() elif dataset == "waveformM": X, y, cl_names = load_mat_data(dataset) else: from imblearn import datasets data = datasets.fetch_datasets()[dataset] cl_names = ["feature_" + str(i) for i in range(0, data['data'].shape[1])] X = data['data'] y = data['target'] y[y != 1] = 0 unique_attr = set([i.split("?")[0] for i in cl_names]) print(dataset + "\t" + str(len(unique_attr)) + "\t" + str(f'{sum(abs(y[y == 1])):,}') + "\t" + str( f'{len(abs(y[y != 1])):,}') + "\t1:" + str(format(len(abs(y[y != 1])) / sum(y[y == 1]), '.2f'))) list_of_dicts = [] list_of_dicts_stats = [] for t_dict in range(0, len(methods)): list_of_dicts.append(defaultdict(dict)) list_of_dicts_stats.append(defaultdict(dict)) for weak_learners in baseL: for item in list_of_dicts: item[weak_learners] = defaultdict(list) for weak_learners in baseL: for item in list_of_dicts_stats: item[weak_learners] = defaultdict(list) for samples in range(0, iterations): sss = StratifiedKFold(n_splits=folds, shuffle=True, random_state=int(time.time())) for weak_learners in baseL: print("iteration=", samples, " weak learners=", weak_learners) # for weak_learners in baseL: for train_index, test_index in sss.split(X, y): X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] processes = [] for method in methods: p = Process(target=train_and_predict, args=(X_train, y_train, X_test, weak_learners, method, cl_names)) p.start() processes.append(p) for p in processes: p.join() for index, method in enumerate(methods): with open('temp_preds/' + method, 'rb') as filehandle: list_of_dicts[index] = update_performance_stats( calculate_performance(y_test, pickle.load(filehandle)), list_of_dicts[index], weak_learners ) with open('temp_preds/stats_' + method, 'rb') as filehandle: list_of_dicts_stats[index] = update_resource_stats(pickle.load(filehandle), list_of_dicts_stats[index], weak_learners, method ) plot_single_dataset(methods, list_of_dicts, "Images/Performance/" + dataset + "/", baseL) plot_resource_stats_time(methods, list_of_dicts_stats, "Images/Performance/" + dataset + "/Resource/", baseL) plot_resource_stats_scores(methods, list_of_dicts_stats, "Images/Performance/" + dataset + "/Resource/", baseL) return list_of_dicts, list_of_dicts_stats
    return X_resampled, y_resampled


if __name__ == '__main__':
    import time
    import prettytable
    from collections import Counter
    from sklearn import tree
    from sklearn import metrics
    from sklearn import preprocessing
    from imblearn.datasets import fetch_datasets
    from imblearn.metrics import geometric_mean_score
    from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold

    start_time = time.time()
    dataset = fetch_datasets()['satimage']
    X = dataset.data
    y = dataset.target
    print(Counter(y))
    cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
    # cv = RepeatedStratifiedKFold(n_repeats=5, n_splits=10, random_state=42)
    dic = {'recall': [], 'precision': [], 'f1': [], 'auc': [], 'gmean': []}
    results = prettytable.PrettyTable(
        ["Classifier", "Precision", 'Recall', 'F-measure', 'AUC', 'G-mean'])
    for train, test in cv.split(X, y):
        # preprocessing
        scaler = preprocessing.MinMaxScaler().fit(X[train])
        X_train_minmax = scaler.transform(X[train])
        X_test_minmax = scaler.transform(X[test])
        # training
def run_eval(dataset, baseL, methods): if dataset == "wilt": X, y, cl_names = load_wilt() elif dataset == "adult": X, y, cl_names = load_adult() elif dataset == "diabetes": X, y, cl_names = load_diabetes() elif dataset == "phoneme": X, y, cl_names = load_phoneme() elif dataset == "mushroom": X, y, cl_names = load_mushroom() elif dataset == "electricity": X, y, cl_names = load_electricity() elif dataset == "speeddating": X, y, cl_names = load_speed_dating() elif dataset == "credit": X, y, cl_names = load_credit() elif dataset == "eeg_eye": X, y, cl_names = load_eeg_eye() elif dataset == "spam": X, y, cl_names = load_spam() elif dataset == "skin": X, y, cl_names = load_skin() elif dataset == "bank": X, y, cl_names = load_bank() elif dataset == "kdd": X, y, cl_names = load_kdd() elif dataset == "landsatM": X, y, cl_names = load_mat_data(dataset) elif dataset == "musk2": X, y, cl_names = load_mat_data(dataset) elif dataset == "spliceM": X, y, cl_names = load_mat_data(dataset) elif dataset == "semeion_orig": X, y, cl_names = load_mat_data(dataset) elif dataset == "waveformM": X, y, cl_names = load_mat_data(dataset) else: from imblearn import datasets data = datasets.fetch_datasets()[dataset] cl_names = ["feature_" + str(i) for i in range(0, data['data'].shape[1])] X = data['data'] y = data['target'] y[y != 1] = 0 print("===============-- " + dataset + " --===============") processes = [] for method in methods: p = Process(target=train_and_predict, args=(X, y, baseL, method)) p.start() processes.append(p) for p in processes: p.join() list_of_dicts = [] for method in methods: with open('temp_preds_adaac/' + method, 'rb') as filehandle: list_of_dicts.append(update_stats(pickle.load(filehandle))) plot_amort_vs_non_amort(methods, list_of_dicts, baseL, "Images/Amort_vs_non_amort/" + dataset + "/") return list_of_dicts
}

for data in dataset:
    print("dataset : ", data)
    '''
    fetch_data = fetch_datasets()[data]
    X = fetch_data.data
    y = fetch_data.target
    normalization_object = Normalizer()
    X = normalization_object.fit_transform(X)
    labelencoder = LabelEncoder()
    y = labelencoder.fit_transform(y)
    '''
    fetch_data = fetch_datasets()[data]
    X = fetch_data.data
    y = fetch_data.target
    Standard_object = StandardScaler()
    X = Standard_object.fit_transform(X)
    labelencoder = LabelEncoder()
    y = labelencoder.fit_transform(y)

    value, counts = np.unique(y, return_counts=True)
    if counts[0] >= counts[1]:
        fraction = int((counts[1] / counts[0]) * 100)
    else:
from sklearn.model_selection import KFold

from imblearn.datasets import fetch_datasets

from photonai.base import Hyperpipe, PipelineElement, OutputSettings
from photonai.optimization import FloatRange, Categorical, IntegerRange

# example of imbalanced dataset
dataset = fetch_datasets()["coil_2000"]
X, y = dataset.data, dataset.target  # ratio class 0: 0.06, class 1: 0.94

my_pipe = Hyperpipe(
    "basic_svm_pipe_no_performance",
    optimizer="random_grid_search",
    optimizer_params={"n_configurations": 10},
    metrics=["accuracy", "precision", "recall"],
    best_config_metric="recall",
    outer_cv=KFold(n_splits=3),
    inner_cv=KFold(n_splits=5),
    verbosity=1,
    output_settings=OutputSettings(project_folder="./tmp/"),
)

# ADD ELEMENTS TO YOUR PIPELINE
my_pipe += PipelineElement("StandardScaler")

my_pipe += PipelineElement(
    "PCA", hyperparameters={"n_components": IntegerRange(5, 20)}, test_disabled=True
)
def run_eval(dataset, base_learners, methods): if dataset == "wilt": X, y, cl_names = load_wilt() elif dataset == "adult": X, y, cl_names = load_adult() elif dataset == "diabetes": X, y, cl_names = load_diabetes() elif dataset == "phoneme": X, y, cl_names = load_phoneme() elif dataset == "mushroom": X, y, cl_names = load_mushroom() elif dataset == "electricity": X, y, cl_names = load_electricity() elif dataset == "speeddating": X, y, cl_names = load_speed_dating() elif dataset == "credit": X, y, cl_names = load_credit() elif dataset == "eeg_eye": X, y, cl_names = load_eeg_eye() elif dataset == "spam": X, y, cl_names = load_spam() elif dataset == "skin": X, y, cl_names = load_skin() elif dataset == "bank": X, y, cl_names = load_bank() elif dataset == "kdd": X, y, cl_names = load_kdd() elif dataset == "landsatM": X, y, cl_names = load_mat_data(dataset) elif dataset == "musk2": X, y, cl_names = load_mat_data(dataset) elif dataset == "spliceM": X, y, cl_names = load_mat_data(dataset) elif dataset == "semeion_orig": X, y, cl_names = load_mat_data(dataset) elif dataset == "waveformM": X, y, cl_names = load_mat_data(dataset) else: from imblearn import datasets data = datasets.fetch_datasets()[dataset] cl_names = ["feature_" + str(i) for i in range(0, data['data'].shape[1])] X = data['data'] y = data['target'] y[y != 1] = 0 list_of_scores = [] processes = [] for method in methods: p = Process(target=train_classifier, args=(X, y, base_learners, method, cl_names)) # Passing the list p.start() processes.append(p) for p in processes: p.join() for method in methods: with open('temp/' + method, 'rb') as filehandle: # read the data as binary data stream list_of_scores.append(pickle.load(filehandle)) y[y != 1] = -1 for idx in range(0, len(list_of_scores)): list_of_scores[idx] = numpy.array(list_of_scores[idx]) * y overall_confs = [] positive_confs = [] negative_confs = [] for conf in list_of_scores: overall_confs.append(conf) positive_confs.append(conf[y == 1]) negative_confs.append(conf[y == -1]) num_bins = 40 fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(20, 4)) plt.rcParams.update({'font.size': 12}) colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'dimgray', 'peru', 'hotpink', 'tomato'] default_cycler = (cycler(color=colors) + cycler(linestyle=['-', (0, (1, 1)), '--', '-.', (0, (5, 10)), (0, (5, 1)), '-', (0, (1, 1)), '--', '-.', (0, (5, 10))])) ax1.set_prop_cycle(default_cycler) ax2.set_prop_cycle(default_cycler) ax3.set_prop_cycle(default_cycler) ax1.set_title("Positive CDF") ax1.grid(True) ax1.set_xlim(-1, 1) ax2.set_xlim(-1, 1) ax3.set_xlim(-1, 1) colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'dimgray', 'peru', 'hotpink', 'tomato', 'indigo', 'lightskyblue'] output = defaultdict(list) for idx in range(0, len(positive_confs)): pos_conf = positive_confs[idx] counts_positives, bin_edges_positives = numpy.histogram(pos_conf, bins=num_bins, normed=True) cdf_positives = numpy.cumsum(counts_positives) # ax1.plot(bin_edges_positives[:-1], cdf_positives / cdf_positives[-1], label=methods[idx],color=colors[idx]) ax1.plot(bin_edges_positives[:-1], cdf_positives / cdf_positives[-1], label=methods[idx]) output[methods[idx]].append(bin_edges_positives[:-1]) output[methods[idx]].append(cdf_positives) # ax1.legend(loc='best') ax1.set_xlabel("Margin") ax1.set_ylabel("Cumulative Distribution") ax1.axhline(0, color='black') ax1.axvline(0, color='black') ax2.grid(True) ax2.axhline(0, color='black') ax2.axvline(0, color='black') ax2.set_title("Negative CDF") for idx in range(0, len(negative_confs)): if idx 
== 0: ax2.set_ylabel("Cumulative Distribution") ax2.set_xlabel("Margin") neg_conf = negative_confs[idx] counts_negatives, bin_edges_negatives = numpy.histogram(neg_conf, bins=num_bins, normed=True) cdf_negatives = numpy.cumsum(counts_negatives) # ax2.plot(bin_edges_negatives[:-1], cdf_negatives / cdf_negatives[-1], label=methods[idx],color=colors[idx]) ax2.plot(bin_edges_negatives[:-1], cdf_negatives / cdf_negatives[-1], label=methods[idx]) output[methods[idx]].append(bin_edges_negatives[:-1]) output[methods[idx]].append(cdf_negatives) ax3.grid(True) ax3.axhline(0, color='black') ax3.axvline(0, color='black') ax3.set_title("Overall CDF") for idx in range(0, len(negative_confs)): if idx == 0: ax3.set_ylabel("Cumulative Distribution") ax3.set_xlabel("Margin") over_conf = overall_confs[idx] counts_overall, bin_edges_overall = numpy.histogram(over_conf, bins=num_bins, normed=True) cdf_overall = numpy.cumsum(counts_overall) # ax3.plot(bin_edges_overall[:-1], cdf_overall / cdf_overall[-1], label=methods[idx], color=colors[idx]) ax3.plot(bin_edges_overall[:-1], cdf_overall / cdf_overall[-1], label=methods[idx]) output[methods[idx]].append(bin_edges_overall[:-1]) output[methods[idx]].append(cdf_overall) plt.legend(loc='upper center', bbox_to_anchor=(-0.7, 1.305), ncol=5) if not os.path.exists("Images/cdf_plots/" + dataset): os.makedirs("Images/cdf_plots/" + dataset) plt.savefig("Images/cdf_plots/" + dataset + "/cdf_" + str(base_learners) + ".png", bbox_inches='tight', dpi=200) return output
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

random_seed = 3

# ## Preparing the data

# In[2]:

np.random.seed(random_seed)

# In[3]:

libras = imb_datasets.fetch_datasets()['libras_move']
X, y = libras['data'], libras['target']

# In[4]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# ## Fitting a pipeline

# In[5]:

oversampler = sv.MulticlassOversampling(sv.distance_SMOTE())
classifier = KNeighborsClassifier(n_neighbors=5)

# In[6]: