def predefined_ops():
    '''Return dict of user-defined non-default instances of operators.'''
    clean = {
        'clean': Cleaner(dtype_filter='not_datetime', na1='null', na2='mean',
                         drop_uid=True),
        'cleanNA': Cleaner(dtype_filter='not_datetime', na1=None, na2=None),
        'cleanMean': Cleaner(dtype_filter='not_datetime', na1='most_frequent',
                             na2='mean'),
        'cleanMn': Cleaner(dtype_filter='not_datetime', na1='missing',
                           na2='mean'),
    }
    #
    encode = {
        'woe8': WoeEncoder(max_leaf_nodes=8),
        'woe5': WoeEncoder(max_leaf_nodes=5),
        'woeq8': WoeEncoder(q=8),
        'woeq5': WoeEncoder(q=5),
        'woeb5': WoeEncoder(bins=5),
        'woem': WoeEncoder(mono=True),
        'oht': OhtEncoder(),
        'ordi': OrdiEncoder(),
        # 'bin10': BinEncoder(bins=10, int_bins=True),  # 10 bin edges encoder
        # 'bin5': BinEncoder(bins=5, int_bins=True),  # 5 bin edges encoder
        # 'binm10': BinEncoder(max_leaf_nodes=10,
        #                      int_bins=True),  # 10 bin tree cut edges encoder
        # 'binm5': BinEncoder(max_leaf_nodes=5,
        #                     int_bins=True),  # 5 bin tree cut edges encoder
    }

    resample = {
        # over_sampling
        # under sampling controlled methods
        'runder': RandomUnderSampler(),
        'nearmiss': NearMiss(version=3),
        'pcart': InstanceHardnessThreshold(),
        # clean outliers
        'inlierForest': FunctionSampler(
            _outlier_rejection,
            kw_args={'method': 'IsolationForest', 'contamination': 0.1}),
        'inlierLocal': FunctionSampler(
            _outlier_rejection,
            kw_args={'method': 'LocalOutlierFactor', 'contamination': 0.1}),
        'inlierEllip': FunctionSampler(
            _outlier_rejection,
            kw_args={'method': 'EllipticEnvelope', 'contamination': 0.1}),
        'inlierOsvm': FunctionSampler(
            _outlier_rejection,
            kw_args={'method': 'OneClassSVM', 'contamination': 0.1}),
    }

    scale = {
        'stdscale': StandardScaler(),
        'minmax': MinMaxScaler(),
        'absmax': MaxAbsScaler(),
        'rscale': RobustScaler(quantile_range=(10, 90)),
        'quantile': QuantileTransformer(),  # uniform distribution
        'power': PowerTransformer(),  # Gaussian distribution
        'norm': Normalizer(),  # default L2 norm
        # scale sparse data
        'maxabs': MaxAbsScaler(),
        'stdscalesp': StandardScaler(with_mean=False),
    }

    # feature construction
    feature_c = {
        'pca': PCA(whiten=True),
        'spca': SparsePCA(n_jobs=-1),
        'ipca': IncrementalPCA(whiten=True),
        'kpca': KernelPCA(kernel='rbf', n_jobs=-1),
        'poly': PolynomialFeatures(degree=2),
        # kernel approximation
        'Nys': Nystroem(random_state=0),
        'rbf': RBFSampler(random_state=0),
        'rfembedding': RandomTreesEmbedding(n_estimators=10),
        'LDA': LinearDiscriminantAnalysis(),
        'QDA': QuadraticDiscriminantAnalysis(),
    }

    # select from model
    feature_m = {
        'fwoe': SelectFromModel(WoeEncoder(max_leaf_nodes=5)),
        'flog': SelectFromModel(
            LogisticRegression(penalty='l1', solver='saga', C=1e-2)),
        'fsgd': SelectFromModel(SGDClassifier(penalty="l1")),
        'fxgb': SelectFromModel(
            XGBClassifier(n_jobs=-1, booster='gbtree', max_depth=2,
                          n_estimators=50)),
        'frf': SelectFromModel(
            ExtraTreesClassifier(n_estimators=50, max_depth=2)),
        # fixed number of features
        'fxgb20': SelectFromModel(
            XGBClassifier(n_jobs=-1, booster='gbtree'), max_features=20),
        'frf20': SelectFromModel(
            ExtraTreesClassifier(n_estimators=100, max_depth=5),
            max_features=20),
        'frf10': SelectFromModel(
            ExtraTreesClassifier(n_estimators=100, max_depth=5),
            max_features=10),
        'fRFElog': RFE(
            LogisticRegression(penalty='l1', solver='saga', C=1e-2),
            step=0.1),
        'fRFExgb': RFE(XGBClassifier(n_jobs=-1, booster='gbtree'), step=0.1),
    }

    # Univariate feature selection
    feature_u = {
        'fchi2': GenericUnivariateSelect(chi2, 'percentile', 25),
        'fMutualclf': GenericUnivariateSelect(mutual_info_classif,
                                              'percentile', 25),
        'fFclf': GenericUnivariateSelect(f_classif, 'percentile', 25),
    }

    imp = {
        "impXGB": XGBClassifier(n_jobs=-1, booster='gbtree', max_depth=2,
                                n_estimators=50),
        "impRF": ExtraTreesClassifier(n_estimators=100, max_depth=2),
    }

    instances = {}
    instances.update(**clean, **encode, **scale, **feature_c, **feature_m,
                     **feature_u, **resample, **imp)
    return instances
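# Added usage sketch (not from the source): the dict returned by
# predefined_ops() can be queried by key to drop a preconfigured operator
# into a scikit-learn pipeline; 'stdscale' and 'frf20' are keys defined above,
# while demo_pipe is a hypothetical name.
from sklearn.pipeline import Pipeline as SkPipeline

ops = predefined_ops()
demo_pipe = SkPipeline([
    ('scale', ops['stdscale']),   # StandardScaler()
    ('select', ops['frf20']),     # ExtraTrees-based selection, max 20 features
])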
def test_iht_fit_sample_wrong_class_obj():
    from sklearn.cluster import KMeans
    est = KMeans()
    iht = InstanceHardnessThreshold(estimator=est, random_state=RND_SEED)
    assert_raises_regex(ValueError, "Invalid parameter `estimator`",
                        iht.fit_sample, X, Y)
def test_iht_fit_resample():
    iht = InstanceHardnessThreshold(ESTIMATOR, random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_resample(X, Y)
    assert X_resampled.shape == (12, 2)
    assert y_resampled.shape == (12, )
                  sampler.__class__.__name__))
plot_resampling(X, y, sampler, ax[1])
ax[1].set_title('Resampling using {}'.format(sampler.__class__.__name__))
fig.tight_layout()

###############################################################################
# ``InstanceHardnessThreshold`` uses the predictions of a classifier to
# exclude samples: all samples classified with a low probability are removed.

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))
X, y = create_dataset(n_samples=5000, weights=(0.01, 0.05, 0.94),
                      class_sep=0.8)

clf = LinearSVC().fit(X, y)
plot_decision_function(X, y, clf, ax1)
ax1.set_title('Linear SVC with y={}'.format(Counter(y)))
sampler = InstanceHardnessThreshold(
    random_state=0,
    estimator=LogisticRegression(solver='lbfgs', multi_class='auto'))
clf = make_pipeline(sampler, LinearSVC())
clf.fit(X, y)
plot_decision_function(X, y, clf, ax2)
ax2.set_title('Decision function for {}'.format(sampler.__class__.__name__))
plot_resampling(X, y, sampler, ax3)
ax3.set_title('Resampling using {}'.format(sampler.__class__.__name__))
fig.tight_layout()
plt.show()
def test_iht_fit_resample_wrong_class_obj():
    from sklearn.cluster import KMeans
    est = KMeans()
    iht = InstanceHardnessThreshold(estimator=est, random_state=RND_SEED)
    with pytest.raises(ValueError, match="Invalid parameter `estimator`"):
        iht.fit_resample(X, Y)
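# Note (added): the two failing tests above hinge on the estimator exposing
# predict_proba, which a clusterer such as KMeans lacks. A minimal passing
# counterpart, assuming the same X, Y and RND_SEED fixtures (sketch only):
def test_iht_fit_resample_proba_estimator():
    from sklearn.naive_bayes import GaussianNB
    iht = InstanceHardnessThreshold(estimator=GaussianNB(),
                                    random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_resample(X, Y)
    assert len(X_resampled) == len(y_resampled)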
def resampling_assigner(imb_technique, AA_ova_X_train, AA_ova_y_train, AI_ova_X_train, AI_ova_y_train, AW_ova_X_train, AW_ova_y_train, CC_ova_X_train, CC_ova_y_train, QA_ova_X_train, QA_ova_y_train): print(imb_technique) if imb_technique == "ADASYN": AA_ada, AI_ada, AW_ada, CC_ada, QA_ada = ADASYN(), ADASYN(), ADASYN( ), ADASYN(), ADASYN() AA_X_res, AA_y_res = AA_ada.fit_resample(AA_ova_X_train, AA_ova_y_train) AI_X_res, AI_y_res = AI_ada.fit_resample(AI_ova_X_train, AI_ova_y_train) AW_X_res, AW_y_res = AW_ada.fit_resample(AW_ova_X_train, AW_ova_y_train) CC_X_res, CC_y_res = CC_ada.fit_resample(CC_ova_X_train, CC_ova_y_train) QA_X_res, QA_y_res = QA_ada.fit_resample(QA_ova_X_train, QA_ova_y_train) elif imb_technique == "ALLKNN": AA_allknn, AI_allknn, AW_allknn, CC_allknn, QA_allknn = AllKNN( ), AllKNN(), AllKNN(), AllKNN(), AllKNN() AA_X_res, AA_y_res = AA_allknn.fit_resample(AA_ova_X_train, AA_ova_y_train) AI_X_res, AI_y_res = AI_allknn.fit_resample(AI_ova_X_train, AI_ova_y_train) AW_X_res, AW_y_res = AW_allknn.fit_resample(AW_ova_X_train, AW_ova_y_train) CC_X_res, CC_y_res = CC_allknn.fit_resample(CC_ova_X_train, CC_ova_y_train) QA_X_res, QA_y_res = QA_allknn.fit_resample(QA_ova_X_train, QA_ova_y_train) elif imb_technique == "CNN": AA_cnn, AI_cnn, AW_cnn, CC_cnn, QA_cnn = CondensedNearestNeighbour( ), CondensedNearestNeighbour(), CondensedNearestNeighbour( ), CondensedNearestNeighbour(), CondensedNearestNeighbour() AA_X_res, AA_y_res = AA_cnn.fit_resample(AA_ova_X_train, AA_ova_y_train) AI_X_res, AI_y_res = AI_cnn.fit_resample(AI_ova_X_train, AI_ova_y_train) AW_X_res, AW_y_res = AW_cnn.fit_resample(AW_ova_X_train, AW_ova_y_train) CC_X_res, CC_y_res = CC_cnn.fit_resample(CC_ova_X_train, CC_ova_y_train) QA_X_res, QA_y_res = QA_cnn.fit_resample(QA_ova_X_train, QA_ova_y_train) elif imb_technique == "ENN": AA_enn, AI_enn, AW_enn, CC_enn, QA_enn = EditedNearestNeighbours( ), EditedNearestNeighbours(), EditedNearestNeighbours( ), EditedNearestNeighbours(), EditedNearestNeighbours() AA_X_res, AA_y_res = AA_enn.fit_resample(AA_ova_X_train, AA_ova_y_train) AI_X_res, AI_y_res = AI_enn.fit_resample(AI_ova_X_train, AI_ova_y_train) AW_X_res, AW_y_res = AW_enn.fit_resample(AW_ova_X_train, AW_ova_y_train) CC_X_res, CC_y_res = CC_enn.fit_resample(CC_ova_X_train, CC_ova_y_train) QA_X_res, QA_y_res = QA_enn.fit_resample(QA_ova_X_train, QA_ova_y_train) elif imb_technique == "IHT": AA_iht, AI_iht, AW_iht, CC_iht, QA_iht = InstanceHardnessThreshold( ), InstanceHardnessThreshold(), InstanceHardnessThreshold( ), InstanceHardnessThreshold(), InstanceHardnessThreshold() AA_X_res, AA_y_res = AA_iht.fit_resample(AA_ova_X_train, AA_ova_y_train) AI_X_res, AI_y_res = AI_iht.fit_resample(AI_ova_X_train, AI_ova_y_train) AW_X_res, AW_y_res = AW_iht.fit_resample(AW_ova_X_train, AW_ova_y_train) CC_X_res, CC_y_res = CC_iht.fit_resample(CC_ova_X_train, CC_ova_y_train) QA_X_res, QA_y_res = QA_iht.fit_resample(QA_ova_X_train, QA_ova_y_train) elif imb_technique == "NCR": AA_ncr, AI_ncr, AW_ncr, CC_ncr, QA_ncr = NeighbourhoodCleaningRule( ), NeighbourhoodCleaningRule(), NeighbourhoodCleaningRule( ), NeighbourhoodCleaningRule(), NeighbourhoodCleaningRule() AA_ova_y_train = [ 0 if i == "Accepted/Assigned" else 1 for i in AA_ova_y_train ] AA_X_res, AA_y_res = AA_ncr.fit_resample(AA_ova_X_train, AA_ova_y_train) AI_ova_y_train = [ 0 if i == "Accepted/In Progress" else 1 for i in AI_ova_y_train ] AI_X_res, AI_y_res = AI_ncr.fit_resample(AI_ova_X_train, AI_ova_y_train) AW_ova_y_train = [ 0 if i == "Accepted/Wait" else 1 for i in 
AW_ova_y_train ] AW_X_res, AW_y_res = AW_ncr.fit_resample(AW_ova_X_train, AW_ova_y_train) CC_ova_y_train = [ 0 if i == "Completed/Closed" else 1 for i in CC_ova_y_train ] CC_X_res, CC_y_res = CC_ncr.fit_resample(CC_ova_X_train, CC_ova_y_train) QA_ova_y_train = [ 0 if i == "Queued/Awaiting Assignment" else 1 for i in QA_ova_y_train ] QA_X_res, QA_y_res = QA_ncr.fit_resample(QA_ova_X_train, QA_ova_y_train) elif imb_technique == "NM": AA_nm, AI_nm, AW_nm, CC_nm, QA_nm = NearMiss(), NearMiss(), NearMiss( ), NearMiss(), NearMiss() AA_X_res, AA_y_res = AA_nm.fit_resample(AA_ova_X_train, AA_ova_y_train) AI_X_res, AI_y_res = AI_nm.fit_resample(AI_ova_X_train, AI_ova_y_train) AW_X_res, AW_y_res = AW_nm.fit_resample(AW_ova_X_train, AW_ova_y_train) CC_X_res, CC_y_res = CC_nm.fit_resample(CC_ova_X_train, CC_ova_y_train) QA_X_res, QA_y_res = QA_nm.fit_resample(QA_ova_X_train, QA_ova_y_train) elif imb_technique == "OSS": AA_oss, AI_oss, AW_oss, CC_oss, QA_oss = OneSidedSelection( ), OneSidedSelection(), OneSidedSelection(), OneSidedSelection( ), OneSidedSelection() AA_X_res, AA_y_res = AA_oss.fit_resample(AA_ova_X_train, AA_ova_y_train) AI_X_res, AI_y_res = AI_oss.fit_resample(AI_ova_X_train, AI_ova_y_train) AW_X_res, AW_y_res = AW_oss.fit_resample(AW_ova_X_train, AW_ova_y_train) CC_X_res, CC_y_res = CC_oss.fit_resample(CC_ova_X_train, CC_ova_y_train) QA_X_res, QA_y_res = QA_oss.fit_resample(QA_ova_X_train, QA_ova_y_train) elif imb_technique == "RENN": AA_renn, AI_renn, AW_renn, CC_renn, QA_renn = RepeatedEditedNearestNeighbours( ), RepeatedEditedNearestNeighbours(), RepeatedEditedNearestNeighbours( ), RepeatedEditedNearestNeighbours(), RepeatedEditedNearestNeighbours( ) AA_X_res, AA_y_res = AA_renn.fit_resample(AA_ova_X_train, AA_ova_y_train) AI_X_res, AI_y_res = AI_renn.fit_resample(AI_ova_X_train, AI_ova_y_train) AW_X_res, AW_y_res = AW_renn.fit_resample(AW_ova_X_train, AW_ova_y_train) CC_X_res, CC_y_res = CC_renn.fit_resample(CC_ova_X_train, CC_ova_y_train) QA_X_res, QA_y_res = QA_renn.fit_resample(QA_ova_X_train, QA_ova_y_train) elif imb_technique == "SMOTE": AA_sm, AI_sm, AW_sm, CC_sm, QA_sm = SMOTE(), SMOTE(), SMOTE(), SMOTE( ), SMOTE() AA_X_res, AA_y_res = AA_sm.fit_resample(AA_ova_X_train, AA_ova_y_train) AI_X_res, AI_y_res = AI_sm.fit_resample(AI_ova_X_train, AI_ova_y_train) AW_X_res, AW_y_res = AW_sm.fit_resample(AW_ova_X_train, AW_ova_y_train) CC_X_res, CC_y_res = CC_sm.fit_resample(CC_ova_X_train, CC_ova_y_train) QA_X_res, QA_y_res = QA_sm.fit_resample(QA_ova_X_train, QA_ova_y_train) elif imb_technique == "BSMOTE": AA_bsm, AI_bsm, AW_bsm, CC_bsm, QA_bsm = BorderlineSMOTE( ), BorderlineSMOTE(), BorderlineSMOTE(), BorderlineSMOTE( ), BorderlineSMOTE() AA_X_res, AA_y_res = AA_bsm.fit_resample(AA_ova_X_train, AA_ova_y_train) AI_X_res, AI_y_res = AI_bsm.fit_resample(AI_ova_X_train, AI_ova_y_train) AW_X_res, AW_y_res = AW_bsm.fit_resample(AW_ova_X_train, AW_ova_y_train) CC_X_res, CC_y_res = CC_bsm.fit_resample(CC_ova_X_train, CC_ova_y_train) QA_X_res, QA_y_res = QA_bsm.fit_resample(QA_ova_X_train, QA_ova_y_train) elif imb_technique == "SMOTEENN": AA_smenn, AI_smenn, AW_smenn, CC_smenn, QA_smenn = SMOTEENN( ), SMOTEENN(), SMOTEENN(), SMOTEENN(), SMOTEENN() AA_X_res, AA_y_res = AA_smenn.fit_resample(AA_ova_X_train, AA_ova_y_train) AI_X_res, AI_y_res = AI_smenn.fit_resample(AI_ova_X_train, AI_ova_y_train) AW_X_res, AW_y_res = AW_smenn.fit_resample(AW_ova_X_train, AW_ova_y_train) CC_X_res, CC_y_res = CC_smenn.fit_resample(CC_ova_X_train, CC_ova_y_train) QA_X_res, QA_y_res = 
QA_smenn.fit_resample(QA_ova_X_train, QA_ova_y_train) elif imb_technique == "SMOTETOMEK": AA_smtm, AI_smtm, AW_smtm, CC_smtm, QA_smtm = SMOTETomek(), SMOTETomek( ), SMOTETomek(), SMOTETomek(), SMOTETomek() AA_X_res, AA_y_res = AA_smtm.fit_resample(AA_ova_X_train, AA_ova_y_train) AI_X_res, AI_y_res = AI_smtm.fit_resample(AI_ova_X_train, AI_ova_y_train) AW_X_res, AW_y_res = AW_smtm.fit_resample(AW_ova_X_train, AW_ova_y_train) CC_X_res, CC_y_res = CC_smtm.fit_resample(CC_ova_X_train, CC_ova_y_train) QA_X_res, QA_y_res = QA_smtm.fit_resample(QA_ova_X_train, QA_ova_y_train) elif imb_technique == "TOMEK": AA_tm, AI_tm, AW_tm, CC_tm, QA_tm = TomekLinks(), TomekLinks( ), TomekLinks(), TomekLinks(), TomekLinks() AA_X_res, AA_y_res = AA_tm.fit_resample(AA_ova_X_train, AA_ova_y_train) AI_X_res, AI_y_res = AI_tm.fit_resample(AI_ova_X_train, AI_ova_y_train) AW_X_res, AW_y_res = AW_tm.fit_resample(AW_ova_X_train, AW_ova_y_train) CC_X_res, CC_y_res = CC_tm.fit_resample(CC_ova_X_train, CC_ova_y_train) QA_X_res, QA_y_res = QA_tm.fit_resample(QA_ova_X_train, QA_ova_y_train) elif imb_technique == "ROS": AA_ros, AI_ros, AW_ros, CC_ros, QA_ros = RandomOverSampler( ), RandomOverSampler(), RandomOverSampler(), RandomOverSampler( ), RandomOverSampler() AA_X_res, AA_y_res = AA_ros.fit_resample(AA_ova_X_train, AA_ova_y_train) AI_X_res, AI_y_res = AI_ros.fit_resample(AI_ova_X_train, AI_ova_y_train) AW_X_res, AW_y_res = AW_ros.fit_resample(AW_ova_X_train, AW_ova_y_train) CC_X_res, CC_y_res = CC_ros.fit_resample(CC_ova_X_train, CC_ova_y_train) QA_X_res, QA_y_res = QA_ros.fit_resample(QA_ova_X_train, QA_ova_y_train) elif imb_technique == "RUS": AA_rus, AI_rus, AW_rus, CC_rus, QA_rus = RandomUnderSampler( ), RandomUnderSampler(), RandomUnderSampler(), RandomUnderSampler( ), RandomUnderSampler() AA_X_res, AA_y_res = AA_rus.fit_resample(AA_ova_X_train, AA_ova_y_train) AI_X_res, AI_y_res = AI_rus.fit_resample(AI_ova_X_train, AI_ova_y_train) AW_X_res, AW_y_res = AW_rus.fit_resample(AW_ova_X_train, AW_ova_y_train) CC_X_res, CC_y_res = CC_rus.fit_resample(CC_ova_X_train, CC_ova_y_train) QA_X_res, QA_y_res = QA_rus.fit_resample(QA_ova_X_train, QA_ova_y_train) return AA_X_res, AA_y_res, AI_X_res, AI_y_res, AW_X_res, AW_y_res, CC_X_res, CC_y_res, QA_X_res, QA_y_res
def under_sample_InstanceHardnessThreshold(train_inputs, train_targets):
    sampler = InstanceHardnessThreshold(random_state=32)
    train_inputs, train_targets = _sampler_helper(sampler, train_inputs,
                                                  train_targets)
    return train_inputs, train_targets
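# _sampler_helper is defined elsewhere in that project; a minimal sketch of
# what such a helper plausibly does (hypothetical, for illustration only):
def _sampler_helper(sampler, inputs, targets):
    # Delegate to the imblearn sampler and hand back the resampled arrays.
    return sampler.fit_resample(inputs, targets)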
                           n_clusters_per_class=1, random_state=0,
                           weights=[0.65, 0.3, 0.05], n_repeated=0,
                           n_redundant=0)
print('Before sampling: {}'.format(Counter(y).items()))
# under-sampling
sampler = TomekLinks(ratio='auto', random_state=0)
sampler1 = EditedNearestNeighbours(random_state=0)
sampler2 = RepeatedEditedNearestNeighbours(random_state=0, max_iter=500)
sampler3 = AllKNN(random_state=0)
sampler4 = CondensedNearestNeighbour(random_state=0)
sampler5 = OneSidedSelection(random_state=0, n_seeds_S=5)
sampler6 = NeighbourhoodCleaningRule(random_state=0)
sampler7 = InstanceHardnessThreshold(random_state=0, cv=10)
for x in [sampler, sampler1, sampler2, sampler3, sampler4, sampler5,
          sampler6, sampler7]:
    X_new, y_new = x.fit_sample(X, y)
    print('After sampling: {}'.format(Counter(y_new).items()))
    # fit
    y_pred = SVC().fit(X_new, y_new).predict(X)
    print(accuracy_score(y, y_pred))
# performance without resampling
y_pred = SVC().fit(X, y).predict(X)
print('acc without resampling: {}'.format(accuracy_score(y, y_pred)))
axs = [a for ax in axs for a in ax] for ax, sampling_strategy in zip(axs, (0, { 1: 25, 0: 10 }, { 1: 14, 0: 10 }, { 1: 10, 0: 10 })): if sampling_strategy == 0: c0, c1 = plot_resampling(ax, X_vis, y, 'Original set') else: iht = InstanceHardnessThreshold(sampling_strategy=sampling_strategy, estimator=LogisticRegression(), return_indices=True) X_res, y_res, idx_res = iht.fit_resample(X, y) X_res_vis = pca.transform(X_res) plot_resampling( ax, X_res_vis, y_res, 'Instance Hardness Threshold ({})'.format(sampling_strategy)) # plot samples which have been removed idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]), idx_res) c3 = ax.scatter(X_vis[idx_samples_removed, 0], X_vis[idx_samples_removed, 1], alpha=.2, label='Removed samples') plt.figlegend((c0, c1, c3), ('Class #0', 'Class #1', 'Removed samples'), loc='lower center',
def instance_hardness_threshold_optimized():
    return InstanceHardnessThreshold(estimator=GradientBoostingClassifier(),
                                     sampling_strategy='auto',
                                     random_state=0, cv=6, n_jobs=-1)
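# Example use of the factory above (added sketch; X_demo/y_demo are
# illustrative toy data, not from the source):
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=500, weights=[0.9, 0.1],
                                     random_state=0)
iht = instance_hardness_threshold_optimized()
X_res, y_res = iht.fit_resample(X_demo, y_demo)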
pca = PCA(n_components=2) X_vis = pca.fit_transform(X) # Two subplots, unpack the axes array immediately f, axs = plt.subplots(2, 2) axs = [a for ax in axs for a in ax] for ax, ratio in zip(axs, (0, {1: 25, 0: 10}, {1: 14, 0: 10}, {1: 10, 0: 10})): if ratio == 0: c0, c1 = plot_resampling(ax, X_vis, y, 'Original set') else: iht = InstanceHardnessThreshold(ratio=ratio, estimator=LogisticRegression(), return_indices=True) X_res, y_res, idx_res = iht.fit_sample(X, y) X_res_vis = pca.transform(X_res) plot_resampling(ax, X_res_vis, y_res, 'Instance Hardness Threshold ({})'.format(ratio)) # plot samples which have been removed idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]), idx_res) c3 = ax.scatter(X_vis[idx_samples_removed, 0], X_vis[idx_samples_removed, 1], alpha=.2, label='Removed samples') plt.figlegend((c0, c1, c3), ('Class #0', 'Class #1', 'Removed samples'), loc='lower center', ncol=3, labelspacing=0.) plt.tight_layout(pad=3)
def instance_hardness_threshold(X, y):
    iht = InstanceHardnessThreshold(random_state=42)
    X_res, y_res = iht.fit_resample(X, y)
    return X_res, y_res
def resampling_assigner(imb_technique, AP_ova_X_train, AP_ova_y_train,
                        PM_ova_X_train, PM_ova_y_train, SC_ova_X_train,
                        SC_ova_y_train):
    print(imb_technique)
    if imb_technique == "ADASYN":
        AP_ada, PM_ada, SC_ada = ADASYN(), ADASYN(), ADASYN()
        AP_X_res, AP_y_res = AP_ada.fit_resample(AP_ova_X_train, AP_ova_y_train)
        PM_X_res, PM_y_res = PM_ada.fit_resample(PM_ova_X_train, PM_ova_y_train)
        SC_X_res, SC_y_res = SC_ada.fit_resample(SC_ova_X_train, SC_ova_y_train)
    elif imb_technique == "ALLKNN":
        AP_allknn, PM_allknn, SC_allknn = AllKNN(), AllKNN(), AllKNN()
        AP_X_res, AP_y_res = AP_allknn.fit_resample(AP_ova_X_train, AP_ova_y_train)
        PM_X_res, PM_y_res = PM_allknn.fit_resample(PM_ova_X_train, PM_ova_y_train)
        SC_X_res, SC_y_res = SC_allknn.fit_resample(SC_ova_X_train, SC_ova_y_train)
    elif imb_technique == "CNN":
        AP_cnn, PM_cnn, SC_cnn = (CondensedNearestNeighbour(),
                                  CondensedNearestNeighbour(),
                                  CondensedNearestNeighbour())
        AP_X_res, AP_y_res = AP_cnn.fit_resample(AP_ova_X_train, AP_ova_y_train)
        PM_X_res, PM_y_res = PM_cnn.fit_resample(PM_ova_X_train, PM_ova_y_train)
        SC_X_res, SC_y_res = SC_cnn.fit_resample(SC_ova_X_train, SC_ova_y_train)
    elif imb_technique == "ENN":
        AP_enn, PM_enn, SC_enn = (EditedNearestNeighbours(),
                                  EditedNearestNeighbours(),
                                  EditedNearestNeighbours())
        AP_X_res, AP_y_res = AP_enn.fit_resample(AP_ova_X_train, AP_ova_y_train)
        PM_X_res, PM_y_res = PM_enn.fit_resample(PM_ova_X_train, PM_ova_y_train)
        SC_X_res, SC_y_res = SC_enn.fit_resample(SC_ova_X_train, SC_ova_y_train)
    elif imb_technique == "IHT":
        AP_iht, PM_iht, SC_iht = (InstanceHardnessThreshold(),
                                  InstanceHardnessThreshold(),
                                  InstanceHardnessThreshold())
        AP_X_res, AP_y_res = AP_iht.fit_resample(AP_ova_X_train, AP_ova_y_train)
        PM_X_res, PM_y_res = PM_iht.fit_resample(PM_ova_X_train, PM_ova_y_train)
        SC_X_res, SC_y_res = SC_iht.fit_resample(SC_ova_X_train, SC_ova_y_train)
    elif imb_technique == "NCR":
        AP_ncr, PM_ncr, SC_ncr = (NeighbourhoodCleaningRule(),
                                  NeighbourhoodCleaningRule(),
                                  NeighbourhoodCleaningRule())
        # binarize the one-vs-all labels before resampling
        AP_ova_y_train = [0 if i == "Add penalty" else 1
                          for i in AP_ova_y_train]
        AP_X_res, AP_y_res = AP_ncr.fit_resample(AP_ova_X_train, AP_ova_y_train)
        PM_ova_y_train = [0 if i == "Payment" else 1 for i in PM_ova_y_train]
        PM_X_res, PM_y_res = PM_ncr.fit_resample(PM_ova_X_train, PM_ova_y_train)
        SC_ova_y_train = [0 if i == "Send for Credit Collection" else 1
                          for i in SC_ova_y_train]
        SC_X_res, SC_y_res = SC_ncr.fit_resample(SC_ova_X_train, SC_ova_y_train)
    elif imb_technique == "NM":
        AP_nm, PM_nm, SC_nm = NearMiss(), NearMiss(), NearMiss()
        AP_X_res, AP_y_res = AP_nm.fit_resample(AP_ova_X_train, AP_ova_y_train)
        PM_X_res, PM_y_res = PM_nm.fit_resample(PM_ova_X_train, PM_ova_y_train)
        SC_X_res, SC_y_res = SC_nm.fit_resample(SC_ova_X_train, SC_ova_y_train)
    elif imb_technique == "OSS":
        AP_oss, PM_oss, SC_oss = (OneSidedSelection(), OneSidedSelection(),
                                  OneSidedSelection())
        AP_X_res, AP_y_res = AP_oss.fit_resample(AP_ova_X_train, AP_ova_y_train)
        PM_X_res, PM_y_res = PM_oss.fit_resample(PM_ova_X_train, PM_ova_y_train)
        SC_X_res, SC_y_res = SC_oss.fit_resample(SC_ova_X_train, SC_ova_y_train)
    elif imb_technique == "RENN":
        AP_renn, PM_renn, SC_renn = (RepeatedEditedNearestNeighbours(),
                                     RepeatedEditedNearestNeighbours(),
                                     RepeatedEditedNearestNeighbours())
        AP_X_res, AP_y_res = AP_renn.fit_resample(AP_ova_X_train, AP_ova_y_train)
        PM_X_res, PM_y_res = PM_renn.fit_resample(PM_ova_X_train, PM_ova_y_train)
        SC_X_res, SC_y_res = SC_renn.fit_resample(SC_ova_X_train, SC_ova_y_train)
    elif imb_technique == "SMOTE":
        AP_sm, PM_sm, SC_sm = SMOTE(), SMOTE(), SMOTE()
        AP_X_res, AP_y_res = AP_sm.fit_resample(AP_ova_X_train, AP_ova_y_train)
        PM_X_res, PM_y_res = PM_sm.fit_resample(PM_ova_X_train, PM_ova_y_train)
        SC_X_res, SC_y_res = SC_sm.fit_resample(SC_ova_X_train, SC_ova_y_train)
    elif imb_technique == "BSMOTE":
        AP_bsm, PM_bsm, SC_bsm = (BorderlineSMOTE(), BorderlineSMOTE(),
                                  BorderlineSMOTE())
        AP_X_res, AP_y_res = AP_bsm.fit_resample(AP_ova_X_train, AP_ova_y_train)
        PM_X_res, PM_y_res = PM_bsm.fit_resample(PM_ova_X_train, PM_ova_y_train)
        SC_X_res, SC_y_res = SC_bsm.fit_resample(SC_ova_X_train, SC_ova_y_train)
    elif imb_technique == "SMOTEENN":
        AP_smenn, PM_smenn, SC_smenn = SMOTEENN(), SMOTEENN(), SMOTEENN()
        AP_X_res, AP_y_res = AP_smenn.fit_resample(AP_ova_X_train, AP_ova_y_train)
        PM_X_res, PM_y_res = PM_smenn.fit_resample(PM_ova_X_train, PM_ova_y_train)
        SC_X_res, SC_y_res = SC_smenn.fit_resample(SC_ova_X_train, SC_ova_y_train)
    elif imb_technique == "SMOTETOMEK":
        AP_smtm, PM_smtm, SC_smtm = SMOTETomek(), SMOTETomek(), SMOTETomek()
        AP_X_res, AP_y_res = AP_smtm.fit_resample(AP_ova_X_train, AP_ova_y_train)
        PM_X_res, PM_y_res = PM_smtm.fit_resample(PM_ova_X_train, PM_ova_y_train)
        SC_X_res, SC_y_res = SC_smtm.fit_resample(SC_ova_X_train, SC_ova_y_train)
    elif imb_technique == "TOMEK":
        AP_tm, PM_tm, SC_tm = TomekLinks(), TomekLinks(), TomekLinks()
        AP_X_res, AP_y_res = AP_tm.fit_resample(AP_ova_X_train, AP_ova_y_train)
        PM_X_res, PM_y_res = PM_tm.fit_resample(PM_ova_X_train, PM_ova_y_train)
        SC_X_res, SC_y_res = SC_tm.fit_resample(SC_ova_X_train, SC_ova_y_train)
    elif imb_technique == "ROS":
        AP_ros, PM_ros, SC_ros = (RandomOverSampler(), RandomOverSampler(),
                                  RandomOverSampler())
        AP_X_res, AP_y_res = AP_ros.fit_resample(AP_ova_X_train, AP_ova_y_train)
        PM_X_res, PM_y_res = PM_ros.fit_resample(PM_ova_X_train, PM_ova_y_train)
        SC_X_res, SC_y_res = SC_ros.fit_resample(SC_ova_X_train, SC_ova_y_train)
    elif imb_technique == "RUS":
        AP_rus, PM_rus, SC_rus = (RandomUnderSampler(), RandomUnderSampler(),
                                  RandomUnderSampler())
        AP_X_res, AP_y_res = AP_rus.fit_resample(AP_ova_X_train, AP_ova_y_train)
        PM_X_res, PM_y_res = PM_rus.fit_resample(PM_ova_X_train, PM_ova_y_train)
        SC_X_res, SC_y_res = SC_rus.fit_resample(SC_ova_X_train, SC_ova_y_train)
    return AP_X_res, AP_y_res, PM_X_res, PM_y_res, SC_X_res, SC_y_res
def run_basic_svm(X_train, y_train, selected_features, scorers,
                  refit_scorer_name, subset_share=0.1, n_splits=10,
                  parameters=None):
    '''Run an extensive grid search over all parameters to find the best
    parameters for an SVM classifier. The search is done only on a subset of
    the data; the default subset share is 0.1. Input is training and test
    data.'''

    # Create a subset to train on
    print("[Step 1]: Create a data subset")
    subset_min = 300  # Minimal subset is 300 samples.

    if subset_share * X_train.shape[0] < subset_min:
        number_of_samples = subset_min
        print("minimal number of samples used: ", number_of_samples)
    else:
        number_of_samples = subset_share * X_train.shape[0]

    X_train_subset, y_train_subset = modelutil.extract_data_subset(
        X_train, y_train, number_of_samples)
    print("Got subset sizes X train: {} and y train: {}".format(
        X_train_subset.shape, y_train_subset.shape))

    print("[Step 2]: Define test parameters")
    if parameters is None:
        # If no parameters have been defined, then do the full definition.
        # Guides used from
        # https://www.kaggle.com/evanmiller/pipelines-gridsearch-awesome-ml-pipelines
        # Main set of parameters for grid search run 1:
        # select scaler, sampler and kernel for the problem
        test_scaler = [StandardScaler(), RobustScaler(),
                       QuantileTransformer(), Normalizer()]
        test_sampling = [
            modelutil.Nosampler(),
            ClusterCentroids(),
            RandomUnderSampler(),
            NearMiss(version=1),
            EditedNearestNeighbours(),
            AllKNN(),
            CondensedNearestNeighbour(random_state=0),
            InstanceHardnessThreshold(
                random_state=0,
                estimator=LogisticRegression(solver='lbfgs',
                                             multi_class='auto')),
            SMOTE(),
            SMOTEENN(),
            SMOTETomek(),
            ADASYN()
        ]
        test_C = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]

        # gamma default parameters
        param_scale = 1 / (X_train.shape[1] * np.mean(X_train.var()))

        parameters = [
            {
                'scaler': test_scaler,
                'sampling': test_sampling,
                'feat__cols': selected_features,
                'svm__C': test_C,  # default C=1
                'svm__kernel': ['linear', 'sigmoid']
            },
            {
                'scaler': test_scaler,
                'sampling': test_sampling,
                'feat__cols': selected_features,
                'svm__C': test_C,  # default C=1
                'svm__kernel': ['poly'],
                'svm__degree': [2, 3]  # Only relevant for poly
            },
            {
                'scaler': test_scaler,
                'sampling': test_sampling,
                'feat__cols': selected_features,
                'svm__C': test_C,  # default C=1
                'svm__kernel': ['rbf'],
                'svm__gamma': [param_scale, 1e-3, 1e-2, 1e-1, 1e0, 1e1,
                               1e2, 1e3]
                # Only relevant in rbf, default='auto'=1/n_features
            }
        ]

        # If there are missing values, test several imputer strategies;
        # otherwise no imputer variation is necessary.
        if X_train.isna().sum().sum() > 0:
            # parameters is a list of grids, so the imputer options have to
            # be added to each grid rather than to the list itself
            for param_grid in parameters:
                param_grid['imputer__strategy'] = ['mean', 'median',
                                                   'most_frequent']
            print("Missing values used. Test different imputer strategies")
        else:
            print("No missing values. No imputer necessary")
        print("Selected Parameters: ", parameters)
    else:
        print("Parameters defined in the input: ", parameters)

    # Main pipeline for the grid search
    pipe_run1 = Pipeline([
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
        ('scaler', StandardScaler()),
        ('sampling', modelutil.Nosampler()),
        ('feat', modelutil.ColumnExtractor(cols=None)),
        ('svm', SVC())
    ])
    print("Pipeline: ", pipe_run1)
    print("Stratified KFold={} used.".format(n_splits))
    skf = StratifiedKFold(n_splits=n_splits)

    params_run1 = parameters  # params_debug
    grid_search_run1 = GridSearchCV(pipe_run1, params_run1, verbose=1, cv=skf,
                                    scoring=scorers, refit=refit_scorer_name,
                                    return_train_score=True, iid=True,
                                    n_jobs=-1).fit(X_train_subset,
                                                   y_train_subset)

    results_run1 = modelutil.generate_result_table(grid_search_run1,
                                                   params_run1,
                                                   refit_scorer_name)
    print("Result size=", results_run1.shape)
    print("Number of NaN results: {}. Replace them with 0".format(
        np.sum(results_run1['mean_test_' + refit_scorer_name].isna())))

    return grid_search_run1, params_run1, pipe_run1, results_run1
data_X = imp.fit_transform(data_X)

scaler = StandardScaler()
scaler.fit(data_X)
data_X = scaler.transform(data_X)

Xtrain, Xtest, Ytrain, Ytest = train_test_split(data_X, data_Y,
                                                test_size=0.33,
                                                random_state=42)

########################################################################
########################################################################
########################################################################

samplers = [
    NearMiss(version=2, random_state=42),
    CondensedNearestNeighbour(random_state=42),
    EditedNearestNeighbours(random_state=42),
    RepeatedEditedNearestNeighbours(random_state=42),
    AllKNN(random_state=42),
    InstanceHardnessThreshold(random_state=42),
    NeighbourhoodCleaningRule(random_state=42),
    OneSidedSelection(random_state=42),
    RandomUnderSampler(random_state=42),
    TomekLinks(random_state=42)
]
samplers_name = ['Near Miss', 'Condensed Nearest Neighbour',
                 'Edited Nearest Neighbours',
                 'Repeated Edited Nearest Neighbours', 'All KNN',
                 'Instance Hardness Threshold',
                 'Neighbourhood Cleaning Rule', 'One Sided Selection',
                 'Random Under Sampler', 'Tomek Links']

params = {'n_estimators': 10, 'max_depth': 3, 'subsample': 0.5,
          'learning_rate': 0.89, 'min_samples_leaf': 1, 'random_state': 5}
def pipe_main(pipe=None): '''pipeline construction using sklearn estimators, final step support only classifiers currently .. note:: data flows through a pipeline consisting of steps as below: raw data --> clean --> encoding --> scaling --> feature construction --> feature selection --> resampling --> final estimator see scikit-learn preprocess & estimators parameter ---- pipe - str - in the format of 'xx_xx' of which 'xx' means steps in pipeline, default None return ---- 1) pipeline instance of chosen steps 2) if pipe is None, a dict indicating possible choice of 'steps' ''' clean = { 'clean': Split_cls(dtype_filter='not_datetime', na1='null', na2=-999), 'cleanNA': Split_cls(dtype_filter='not_datetime', na1=None, na2=None), 'cleanMean': Split_cls(dtype_filter='not_datetime', na1='most_frequent', na2='mean'), } # encode = { 'woe': Woe_encoder(max_leaf_nodes=5), 'oht': Oht_encoder(), 'ordi': Ordi_encoder(), } resample = { # over_sampling 'rover': RandomOverSampler(), 'smote': SMOTE(), 'bsmote': BorderlineSMOTE(), 'adasyn': ADASYN(), # under sampling controlled methods 'runder': RandomUnderSampler(), 'nearmiss': NearMiss(version=3), 'pcart': InstanceHardnessThreshold(), # under sampling cleaning methods 'tlinks': TomekLinks(n_jobs=-1), 'oside': OneSidedSelection(n_jobs=-1), 'cleanNN': NeighbourhoodCleaningRule(n_jobs=-1), 'enn': EditedNearestNeighbours(n_jobs=-1), 'ann': AllKNN(n_jobs=-1), 'cnn': CondensedNearestNeighbour(n_jobs=-1), # clean outliers 'inlierForest': FunctionSampler(outlier_rejection, kw_args={'method': 'IsolationForest'}), 'inlierLocal': FunctionSampler(outlier_rejection, kw_args={'method': 'LocalOutlierFactor'}), 'inlierEllip': FunctionSampler(outlier_rejection, kw_args={'method': 'EllipticEnvelope'}), 'inlierOsvm': FunctionSampler(outlier_rejection, kw_args={'method': 'OneClassSVM'}), # combine 'smoteenn': SMOTEENN(), 'smotelink': SMOTETomek(), } scale = { 'stdscale': StandardScaler(), 'maxscale': MinMaxScaler(), 'rscale': RobustScaler(quantile_range=(10, 90)), 'qauntile': QuantileTransformer(), # uniform distribution 'power': PowerTransformer(), # Gaussian distribution 'norm': Normalizer(), # default L2 norm # scale sparse data 'maxabs': MaxAbsScaler(), 'stdscalesp': StandardScaler(with_mean=False), } # feature construction feature_c = { 'pca': PCA(whiten=True), 'spca': SparsePCA(normalize_components=True, n_jobs=-1), 'ipca': IncrementalPCA(whiten=True), 'kpca': KernelPCA(kernel='rbf', n_jobs=-1), 'poly': PolynomialFeatures(degree=2), 'rtembedding': RandomTreesEmbedding(n_estimators=10), 'LDA': LinearDiscriminantAnalysis(), 'QDA': QuadraticDiscriminantAnalysis(), } # select from model feature_m = { 'fwoe': SelectFromModel(Woe_encoder(max_leaf_nodes=5)), 'flog': SelectFromModel( LogisticRegressionCV(penalty='l1', solver='saga', scoring='roc_auc')), 'fsgd': SelectFromModel(SGDClassifier(penalty="l1")), 'fsvm': SelectFromModel(LinearSVC('l1', dual=False, C=1e-2)), 'fxgb': SelectFromModel(XGBClassifier(n_jobs=-1)), 'frf': SelectFromModel(ExtraTreesClassifier(n_estimators=100, max_depth=5)), 'fRFExgb': RFE(XGBClassifier(n_jobs=-1), step=0.1, n_features_to_select=20), 'fRFErf': RFE(ExtraTreesClassifier(n_estimators=100, max_depth=5), step=0.3, n_features_to_select=20), 'fRFElog': RFE(LogisticRegressionCV(penalty='l1', solver='saga', scoring='roc_auc'), step=0.3, n_features_to_select=20) } # Univariate feature selection feature_u = { 'fchi2': GenericUnivariateSelect(chi2, 'percentile', 25), 'fMutualclf': GenericUnivariateSelect(mutual_info_classif, 'percentile', 25), 'fFclf': 
GenericUnivariateSelect(f_classif, 'percentile', 25), } # sklearn estimator t = all_estimators(type_filter=['classifier']) estimator = {} for i in t: try: estimator.update({i[0]: i[1]()}) except Exception: continue estimator.update( dummy=DummyClassifier(), XGBClassifier=XGBClassifier(n_jobs=-1), LogisticRegressionCV=LogisticRegressionCV(scoring='roc_auc'), EasyEnsembleClassifier=EasyEnsembleClassifier(), BalancedRandomForestClassifier=BalancedRandomForestClassifier(), RUSBoostClassifier=RUSBoostClassifier(), SVC=SVC(C=0.01, gamma='auto')) if pipe is None: feature_s = {} feature_s.update(**feature_m, **feature_u) return { 'clean': clean.keys(), 'encoding': encode.keys(), 'resample': resample.keys(), 'scale': scale.keys(), 'feature_c': feature_c.keys(), 'feature_s': feature_s.keys(), 'classifier': estimator.keys() } elif isinstance(pipe, str): l = pipe.split('_') all_keys_dict = {} all_keys_dict.update(**clean, **encode, **scale, **feature_c, **feature_m, **feature_u, **estimator, **resample) steps = [] for i in l: if all_keys_dict.get(i) is not None: steps.append((i, all_keys_dict.get(i))) else: raise KeyError( "'{}' invalid key for sklearn estimators".format(i)) return Pipeline(steps) else: raise ValueError("input pipe must be a string in format 'xx[_xx]'")
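# Added usage sketch for pipe_main (the step keys come from the dicts above;
# this particular combination is illustrative, not from the source):
# steps: clean -> WOE encoding -> random under-sampling -> XGBoost classifier
demo_pipe = pipe_main('clean_woe_runder_XGBClassifier')
# Calling pipe_main() with no argument returns the valid keys per stage.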
axs = [a for ax in axs for a in ax] for ax, sampling_strategy in zip(axs, (0, { 1: 25, 0: 10 }, { 1: 14, 0: 10 }, { 1: 10, 0: 10 })): if sampling_strategy == 0: c0, c1 = plot_resampling(ax, X_vis, y, 'Original set') else: iht = InstanceHardnessThreshold( sampling_strategy=sampling_strategy, estimator=LogisticRegression(solver='lbfgs', multi_class='auto'), return_indices=True) X_res, y_res, idx_res = iht.fit_resample(X, y) X_res_vis = pca.transform(X_res) plot_resampling( ax, X_res_vis, y_res, 'Instance Hardness Threshold ({})'.format(sampling_strategy)) # plot samples which have been removed idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]), idx_res) c3 = ax.scatter(X_vis[idx_samples_removed, 0], X_vis[idx_samples_removed, 1], alpha=.2, label='Removed samples') plt.figlegend((c0, c1, c3), ('Class #0', 'Class #1', 'Removed samples'), loc='lower center',
def fscore(params_org): #print(params_org) parambk = copy.deepcopy(params_org) ifError =0 global best, HPOalg,params_best, errorcount params= params_org['classifier'] classifier = params.pop('name') p_random_state = params.pop('random_state') if (classifier == 'SVM'): param_value= params.pop('gamma_value') if(params['gamma'] == "value"): params['gamma'] = param_value else: pass clf = SVC(max_iter = 10000, cache_size= 700, random_state = p_random_state,**params) #max_iter=10000 and cache_size= 700 https://github.com/EpistasisLab/pennai/issues/223 #maxvalue https://github.com/hyperopt/hyperopt-sklearn/blob/fd718c44fc440bd6e2718ec1442b1af58cafcb18/hpsklearn/components.py#L262 elif(classifier == 'RF'): clf = RandomForestClassifier(random_state = p_random_state, **params) elif(classifier == 'KNN'): p_value = params.pop('p') if(p_value==0): params['metric'] = "chebyshev" elif(p_value==1): params['metric'] = "manhattan" elif(p_value==2): params['metric'] = "euclidean" else: params['metric'] = "minkowski" params['p'] = p_value #https://github.com/hyperopt/hyperopt-sklearn/blob/fd718c44fc440bd6e2718ec1442b1af58cafcb18/hpsklearn/components.py#L302 clf = KNeighborsClassifier(**params) elif(classifier == 'DTC'): clf = DecisionTreeClassifier(random_state = p_random_state, **params) elif(classifier == 'LR'): penalty_solver = params.pop('penalty_solver') params['penalty'] = penalty_solver.split("+")[0] params['solver'] = penalty_solver.split("+")[1] clf = LogisticRegression(random_state = p_random_state, **params) #resampling parameter p_sub_params= params_org.pop('sub') p_sub_type = p_sub_params.pop('type') sampler = p_sub_params.pop('smo_grp') gmean = [] if (p_sub_type == 'SMOTE'): smo = SMOTE(**p_sub_params) elif (p_sub_type == 'ADASYN'): smo = ADASYN(**p_sub_params) elif (p_sub_type == 'BorderlineSMOTE'): smo = BorderlineSMOTE(**p_sub_params) elif (p_sub_type == 'SVMSMOTE'): smo = SVMSMOTE(**p_sub_params) elif (p_sub_type == 'SMOTENC'): smo = SMOTENC(**p_sub_params) elif (p_sub_type == 'KMeansSMOTE'): smo = KMeansSMOTE(**p_sub_params) elif (p_sub_type == 'RandomOverSampler'): smo = RandomOverSampler(**p_sub_params) #Undersampling elif (p_sub_type == 'TomekLinks'): smo = TomekLinks(**p_sub_params) elif (p_sub_type == 'ClusterCentroids'): if(p_sub_params['estimator']=='KMeans'): p_sub_params['estimator']= KMeans(random_state = p_random_state) elif(p_sub_params['estimator']=='MiniBatchKMeans'): p_sub_params['estimator']= MiniBatchKMeans(random_state = p_random_state) smo = ClusterCentroids(**p_sub_params) elif (p_sub_type == 'RandomUnderSampler'): smo = RandomUnderSampler(**p_sub_params) elif (p_sub_type == 'NearMiss'): smo = NearMiss(**p_sub_params) elif (p_sub_type == 'InstanceHardnessThreshold'): if(p_sub_params['estimator']=='knn'): p_sub_params['estimator']= KNeighborsClassifier() elif(p_sub_params['estimator']=='decision-tree'): p_sub_params['estimator']=DecisionTreeClassifier() elif(p_sub_params['estimator']=='adaboost'): p_sub_params['estimator']=AdaBoostClassifier() elif(p_sub_params['estimator']=='gradient-boosting'): p_sub_params['estimator']=GradientBoostingClassifier() elif(p_sub_params['estimator']=='linear-svm'): p_sub_params['estimator']=CalibratedClassifierCV(LinearSVC()) elif(p_sub_params['estimator']=='random-forest'): p_sub_params['estimator']=RandomForestClassifier(n_estimators=100) smo = InstanceHardnessThreshold(**p_sub_params) elif (p_sub_type == 'CondensedNearestNeighbour'): smo = CondensedNearestNeighbour(**p_sub_params) elif (p_sub_type == 'EditedNearestNeighbours'): smo = 
EditedNearestNeighbours(**p_sub_params) elif (p_sub_type == 'RepeatedEditedNearestNeighbours'): smo = RepeatedEditedNearestNeighbours(**p_sub_params) elif (p_sub_type == 'AllKNN'): smo = AllKNN(**p_sub_params) elif (p_sub_type == 'NeighbourhoodCleaningRule'): smo = NeighbourhoodCleaningRule(**p_sub_params) elif (p_sub_type == 'OneSidedSelection'): smo = OneSidedSelection(**p_sub_params) #Combine elif (p_sub_type == 'SMOTEENN'): smo = SMOTEENN(**p_sub_params) elif (p_sub_type == 'SMOTETomek'): smo = SMOTETomek(**p_sub_params) e='' try: for train, test in cv.split(X, y): if(p_sub_type=='NO'): X_smo_train, y_smo_train = X[train], y[train] else: X_smo_train, y_smo_train = smo.fit_sample(X[train], y[train]) y_test_pred = clf.fit(X_smo_train, y_smo_train).predict(X[test]) gm = geometric_mean_score(y[test], y_test_pred, average='binary') gmean.append(gm) mean_g=np.mean(gmean) except Exception as eec: e=eec mean_g = 0 ifError =1 errorcount = errorcount+1 gm_loss = 1 - mean_g abc=time.time()-starttime if mean_g > best: best = mean_g params_best = copy.deepcopy(parambk) return {'loss': gm_loss, 'mean': mean_g, 'status': STATUS_OK, # -- store other results like this 'run_time': abc, 'iter': iid, 'current_best': best, 'eval_time': time.time(), 'SamplingGrp': sampler, 'SamplingType': p_sub_type, 'ifError': ifError, 'Error': e, 'params' : parambk, 'attachments': {'time_module': pickle.dumps(time.time)} }
    return fbeta_score(y_true, y_pred, beta=2)

# evaluate a model
def evaluate_model(X, y, model):
    # define evaluation procedure
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    # define the model evaluation metric
    metric = make_scorer(f2_measure)
    # evaluate model
    scores = cross_val_score(model, X, y, scoring=metric, cv=cv, n_jobs=-1)
    return scores

# define the location of the dataset
full_path = 'german.csv'
# load the dataset
X, y, cat_ix, num_ix = load_dataset(full_path)
# define model to evaluate
model = LogisticRegression(solver='liblinear', class_weight='balanced')
# define the data sampling
sampling = InstanceHardnessThreshold()
# one hot encode categorical, normalize numerical
ct = ColumnTransformer([('c', OneHotEncoder(), cat_ix),
                        ('n', MinMaxScaler(), num_ix)])
# scale, then sample, then fit model
pipeline = Pipeline(steps=[('t', ct), ('s', sampling), ('m', model)])
# evaluate the model and store results
scores = evaluate_model(X, y, pipeline)
print('%.3f (%.3f)' % (mean(scores), std(scores)))
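# Note (added): with imblearn's Pipeline the sampling step 's' is applied only
# during fit, so each CV training fold is resampled while the matching test
# fold is scored on the original distribution; that is why the sampler can
# sit safely inside cross_val_score above.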
X4,y4=SVMSMOTE().fit_resample(X,y) X5,y5=KMeansSMOTE().fit_resample(X,y) X6,y6=SMOTEN().fit_resample(X,y) #X7,y7=SMOTENC().fit_resample(X,y) X8,y8=RandomOverSampler().fit_resample(X,y) #Algoritmos de Undersampling X9,y9=RandomUnderSampler().fit_resample(X,y) X10,y10=NearMiss().fit_resample(X,y) X11,y11=EditedNearestNeighbours().fit_resample(X,y) X12,y12=RepeatedEditedNearestNeighbours().fit_resample(X,y) X13,y13=AllKNN().fit_resample(X,y) #X14,y14=CondensedNearestNeighbour().fit_resample(X,y) X15,y15=OneSidedSelection().fit_resample(X,y) X16,y16=NeighbourhoodCleaningRule().fit_resample(X,y) X17,y17=InstanceHardnessThreshold().fit_resample(X,y) #Técnicas combinadas X18,y18=SMOTEENN().fit_resample(X,y) X19,y19=SMOTETomek().fit_resample(X,y) """Exemplo de reamostragem dos dados.""" fig, ax = plt.subplots(1,2, figsize=(20,5)) sns.countplot(x='Churn', data=pd.DataFrame(y), ax=ax[0]) sns.countplot(x='Churn', data = pd.DataFrame(y9), ax=ax[1]); """Separando os dados de treino e de teste.""" X_treino, X_teste, y_treino, y_teste = train_test_split(X,y, random_state=42) X_treino1, X_teste1, y_treino1, y_teste1 = train_test_split(X1,y1, random_state=42)
header=0, na_values='?') dataset.drop(dataset.columns[[26, 27]], axis=1, inplace=True) values = dataset.values X = values[:, 0:33] y = values[:, 33] labelencoder_y = LabelEncoder() y = labelencoder_y.fit_transform(y) imputer = Imputer(strategy='median') X = imputer.fit_transform(X) iht = InstanceHardnessThreshold(random_state=12) X = X.astype(int) y = y.astype(int) X, y = iht.fit_sample(X, y) #print('Amount of each class after under-sampling: {0}'.format(Counter(y))) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=12) logreg = LogisticRegression() logreg.fit(X_train, y_train) logreg.predict(X_test) app = Flask(__name__)
def main(): print("Connecting to ML4H DB..") conn = pymysql.connect(host='nightmare.cs.uct.ac.za', port=3306, user='******', passwd='oesaerex', db='ochomo001') print("Connected") cur = conn.cursor() print("Executing SQL query..") print( "SQL script: select person_id,tot_consultations_attended,tot_consultations_missed, gender, age, city_village, profession, last_consultation_date, last_consultation_attendance,missed_last_appointment from ML4H_consultation_defaulter_sets" ) print("Retrieving all consultation features...") #SQL Query to pull in all consultation features and results from the Dataset cur.execute( "select person_id,tot_consultations_attended,tot_consultations_missed, gender, age, city_village, profession, last_consultation_date, last_consultation_attendance,missed_last_appointment from ML4H_consultation_defaulter_sets" ) print("Executed.") patient_ids = [] patients = {} consultation_features = [] consultation_results = [] occupations = [] locations = [] #["Consultations_Attended", "Consultations_Missed", "Sex", "Age", "Occupation", "Location","Day_of_week","Last_appointment" ] #load in all the consultation features into each patient and the result class for row in cur: if row[0] not in patient_ids: patient_ids.append(row[0]) patients[row[0]] = (Consultation_Patient(int(row[0]))) patients[row[0]].features[0] = int( row[1]) #Consultations Attended feature patients[row[0]].features[1] = int( row[2]) #Consultations Missed features #Patient Sex feature if row[3] == "M": patients[row[0]].features[2] = 0 elif row[3] == "F": patients[row[0]].features[2] = 1 patients[row[0]].features[3] = int(row[4]) #Patient Age feature patients[row[0]].features[5] = get_feature_index( row[5], locations) #Patient location feature patients[row[0]].features[4] = get_feature_index( row[6], occupations) #Patient Occupation feature patients[row[0]].features[6] = row[7].weekday( ) #Next appointment day of the week feature patients[row[0]].features[7] = int( row[8] ) #Whether a patient attended their last appointment feature consultation_features.append( patients[row[0]].features ) #Join above together and add to feataure list consultation_results.append( row[9] ) #Whether patient attended their last consultation RESULT SET cur.close() conn.close() print(len(consultation_results)) print(len(consultation_features)) #Split training set and hold out set X_train1, X_validation1, Y_train1, Y_validation1 = model_selection.train_test_split( consultation_features, consultation_results, test_size=0.3, random_state=7) #Find out balances of the training sets print("Y_train") check_result_distr(Y_train1) print("Y_val") check_result_distr(Y_validation1) # DATA IS IMBALANCED # Trying to balance data appropriately - Using multiple sampler tools to see which is best samplers = [['ALLKNN', AllKNN()], ['NearMiss', NearMiss()], ['CondensedNearestNeighbour', CondensedNearestNeighbour()], ['TomekLinks', TomekLinks()], ['NeighbourhoodCleaningRule', NeighbourhoodCleaningRule()], ['InstanceHardnessThreshold', InstanceHardnessThreshold()], ['RandomUnderSampler', RandomUnderSampler()]] #Write results of AllKNN results(best sampler) to file f1 = open('consultation_technique_comparison.csv', 'w') X_resamp, Y_resamp = samplers[0][1].fit_sample(X_train1, Y_train1) X_resamp_orig, Y_resamp_orig = samplers[0][1].fit_sample( consultation_features, consultation_results) results_final = apply_machine_learning_techniques(X_resamp_orig, Y_resamp_orig, X_resamp, Y_resamp, X_validation1, Y_validation1) f1.write(',Logistic Regression' + ", " 'K 
Neighbours Classifier' + "," + 'Decision Tree Classifier' + "," + 'Gaussian NB' + "," + 'Random Forrest' + "," + 'MLPClassifier' + "," + 'AdaBoostClassifier' + "," + 'Support Vector Machine') f1.write("\n") f1.write("Roc " + results_final[0] + "\n") f1.write("Sensitivity " + results_final[1] + "\n") f1.write("Specificity " + results_final[2] + "\n") f1.write("Unseen Roc " + results_final[3]) f1.close() #Write results of the original balanced dataset f = open('consultation_balance_comparison.csv', 'w') f.write("Sampler,Attended ,Missed ," + 'Logistic Regression' + ", " 'K Neighbours Classifier' + "," + 'Decision Tree Classifier' + "," + 'Gaussian NB' + "," + 'Random Forrest' + "," + 'MLPClassifier' + "," + 'AdaBoostClassifier' + "," + 'Support Vector Machine') f.write("\n") orig_distribution = check_result_distr(Y_train1) orig_results = apply_machine_learning_techniques(X_train1, Y_train1, X_validation1, Y_validation1, X_resamp_orig, Y_resamp_orig)[0] f.write("Orig" + "," + orig_distribution + orig_results + "\n") #Write results of all other sampler balancing technique results for sampler in samplers: print(sampler[0]) X_resamp, Y_resamp = sampler[1].fit_sample(X_train1, Y_train1) distribution = check_result_distr(Y_resamp) results = apply_machine_learning_techniques(X_resamp, Y_resamp, X_validation1, Y_validation1, X_resamp_orig, Y_resamp_orig)[0] f.write(sampler[0] + "," + distribution + results + "\n") f.close()
def Sampling(X, y, method):
    """
    Sample an imbalanced dataset.

    Arguments:
    X -- trainset features
    y -- trainset labels
    method -- sampling method

    Return:
    X_res -- sampled trainset features
    y_res -- sampled trainset labels
    """
    # Under-sampling:
    if method == 'RandomUnderSampler':
        from imblearn.under_sampling import RandomUnderSampler
        us = RandomUnderSampler()
    elif method == 'TomekLinks':
        from imblearn.under_sampling import TomekLinks
        us = TomekLinks()
    elif method == 'OneSidedSelection':
        from imblearn.under_sampling import OneSidedSelection
        us = OneSidedSelection()
    elif method == 'NeighbourhoodCleaningRule':
        from imblearn.under_sampling import NeighbourhoodCleaningRule
        us = NeighbourhoodCleaningRule()
    elif method == 'NearMiss':
        from imblearn.under_sampling import NearMiss
        us = NearMiss()
    elif method == 'InstanceHardnessThreshold':
        from imblearn.under_sampling import InstanceHardnessThreshold
        us = InstanceHardnessThreshold()
    elif method == 'AllKNN':
        from imblearn.under_sampling import AllKNN
        us = AllKNN()
    elif method == 'RepeatedEditedNearestNeighbours':
        from imblearn.under_sampling import RepeatedEditedNearestNeighbours
        us = RepeatedEditedNearestNeighbours()
    elif method == 'EditedNearestNeighbours':
        from imblearn.under_sampling import EditedNearestNeighbours
        us = EditedNearestNeighbours()
    elif method == 'CondensedNearestNeighbour':
        from imblearn.under_sampling import CondensedNearestNeighbour
        us = CondensedNearestNeighbour()
    # Combination of over- and under-sampling:
    elif method == 'SMOTEENN':
        from imblearn.combine import SMOTEENN
        us = SMOTEENN()
    elif method == 'SMOTETomek':
        from imblearn.combine import SMOTETomek
        us = SMOTETomek()
    else:
        # fail explicitly instead of returning undefined results
        raise ValueError('unknown sampling method: {}'.format(method))
    X_res, y_res = us.fit_resample(X, y)
    return X_res, y_res
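# Added usage sketch for Sampling() on a synthetic imbalanced set (the toy
# data and the printout are illustrative, not from the source):
from collections import Counter
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=1000, weights=[0.9, 0.1],
                                     random_state=42)
X_res, y_res = Sampling(X_demo, y_demo, method='InstanceHardnessThreshold')
print(Counter(y_demo), '->', Counter(y_res))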
X_vis[y == 0, 1], label="Class #0", alpha=0.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5, edgecolor=almost_black, facecolor=palette[2], linewidth=0.15) ax.set_title('Original set') else: iht = InstanceHardnessThreshold(ratio=ratio) X_res, y_res = iht.fit_sample(X, y) X_res_vis = pca.transform(X_res) ax.scatter(X_res_vis[y_res == 0, 0], X_res_vis[y_res == 0, 1], label="Class #0", alpha=.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax.scatter(X_res_vis[y_res == 1, 0], X_res_vis[y_res == 1, 1], label="Class #1", alpha=.5, edgecolor=almost_black,
Pick one majority-class sample (the class to be under-sampled) and add it to
set C; put the remaining samples of that class into set S.
Train a 1-NN classifier on set C and use it to classify the samples in S.
Add the misclassified samples from S to C.
Repeat until no more samples are added to C.
'''
from imblearn.under_sampling import CondensedNearestNeighbour
cnn = CondensedNearestNeighbour(random_state=0)
X_resampled, y_resampled = cnn.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))

# Clearly, CondensedNearestNeighbour is sensitive to noise and readily adds
# noisy samples to set C. OneSidedSelection therefore uses the TomekLinks
# method to remove the noisy (majority-class) samples first.
from imblearn.under_sampling import OneSidedSelection
oss = OneSidedSelection(random_state=0)
X_resampled, y_resampled = oss.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))
'''
NeighbourhoodCleaningRule focuses on cleaning the data rather than selecting
(considering) samples. It removes the union of the samples rejected by
EditedNearestNeighbours and those rejected by a 3-NN classifier.
'''
from imblearn.under_sampling import NeighbourhoodCleaningRule
ncr = NeighbourhoodCleaningRule(random_state=0)
X_resampled, y_resampled = ncr.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))

# InstanceHardnessThreshold is a rather special method: it runs a classifier
# on the data and then removes the samples whose predicted probability falls
# below a threshold.
from sklearn.linear_model import LogisticRegression
from imblearn.under_sampling import InstanceHardnessThreshold
iht = InstanceHardnessThreshold(random_state=0,
                                estimator=LogisticRegression())
X_resampled, y_resampled = iht.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))
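# Added sketch of the idea behind InstanceHardnessThreshold (an assumption
# about the mechanism described above, not the library's exact internals):
# estimate each sample's probability for its own class via cross-validation,
# then drop the samples whose probability is lowest. Assumes y holds integer
# labels 0..k-1, as produced by make_classification.
import numpy as np
from sklearn.model_selection import cross_val_predict

proba = cross_val_predict(LogisticRegression(), X, y, cv=5,
                          method='predict_proba')
own_class_proba = proba[np.arange(len(y)), y]
keep = own_class_proba > np.quantile(own_class_proba, 0.2)  # illustrative cut
X_manual, y_manual = X[keep], y[keep]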
from sklearn.linear_model import LogisticRegression, SGDClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.svm import SVC from sklearn.tree import DecisionTreeClassifier from sklearn.neighbors import KNeighborsClassifier imbalances = [ RandomUnderSampler(), TomekLinks(), ClusterCentroids(), NearMiss(version=1), NearMiss(version=2), NearMiss(version=3), CondensedNearestNeighbour(size_ngh=3, n_seeds_S=51), OneSidedSelection(size_ngh=5, n_seeds_S=51), InstanceHardnessThreshold(), RandomOverSampler(ratio='auto'), SMOTE(ratio='auto', kind='regular'), SMOTE(ratio='auto', kind='borderline1'), SMOTE(ratio='auto', kind='borderline2'), SMOTETomek(ratio='auto'), SMOTEENN(ratio='auto') ] classifiers = [ LogisticRegression(), SVC(probability=True), DecisionTreeClassifier(), RandomForestClassifier(), KNeighborsClassifier(n_neighbors=5) ]
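# Added sketch (an assumed evaluation loop, not from the source): pair every
# resampler with every classifier through an imblearn Pipeline so resampling
# happens only inside fit; X and y are assumed to be defined elsewhere.
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import cross_val_score

for sampler in imbalances:
    for clf in classifiers:
        pipe = ImbPipeline([('sampler', sampler), ('clf', clf)])
        # scores = cross_val_score(pipe, X, y, scoring='roc_auc', cv=5)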
def __init__(self, lemmatization=False): BugModel.__init__(self, lemmatization) self.calculate_importance = False self.sampler = InstanceHardnessThreshold(random_state=0) feature_extractors = [ bug_features.has_str(), bug_features.has_regression_range(), bug_features.severity(), bug_features.keywords(), bug_features.is_coverity_issue(), bug_features.has_crash_signature(), bug_features.has_url(), bug_features.has_w3c_url(), bug_features.has_github_url(), bug_features.whiteboard(), bug_features.patches(), bug_features.landings(), bug_features.title(), bug_features.product(), bug_features.component(), bug_features.is_mozillian(), bug_features.bug_reporter(), bug_features.blocked_bugs_number(), bug_features.priority(), bug_features.has_cve_in_alias(), bug_features.comment_count(), bug_features.comment_length(), bug_features.reporter_experience(), bug_features.number_of_bug_dependencies(), ] cleanup_functions = [ feature_cleanup.url(), feature_cleanup.fileref(), feature_cleanup.hex(), feature_cleanup.dll(), feature_cleanup.synonyms(), feature_cleanup.crash(), ] self.extraction_pipeline = Pipeline([ ( "bug_extractor", bug_features.BugExtractor( feature_extractors, cleanup_functions, rollback=True, rollback_when=self.rollback, ), ), ( "union", ColumnTransformer([ ("data", DictVectorizer(), "data"), ("title", self.text_vectorizer(min_df=0.0001), "title"), ( "comments", self.text_vectorizer(min_df=0.0001), "comments", ), ]), ), ]) self.clf = xgboost.XGBClassifier(n_jobs=16) self.clf.set_params(predictor="cpu_predictor")
X_resampled = pd.DataFrame(X_resampled)
X_resampled.columns = ['is_static', 'is_enum', 'uses_variables',
                       'call_method', 'is_interface', 'is_local_class',
                       'call_external_method']
y_resampled = pd.DataFrame(y_resampled)
y_resampled.columns = ['is_code_smell']
undersampled_data = pd.concat([X_resampled, y_resampled], axis=1)
print("TomekLinks")
print(undersampled_data.describe())
undersampled_data.to_csv('../../dataset/LIC/LIC_TomekLinks.csv', index=False)

# InstanceHardnessThreshold: not effective here, it kept the same instances
rus = InstanceHardnessThreshold(return_indices=True)
X_resampled, y_resampled, idx_resampled = rus.fit_sample(X, Y)
X_resampled = pd.DataFrame(X_resampled)
X_resampled.columns = ['is_static', 'is_enum', 'uses_variables',
                       'call_method', 'is_interface', 'is_local_class',
                       'call_external_method']
y_resampled = pd.DataFrame(y_resampled)
y_resampled.columns = ['is_code_smell']
undersampled_data = pd.concat([X_resampled, y_resampled], axis=1)
print("InstanceHardnessThreshold")
print(undersampled_data.describe())
undersampled_data.to_csv(
    '../../dataset/LIC/LIC_InstanceHardnessThreshold.csv', index=False)

# NearMiss
def test_iht_fit_resample_class_obj():
    est = GradientBoostingClassifier(random_state=RND_SEED)
    iht = InstanceHardnessThreshold(estimator=est, random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_resample(X, Y)
    assert X_resampled.shape == (12, 2)
    assert y_resampled.shape == (12, )
plot_resampling(X, y, sampler, ax[1])
ax[1].set_title(f"Resampling using {sampler.__class__.__name__}")
fig.tight_layout()

###############################################################################
# ``InstanceHardnessThreshold`` uses the predictions of a classifier to
# exclude samples: all samples classified with a low probability are removed.

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))
X, y = create_dataset(n_samples=5000, weights=(0.01, 0.05, 0.94),
                      class_sep=0.8)

clf = LinearSVC().fit(X, y)
plot_decision_function(X, y, clf, ax1)
ax1.set_title(f"Linear SVC with y={Counter(y)}")
sampler = InstanceHardnessThreshold(
    random_state=0,
    estimator=LogisticRegression(solver="lbfgs", multi_class="auto"),
)
clf = make_pipeline(sampler, LinearSVC())
clf.fit(X, y)
plot_decision_function(X, y, clf, ax2)
ax2.set_title(f"Decision function for {sampler.__class__.__name__}")
plot_resampling(X, y, sampler, ax3)
ax3.set_title(f"Resampling using {sampler.__class__.__name__}")
fig.tight_layout()
plt.show()
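# A possible variation (added sketch): instead of the default strategy, the
# retained counts can be set per class via a dict, as the earlier examples in
# this collection do, e.g. keeping at most 100 samples of the majority class
# (label 2 in the dataset built above).
# sampler = InstanceHardnessThreshold(sampling_strategy={2: 100},
#                                     random_state=0)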