def train(classifier, df, y, user_id):
    '''The main training function that runs on a separate process.'''
    X_train, X_test, y_train, y_test = train_test_split(
        df, y, test_size=0.33, random_state=0)
    base_estimator = AdaBoostClassifier(n_estimators=10)
    rusboost = RUSBoostClassifier(n_estimators=10,
                                  base_estimator=base_estimator)
    rusboost.fit(X_train, y_train)
    y_pred_rusboost = rusboost.predict(X_test)
    print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'.format(
        balanced_accuracy_score(y_test, y_pred_rusboost),
        geometric_mean_score(y_test, y_pred_rusboost)))
    cm_rusboost = confusion_matrix(y_test, y_pred_rusboost)
    joblib.dump(rusboost, user_id + '.pkl')
    classifier.classifierStatus = "trained"
    print("Done training")
    return classifier
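# The docstring says train() runs on a separate process. A minimal sketch of
# such a dispatch with the standard library, assuming the arguments are
# already defined and picklable (the surrounding names are illustrative):
from multiprocessing import Process

p = Process(target=train, args=(classifier, df, y, user_id))
p.start()
# ... do other work, then:
p.join()
# Note: attribute mutations on `classifier` happen in the child process and
# do not propagate back; the parent would reload state from the dumped .pkl.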
def get_best_parameters(self, features, labels, base_estimator=None,
                        n_iter=300, cv=3, verbose=1, random_state=1,
                        n_jobs=-1):
    # note: base_estimator, n_iter and random_state are accepted but unused,
    # since GridSearchCV performs an exhaustive search over self.random_grid
    clf_random = \
        GridSearchCV(
            estimator=RUSBoostClassifier(),
            param_grid=self.random_grid,
            cv=cv,
            verbose=verbose,
            n_jobs=n_jobs,
            iid=False,
            error_score=0
        )
    _features = features
    if 1 == len(features.values.shape):
        # imbalanced-learn's RUSBoostClassifier does not accept 1-D feature
        # arrays, so reshape (N,) into (N, 1)
        _features = features.values.reshape(-1, 1)
    clf_random.fit(_features, labels)
    return clf_random.best_params_
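# For context, `self.random_grid` is defined elsewhere in the class; a
# plausible (purely illustrative) grid over common RUSBoostClassifier knobs
# might look like:
random_grid = {
    'n_estimators': [10, 50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.5, 1.0],
    'sampling_strategy': ['auto', 'majority'],
}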
def train(X_train, y_train, method_name, base_classifier, T):
    if method_name == 'adaboost':
        clf = AdaBoostClassifier(base_estimator=base_classifier,
                                 n_estimators=T)
    elif method_name == 'RUSBoost':
        clf = RUSBoostClassifier(base_estimator=base_classifier,
                                 n_estimators=T,
                                 sampling_strategy='majority')
    elif method_name == 'SMOTEBoost':
        clf = OversampleBoost(oversampling_algorithm='SMOTE',
                              base_estimator=base_classifier,
                              n_estimators=T)
    elif method_name == 'SMOTETomekBoost':
        clf = OversampleBoost(oversampling_algorithm='SMOTE-TOMEK',
                              base_estimator=base_classifier,
                              n_estimators=T)
    elif method_name == 'SMOTEENNBoost':
        clf = OversampleBoost(oversampling_algorithm='SMOTE-ENN',
                              base_estimator=base_classifier,
                              n_estimators=T)
    elif method_name == 'DERSBoost':
        clf = DERSBoost(base_estimator=base_classifier, n_estimators=T,
                        NGEN=50)
    else:
        # guard against a silent NameError on clf below
        raise ValueError('unknown method_name: {}'.format(method_name))
    start_time = time()
    clf.fit(X_train, y_train)
    elapsed_time = time() - start_time
    return clf, elapsed_time
def test_rusboost_sample_weight(imbalanced_dataset, algorithm):
    X, y = imbalanced_dataset
    sample_weight = np.ones_like(y)
    rusboost = RUSBoostClassifier(algorithm=algorithm, random_state=0)

    # Predictions should be the same when sample_weight are all ones
    y_pred_sample_weight = rusboost.fit(X, y, sample_weight).predict(X)
    y_pred_no_sample_weight = rusboost.fit(X, y).predict(X)

    assert_array_equal(y_pred_sample_weight, y_pred_no_sample_weight)

    rng = np.random.RandomState(42)
    sample_weight = rng.rand(y.shape[0])
    y_pred_sample_weight = rusboost.fit(X, y, sample_weight).predict(X)

    with pytest.raises(AssertionError):
        assert_array_equal(y_pred_no_sample_weight, y_pred_sample_weight)
def fit(self, X, Y, sample_weight=None):
    import sklearn.tree

    self.n_estimators = int(self.n_estimators)
    self.learning_rate = float(self.learning_rate)
    self.max_depth = int(self.max_depth)
    base_estimator = sklearn.tree.DecisionTreeClassifier(
        max_depth=self.max_depth)

    from imblearn.ensemble import RUSBoostClassifier
    estimator = RUSBoostClassifier(base_estimator=base_estimator,
                                   n_estimators=self.n_estimators,
                                   learning_rate=self.learning_rate,
                                   algorithm=self.algorithm,
                                   random_state=self.random_state)
    estimator.fit(X, Y, sample_weight=sample_weight)
    self.estimator = estimator
    return self
def test_rusboost(imbalanced_dataset, algorithm):
    X, y = imbalanced_dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                        random_state=1)
    classes = np.unique(y)

    n_estimators = 500
    rusboost = RUSBoostClassifier(n_estimators=n_estimators,
                                  algorithm=algorithm,
                                  random_state=0)
    rusboost.fit(X_train, y_train)
    assert_array_equal(classes, rusboost.classes_)

    # check that we have an ensemble of samplers and estimators with a
    # consistent size
    assert len(rusboost.estimators_) > 1
    assert len(rusboost.estimators_) == len(rusboost.samplers_)
    assert len(rusboost.pipelines_) == len(rusboost.samplers_)

    # each sampler in the ensemble should have a different random state
    assert (len({sampler.random_state for sampler in rusboost.samplers_}) ==
            len(rusboost.samplers_))
    # each estimator in the ensemble should have a different random state
    assert (len({est.random_state for est in rusboost.estimators_}) ==
            len(rusboost.estimators_))

    # check the consistency of the feature importances
    assert len(rusboost.feature_importances_) == imbalanced_dataset[0].shape[1]

    # check the consistency of the prediction outputs
    y_pred = rusboost.predict_proba(X_test)
    assert y_pred.shape[1] == len(classes)
    assert rusboost.decision_function(X_test).shape[1] == len(classes)

    score = rusboost.score(X_test, y_test)
    assert score > 0.7, "Failed with algorithm {} and score {}".format(
        algorithm, score)

    y_pred = rusboost.predict(X_test)
    assert y_pred.shape == y_test.shape
def get_models(self):
    base_lr = LogisticRegression(class_weight='balanced')
    ovr_lr = OneVsRestClassifier(base_lr)
    base_eec = EasyEnsembleClassifier(n_estimators=10)
    ovr_eec = OneVsRestClassifier(base_eec)
    base_rus = RUSBoostClassifier(n_estimators=50)
    ovr_rus = OneVsRestClassifier(base_rus)
    base_bbc = BalancedBaggingClassifier(n_estimators=10)
    ovr_bbc = OneVsRestClassifier(base_bbc)
    base_brf = BalancedRandomForestClassifier(n_estimators=100)
    ovr_brf = OneVsRestClassifier(base_brf)
    estimators = [('lr', ovr_lr), ('eec', ovr_eec), ('rus', ovr_rus),
                  ('bbc', ovr_bbc), ('brf', ovr_brf)]
    return estimators
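# The (name, estimator) pairs returned above match the `estimators` argument
# of scikit-learn's voting/stacking ensembles. A minimal sketch of one such
# use, assuming `model_factory` is an instance of the enclosing class and
# X_train/X_test/y_train/y_test exist (these names are illustrative):
from sklearn.ensemble import VotingClassifier

ensemble = VotingClassifier(estimators=model_factory.get_models(),
                            voting='soft')
ensemble.fit(X_train, y_train)
print(ensemble.score(X_test, y_test))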
def __init__(self):
    from imblearn.over_sampling import SMOTE, ADASYN, SVMSMOTE, \
        BorderlineSMOTE, RandomOverSampler
    from imblearn.under_sampling import ClusterCentroids, RandomUnderSampler, \
        InstanceHardnessThreshold, NearMiss, TomekLinks, \
        EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, \
        OneSidedSelection, CondensedNearestNeighbour, NeighbourhoodCleaningRule
    from imblearn.ensemble import EasyEnsemble, EasyEnsembleClassifier, \
        BalancedBaggingClassifier, BalancedRandomForestClassifier, \
        BalanceCascade, RUSBoostClassifier

    self.oversamplers = {
        'ADASYN': ADASYN(),
        'RandomOverSampler': RandomOverSampler(),
        'SMOTE': SMOTE(),
        'BorderlineSMOTE': BorderlineSMOTE(),
        'SVMSMOTE': SVMSMOTE()
    }
    self.undersamplers = {
        'ClusterCentroids': ClusterCentroids(),
        'RandomUnderSampler': RandomUnderSampler(),
        'InstanceHardnessThreshold': InstanceHardnessThreshold(),
        'NearMiss': NearMiss(),
        'TomekLinks': TomekLinks(),
        'EditedNearestNeighbours': EditedNearestNeighbours(),
        'RepeatedEditedNearestNeighbours': RepeatedEditedNearestNeighbours(),
        'AllKNN': AllKNN(),
        'OneSidedSelection': OneSidedSelection(),
        'CondensedNearestNeighbour': CondensedNearestNeighbour(),
        'NeighbourhoodCleaningRule': NeighbourhoodCleaningRule()
    }
    self.ensemblesamplers = {
        'EasyEnsemble': EasyEnsemble(),
        'EasyEnsembleClassifier': EasyEnsembleClassifier(),
        'BalancedBaggingClassifier': BalancedBaggingClassifier(),
        'BalanceCascade': BalanceCascade(),
        # instantiated for consistency with the other entries (the original
        # stored the bare class here)
        'BalancedRandomForestClassifier': BalancedRandomForestClassifier(),
        'RUSBoostClassifier': RUSBoostClassifier()
    }
def test_rusboost(imbalanced_dataset, algorithm):
    X, y = imbalanced_dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
    classes = np.unique(y)

    n_estimators = 500
    rusboost = RUSBoostClassifier(n_estimators=n_estimators,
                                  algorithm=algorithm,
                                  random_state=0)
    rusboost.fit(X_train, y_train)
    assert_array_equal(classes, rusboost.classes_)

    # check that we have an ensemble of samplers and estimators with a
    # consistent size
    assert len(rusboost.estimators_) > 1
    assert len(rusboost.estimators_) == len(rusboost.samplers_)
    assert len(rusboost.pipelines_) == len(rusboost.samplers_)

    # each sampler in the ensemble should have a different random state
    assert (len(set(sampler.random_state for sampler in rusboost.samplers_))
            == len(rusboost.samplers_))
    # each estimator in the ensemble should have a different random state
    assert (len(set(est.random_state for est in rusboost.estimators_))
            == len(rusboost.estimators_))

    # check the consistency of the feature importances
    assert len(rusboost.feature_importances_) == imbalanced_dataset[0].shape[1]

    # check the consistency of the prediction outputs
    y_pred = rusboost.predict_proba(X_test)
    assert y_pred.shape[1] == len(classes)
    assert rusboost.decision_function(X_test).shape[1] == len(classes)

    score = rusboost.score(X_test, y_test)
    assert score > 0.7, "Failed with algorithm {} and score {}".format(
        algorithm, score)

    y_pred = rusboost.predict(X_test)
    assert y_pred.shape == y_test.shape
def get_models():
    models, names = list(), list()
    # Logistic Regression
    models.append(
        LogisticRegression(solver='liblinear', class_weight='balanced',
                           penalty='l2'))
    names.append('Logistic Regression')
    # AdaBoost
    names.append('Ada Boost')
    models.append(AdaBoostClassifier())
    # Gradient Boosting
    names.append('Gradient Boosting')
    models.append(GradientBoostingClassifier())
    # RUSBoostClassifier
    names.append('RUSBoost Classifier')
    models.append(RUSBoostClassifier())
    # RandomForestClassifier (class-weighted)
    names.append('RandomForestClassifier')
    models.append(RandomForestClassifier(class_weight='balanced'))
    # EasyEnsembleClassifier
    names.append('EasyEnsembleClassifier')
    models.append(EasyEnsembleClassifier())
    return models, names
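# A sketch of how the paired lists might be consumed; the cross-validation
# setup and the X/y names are assumptions, not from the original:
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

models, names = get_models()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
for name, model in zip(names, models):
    # balanced accuracy is a sensible headline metric on imbalanced labels
    scores = cross_val_score(model, X, y, scoring='balanced_accuracy', cv=cv)
    print('%s: %.3f (%.3f)' % (name, scores.mean(), scores.std()))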
def test_balanced_random_forest_error(imbalanced_dataset, boosting_params,
                                      err_msg):
    rusboost = RUSBoostClassifier(**boosting_params)
    # `message` is not a valid argument to pytest.raises; `match` checks the
    # exception text against a regular expression
    with pytest.raises(ValueError, match=err_msg):
        rusboost.fit(*imbalanced_dataset)
def _init_classifier(self, opt):
    if "classifier_opt" in opt:
        opt = opt['classifier_opt']
    if "base_estimator" in opt:
        b_est = self._init_classifier(opt["base_estimator"])
    else:
        b_est = None
    n_estimators = opt.get("n_estimators", 200)
    max_iter = opt.get("max_iter", 100000)
    num_parallel_tree = opt.get("num_parallel_tree", 5)
    layer_structure = opt.get("layer_structure", (100,))

    if opt["type"] in ["random_forrest", "rf"]:  # original key spelling kept
        return RandomForestClassifier(n_estimators=n_estimators,
                                      class_weight="balanced", n_jobs=-1)
    elif opt["type"] == "ada_boost":
        return AdaBoostClassifier(base_estimator=b_est,
                                  n_estimators=n_estimators)
    elif opt["type"] in ["logistic_regression", "lr"]:
        return LogisticRegression(class_weight='balanced', max_iter=max_iter)
    elif opt["type"] == "sgd":
        return SGDClassifier(class_weight='balanced', max_iter=max_iter)
    elif opt["type"] in ["gaussian_bayes", "bayes", "gaussian_nb"]:
        return GaussianNB()
    elif opt["type"] in ["support_vector_machine", "svm"]:
        return SVC(kernel='rbf', class_weight='balanced', gamma="scale")
    elif opt["type"] in ["multilayer_perceptron", "mlp"]:
        return MLPClassifier(hidden_layer_sizes=layer_structure,
                             max_iter=max_iter)
    elif opt["type"] in ["decision_tree", "dt", "tree"]:
        return DecisionTreeClassifier()
    elif opt["type"] in ["b_decision_tree", "b_dt", "b_tree"]:
        return DecisionTreeClassifier(class_weight="balanced")
    elif opt["type"] in ["neighbours", "knn"]:
        return KNeighborsClassifier(n_neighbors=opt["n_neighbours"])
    elif opt["type"] == "extra_tree":
        return ExtraTreesClassifier(n_estimators=n_estimators,
                                    class_weight="balanced", n_jobs=-1)
    elif opt["type"] == "xgboost":
        return XGBClassifier(objective='binary:logistic',
                             n_estimators=n_estimators,
                             num_parallel_tree=num_parallel_tree,
                             tree_method="hist", booster="gbtree", n_jobs=-1)
    elif opt["type"] in ["b_random_forrest", "b_rf"]:
        return BalancedRandomForestClassifier(n_estimators=n_estimators,
                                              n_jobs=-1)
    elif opt["type"] == "b_bagging":
        return BalancedBaggingClassifier(base_estimator=b_est,
                                         n_estimators=n_estimators)
    elif opt["type"] == "b_boosting":
        return RUSBoostClassifier(base_estimator=b_est,
                                  n_estimators=n_estimators)
    else:
        raise ValueError("type: {} not recognised".format(opt["type"]))
# achieve worse performance.
base_estimator = AdaBoostClassifier(n_estimators=10)
eec = EasyEnsembleClassifier(n_estimators=10,
                             base_estimator=base_estimator,
                             n_jobs=-1)
eec.fit(X_train, y_train)
y_pred_eec = eec.predict(X_test)
print('Easy ensemble classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_eec),
              geometric_mean_score(y_test, y_pred_eec)))
cm_eec = confusion_matrix(y_test, y_pred_eec)
fig, ax = plt.subplots(ncols=2)
plot_confusion_matrix(cm_eec, classes=np.unique(satimage.target), ax=ax[0],
                      title='Easy ensemble classifier')

rusboost = RUSBoostClassifier(n_estimators=10,
                              base_estimator=base_estimator)
rusboost.fit(X_train, y_train)
y_pred_rusboost = rusboost.predict(X_test)
print('RUSBoost classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_rusboost),
              geometric_mean_score(y_test, y_pred_rusboost)))
cm_rusboost = confusion_matrix(y_test, y_pred_rusboost)
plot_confusion_matrix(cm_rusboost, classes=np.unique(satimage.target),
                      ax=ax[1], title='RUSBoost classifier')

plt.show()
def nosampling_pipeline(data=[], verbose=False, clean=False, plot=False):
    results_table = []
    results = []
    rand_state = 42
    if clean:
        X = data.drop('Class', axis=1)
        y = data['Class']
        X_vals = X.values
        y_vals = y.values
        X_inliers, y_inliers = reject_sampler.fit_resample(X_vals, y_vals)
        X = X_inliers
        y = y_inliers
    else:
        X = data.drop('Class', axis=1)
        y = data['Class']
        X = X.values
        y = y.values

    # shuffle=True is required for random_state to take effect; recent
    # scikit-learn versions raise an error when it is set with shuffle=False
    sss = StratifiedKFold(n_splits=10, random_state=rand_state, shuffle=True)
    print("StratKFold:", sss)

    # List of models to be used
    models = [
        DecisionTreeClassifier(random_state=rand_state),
        RUSBoostClassifier(random_state=rand_state),
        LogisticRegression(random_state=rand_state),
        BalancedBaggingClassifier(random_state=rand_state),
        RandomForestClassifier(random_state=rand_state),
        EasyEnsembleClassifier(
            base_estimator=RandomForestClassifier(random_state=rand_state),
            random_state=rand_state),
        BalancedRandomForestClassifier(random_state=rand_state)
    ]
    # column names must match the keys appended below and the plot section
    results_table = pd.DataFrame(columns=['classifiers', 'fpr', 'tpr',
                                          'auc_score'])

    # Instantiate lists to store each of the models' results
    classifier = []
    strategy = []
    samp_technique = []
    accuracy = []
    f1 = []
    auc = []
    recall = []
    precision = []
    g_mean = []
    start = time.time()

    # Run through each of the models to get their performance metrics
    sampling_strat = 'no_sampling'
    for train_index, test_index in sss.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        for model in models:
            print("Using length of X for training: {}; "
                  "Using length of y for training: {}"
                  .format(len(X_train), len(y_train)))
            print("Using length of X for testing: {}; "
                  "Using length of y for testing: {}"
                  .format(len(X_test), len(y_test)))
            print("Currently training model - {} using sampling strategy - {}"
                  .format(model.__class__.__name__, sampling_strat))
            print("--" * 20)
            clf = model
            pipe = make_pipeline(clf)
            pipe.fit(X_train, y_train)
            test_preds = pipe.predict(X_test)

            classifier.append(model.__class__.__name__)
            samp_technique.append(sampling_strat)
            strategy.append(" %s+%s " % (str(model.__class__.__name__),
                                         sampling_strat))
            f1.append(f1_score(y_test, test_preds))
            accuracy.append(accuracy_score(y_test, test_preds))
            auc.append(roc_auc_score(y_test, test_preds))
            recall.append(recall_score(y_test, test_preds))
            precision.append(precision_score(y_test, test_preds))
            g_mean.append(geometric_mean_score(y_test, test_preds,
                                               average='binary'))
            fpr, tpr, _ = roc_curve(y_test, test_preds)
            auc_score = roc_auc_score(y_test, test_preds)
            results_table = results_table.append(
                {
                    'classifiers': model.__class__.__name__,
                    'fpr': fpr,
                    'tpr': tpr,
                    'auc_score': auc_score
                },
                ignore_index=True)

            # Print the model and its report
            if verbose:
                print('Classification Model: ', model.__class__.__name__,
                      '\n')
                print('Sampling Strategy Model: ', sampling_strat, '\n')
                print(confusion_matrix(y_test, test_preds), '\n')
                print(classification_report_imbalanced(y_test, test_preds),
                      '\n')

    # Round the results for convenience
    f1 = [float(round(n, 4)) for n in f1]
    auc = [float(round(n, 4)) for n in auc]
    g_mean = [float(round(n, 4)) for n in g_mean]
    accuracy = [float(round(n, 4)) for n in accuracy]
    precision = [float(round(n, 4)) for n in precision]
    recall = [float(round(n, 4)) for n in recall]

    # Store results in a dataframe. With 10 folds there are
    # n_models * n_folds entries per metric, so the columns are left to be
    # inferred rather than forced to a fixed list of seven model names
    results = pd.DataFrame(
        [
            classifier, strategy, samp_technique, f1, auc, g_mean, accuracy,
            precision, recall
        ],
        index=[
            'classifier', 'strategy', 'samp_technique', 'f1', 'roc_auc',
            'g_mean', 'accuracy', 'precision', 'recall'
        ])

    if plot:
        results_table.set_index('classifiers', inplace=True)
        fig = plt.figure(figsize=(8, 6))
        results_table = results_table.sort_values(by=['auc_score'],
                                                  ascending=False)
        for i in results_table.index:
            plt.plot(results_table.loc[i]['fpr'],
                     results_table.loc[i]['tpr'],
                     label="{}, AUC={:.4f}".format(
                         i, results_table.loc[i]['auc_score']))
        plt.plot([0, 1], [0, 1], color='orange', linestyle='--')
        plt.xticks(np.arange(0.0, 1.1, step=0.1))
        plt.xlabel("False Positive Rate", fontsize=15)
        plt.yticks(np.arange(0.0, 1.1, step=0.1))
        plt.ylabel("True Positive Rate", fontsize=15)
        plt.title('ROC curve for classifiers on the full data split '
                  'using sampling technique: {}'.format(sampling_strat),
                  fontweight='bold', fontsize=15)
        plt.legend(prop={'size': 13}, loc='lower right')
        plt.show()

    end = time.time()
    print("Time elapsed:", end - start)
    # Change orientation of the dataframe
    return results.transpose()
df = pd.read_csv('data/poker-8-9_vs_5.csv')
X, y, z = prepare_data(df)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0,
                                                    test_size=0.7)
kf = StratifiedKFold(n_splits=10)
kf.get_n_splits(X, y)

bbc = BalancedBaggingClassifier(
    base_estimator=DecisionTreeClassifier(random_state=0), random_state=42)
brfc = BalancedRandomForestClassifier(max_depth=2, random_state=0)
eec = EasyEnsembleClassifier(
    base_estimator=DecisionTreeClassifier(random_state=0), random_state=42)
rbc = RUSBoostClassifier(base_estimator=DecisionTreeClassifier(random_state=0),
                         random_state=0)

bbc_score = []
brfc_score = []
eec_score = []
rbc_score = []

for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    bbc.fit(X_train, y_train)
    brfc.fit(X_train, y_train)
    eec.fit(X_train, y_train)
    rbc.fit(X_train, y_train)
    y_pred_bbc = bbc.predict(X_test)
    y_pred_brfc = brfc.predict(X_test)
models = [
    DecisionTreeClassifier(random_state=r),
    KNeighborsClassifier(),
    GaussianNB(),
    MultinomialNB(),
    LogisticRegression(random_state=r),
    SVC(random_state=r, kernel='sigmoid'),
    MLPClassifier(random_state=r),
    BaggingClassifier(random_state=r),
    RandomForestClassifier(random_state=r),
    GradientBoostingClassifier(random_state=r),
    LGBMClassifier(),
    XGBClassifier(random_state=r),
    CatBoostClassifier(random_state=r, verbose=False),
    BalancedBaggingClassifier(random_state=r),
    BalancedRandomForestClassifier(random_state=r),
    RUSBoostClassifier(random_state=r)
]
names = [
    "DecisionTree", "KNeighbors", "GaussianNB", "MultinomialNB",
    "LogisticRegression", "SVC", "MLPClassifier", "Ensemble-Bagging",
    "Ensemble-RandomForest", "Ensemble-GradientBoosting",
    "LightGradientBoosting", "XGBoost", "CatBoost", "BalancedBagging",
    "BalancedRandomForest", "RUSBoost"
]

outputs = {}
for name, model in zip(names, models):
    model.fit(x_train, y_train)
    output = model.predict(test_dataframe)
    outputs[name] = output
# initialize 5-fold CV
skf = StratifiedKFold(n_splits=5)
cv5_ids = list(skf.split(full_data, labels))
# print(cv5_ids)

# initialize model
# lin_clf = svm.SVC(decision_function_shape='ovo', probability=True)
# lin_clf = svm.LinearSVC()
# lin_clf = LogisticRegression()
# lin_clf = svm.SVC(kernel='sigmoid')
# lin_clf = MLPClassifier((256, 256), activation='relu', max_iter=1000)
# lin_clf = RandomForestClassifier(n_estimators=5000, max_depth=2,
#                                  random_state=0)
single_clf = tree.DecisionTreeClassifier(max_depth=1)
# single_clf = LogisticRegression()
lin_clf = RUSBoostClassifier(base_estimator=single_clf, n_estimators=5000)

# initialize booster
sm = SMOTE(random_state=42)

# perform 5-fold CV
precision_avg = []
recall_avg = []
fscore_avg = []
acc_avg = 0.
for sp in cv5_ids:
    train_data, train_labels = full_data[sp[0]], labels[sp[0]]
    # train_data, train_labels = sm.fit_sample(train_data, train_labels)
    test_data, test_labels = full_data[sp[1]], labels[sp[1]]
    lin_clf.fit(train_data, train_labels)
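# The snippet cuts off right after the fit call. A hypothetical per-fold
# evaluation helper that would fill the precision/recall/F-score lists
# declared above (the macro averaging is an assumption):
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def evaluate_fold(clf, test_data, test_labels):
    """Illustrative helper, not from the original source."""
    pred = clf.predict(test_data)
    prec, rec, fscore, _ = precision_recall_fscore_support(
        test_labels, pred, average='macro', zero_division=0)
    return prec, rec, fscore, accuracy_score(test_labels, pred)

# Inside the fold loop one would append prec/rec/fscore to precision_avg,
# recall_avg and fscore_avg, and accumulate the accuracy into acc_avg.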
def learning_model(year, class_weight):
    iters = 300
    gap = 2
    year_test = year
    data_test = reader.ordinary_data_reader('uscecchini28.csv', year_test,
                                            year_test)
    x_test = data_test.features
    y_test = data_test.labels
    test = np.c_[data_test.years, data_test.firms]
    '''
    An if-else was used to judge whether class_weight is None, to prevent an
    exception from string concatenation, and a try-except wraps RUSBoost with
    a DecisionTreeClassifier using a custom class_weight.
    If the model trained last time can be found on disk, it is loaded and
    used to predict directly, without training twice; otherwise the model is
    trained and saved to disk.
    '''
    # if class_weight is not None:
    #     # current_model_name locates/saves the trained model with a custom
    #     # class_weight
    #     current_model_name = class_weight + "_" + str(year_test) + ".m"
    # else:
    #     current_model_name = str(year_test) + ".m"
    current_model_name = class_weight + "_" + str(year_test) + ".m"
    try:
        rusboost_model = joblib.load(current_model_name)
    except Exception:
        print('Running RUSBoost (training period: 1991-' +
              str(year_test - gap) + ', testing period: ' + str(year_test) +
              ', with ' + str(gap) + '-year gap)...')
        data_train = reader.ordinary_data_reader('uscecchini28.csv', 1991,
                                                 year_test - gap)
        x_train = data_train.features
        y_train = data_train.labels
        newpaaer_train = data_train.newpaaers

        # format labels and newpaaers for the step:
        # data_test.newpaaers(data_test.labels ~= 0)
        data_test.newpaaers = np.array(data_test.newpaaers)
        data_test.labels = np.array(data_test.labels)
        # replace the NaNs that should remain in the array with 0
        for i in range(len(data_test.newpaaers)):
            if np.isnan(data_test.newpaaers[i]):
                if data_test.labels[i] != 0:
                    data_test.newpaaers[i] = 0
        # drop all the NaNs remaining in the array
        data_test.newpaaers = np.array(
            [x for x in data_test.newpaaers if str(x) != 'nan'])
        # turn the 0 placeholders back into NaN
        for i in range(len(data_test.newpaaers)):
            if int(data_test.newpaaers[i]) == 0.0:
                data_test.newpaaers[i] = np.NaN
        # deduplicate to get the final newpaaer_test
        newpaaer_test = np.unique(data_test.newpaaers)
        '''
        Caution: y_train is converted to an array so that its indices match
        the formatted newpaaer_train array in the loop below.
        '''
        y_train = np.array(y_train)
        num_frauds = sum(y_train == 1)
        print(num_frauds)
        '''
        np.in1d replaces MATLAB's ismember, and a temporary array handles
        serial frauds, i.e. the step:
        y_train[ismember(newpaaer_train, newpaaer_test)] = 0
        '''
        temp_array = np.array(np.in1d(newpaaer_train,
                                      newpaaer_test)).astype(int)
        for i in range(len(temp_array)):
            if temp_array[i] == 1:
                y_train[i] = 0
        del temp_array
        num_frauds = num_frauds - sum(y_train == 1)
        print('Recode', num_frauds,
              'overlapped frauds (i.e., change fraud label from 1 to 0).')

        start_time = time.perf_counter()
        rusboost_model = RUSBoostClassifier(
            DecisionTreeClassifier(min_samples_leaf=5,
                                   class_weight=class_weight),
            learning_rate=0.1, n_estimators=iters)
        rusboost_model.fit(x_train, y_train)
        end_time = time.perf_counter()
        t_train = end_time - start_time
        joblib.dump(rusboost_model, current_model_name)
        print('Training time: %.3f seconds' % t_train)

    start_time = time.perf_counter()
    pred = rusboost_model.predict(x_test)
    prob = rusboost_model.predict_proba(x_test)
    end_time = time.perf_counter()
    t_test = end_time - start_time
    print('Testing time: %.3f seconds' % t_test)

    # test figures
    # note: AUC here is computed on hard predictions; using prob[:, 1]
    # would give the usual probability-based AUC
    print("AUC: %.4f" % metrics.roc_auc_score(y_test, pred))
    # np.set_printoptions(precision=4, threshold=8, edgeitems=4,
    #                     linewidth=75, suppress=True, nanstr='nan',
    #                     infstr='inf')
    print("precision: %.2f%%" %
          np.multiply(metrics.precision_score(y_test, pred,
                                              zero_division=0), 100))
    print("recall: %.2f%%" %
          np.multiply(metrics.recall_score(y_test, pred), 100))

    # dump part of the results (fraud probability)
    prob = np.around(np.delete(prob, 0, axis=1) * 100, decimals=5)
    data = np.c_[pred, prob]
    data = np.c_[test, data]
    file_data = pd.DataFrame(data)
    csv_file_name = 'data.csv'
    file_data.to_csv(csv_file_name, header=False, index=False)
def Gridsearchcv(X_train, X_test, y_train, y_test):
    # Scale numeric values
    num_transformer = Pipeline(steps=[('scaler', MinMaxScaler())])
    preprocessor = ColumnTransformer(
        remainder='passthrough',
        transformers=[
            ('num', num_transformer, make_column_selector(pattern='EDAD'))
        ])

    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('clf', PipelineHelper([
            ('svc', SVC()),
            ('gb', GradientBoostingClassifier()),
            ('xgb', XGBClassifier(use_label_encoder=False)),
            ('eec', EasyEnsembleClassifier()),
            ('rbc', RUSBoostClassifier()),
            ('bbc', BalancedBaggingClassifier()),
            ('brf', BalancedRandomForestClassifier()),
        ])),
    ])

    params = {
        'clf__selected_model': pipe.named_steps['clf'].generate({
            # EasyEnsembleClassifier
            'eec__n_estimators': [10, 25, 50, 100],
            'eec__warm_start': [False, True],
            'eec__replacement': [False, True],
            # RUSBoostClassifier
            'rbc__algorithm': ['SAMME', 'SAMME.R'],
            'rbc__n_estimators': [10, 50, 100, 200, 500],
            'rbc__learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.],
            # BalancedBaggingClassifier
            'bbc__base_estimator': [HistGradientBoostingClassifier(), None],
            'bbc__n_estimators': [10, 50, 100, 200, 500, 750, 1000],
            'bbc__max_samples': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
            'bbc__max_features': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
            # BalancedRandomForestClassifier
            'brf__criterion': ['gini', 'entropy'],
            'brf__n_estimators': [int(x) for x in
                                  np.linspace(start=20, stop=200, num=5)],
            'brf__max_depth': [int(x) for x in np.linspace(1, 45, num=3)],
            'brf__min_samples_split': range(2, 10),
            'brf__min_samples_leaf': [1, 3, 5, 10],
            'brf__max_features': ['auto', 'sqrt', 'log2'],
            # SVC
            'svc__C': [0.1, 0.5, 1, 10, 30, 40, 50, 75, 100, 500, 1000],
            'svc__gamma': [0.0001, 0.001, 0.005, 0.01, 0.05, 0.07, 0.1,
                           0.5, 1, 5, 10, 50],
            'svc__kernel': ['rbf'],
            # GradientBoosting
            "gb__learning_rate": [0.0001, 0.001, 0.01, 0.025, 0.05, 0.075,
                                  0.1, 0.15, 0.2],
            "gb__max_depth": [3, 7, 8, 9, 10, 50],
            "gb__max_features": ["log2", "sqrt"],
            "gb__subsample": [0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
            "gb__n_estimators": [10, 50, 100, 200, 300],
            # XGBoost
            'xgb__learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.],
            'xgb__min_child_weight': np.arange(1, 21, 5),
            'xgb__subsample': np.arange(0.05, 1.01, 0.05),
            'xgb__verbosity': [0],
            # alternative XGBoost grids kept from earlier experiments:
            # 'xgb__booster': ['gbtree', 'gblinear', 'dart'],
            # 'xgb__max_depth': [15, 20, 25],
            # 'xgb__max_depth': range(1, 11),
            # 'xgb__min_child_weight': range(1, 21),
            # 'xgb__n_estimators': [100],
            # 'xgb__n_estimators': [400, 700, 1000],
            # 'xgb__colsample_bytree': [0.7, 0.8],
            # 'xgb__reg_alpha': [1.1, 1.2, 1.3],
            # 'xgb__reg_lambda': [1.1, 1.2, 1.3],
            # 'xgb__subsample': [0.7, 0.8, 0.9],
            # 'xgb__eval_metric': ['mlogloss'],
        }),
    }
    scoring = {'ba': 'balanced_accuracy', 'ap': 'average_precision',
               'F1': 'f1', 'ra': 'roc_auc', 'rc': 'recall'}
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3)
    # cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=5)
    # https://towardsdatascience.com/hyper-parameter-tuning-with-randomised-grid-search-54f865d27926
    # n_iter: 30, 60, 100
    grid = RandomizedSearchCV(pipe, params, refit='ba', cv=cv, verbose=3,
                              n_jobs=-1, n_iter=60, scoring=scoring,
                              return_train_score=True)
    grid.fit(X_train, y_train)
    df_grid = pd.DataFrame(grid.cv_results_)
    df_grid = df_grid.sort_values(by=['mean_test_ba'], ascending=False)
    df_grid = df_grid[[
        'param_clf__selected_model', 'params', 'mean_fit_time',
        'std_fit_time', 'mean_test_ba', 'std_test_ba', 'rank_test_ba',
        'mean_test_ap', 'std_test_ap', 'rank_test_ap', 'mean_test_ra',
        'std_test_ra', 'rank_test_ra', 'mean_test_F1', 'std_test_F1',
        'rank_test_F1'
    ]]
    print("Best-Fit Parameters From Training Data:\n", grid.best_params_)
    grid_predictions = grid.best_estimator_.predict(X_test)
    report = classification_report(y_test, grid_predictions, output_dict=True)
    report = pd.DataFrame(report).transpose()
    print(report)
    print(confusion_matrix(y_test, grid_predictions))
    return grid, df_grid, report
y_test = y[test_index]

# classifier = CUSBoostClassifier(**a)
# classifier = AdaboostClassifier(**a)
# classifier = RusBoost(depth=depth, n_estimators=estimators)
# classifier = AdaboostNC_Classifier(**a)
# classifier = CUSBoostNC_Classifier(**a)
# classifier = RusBoost(**a)
classifier = RUSBoostClassifier(DecisionTreeClassifier(max_depth=8),
                                n_estimators=64)

# classifier.fit(X_train, y_train, number_of_clusters, 0.5)  # CUSBoost
# classifier.fit(X_train, y_train)  # Adaboost
# classifier.fit(X_train, y_train, 0.5)  # AdaboostNC
# classifier.fit(X_train, y_train, 6, 0.5)
# classifier.fit(X_train, y_train, 6, fraction / 100, 8)
classifier.fit(X_train, y_train)

predictions = classifier.predict_proba(X_test)
prediction_ = classifier.predict(X_test)

auc = roc_auc_score(y_test, predictions[:, 1])
f1 = f1_score(y_test, prediction_)
def test_rusboost_error(imbalanced_dataset, boosting_params, err_msg):
    rusboost = RUSBoostClassifier(**boosting_params)
    with pytest.raises(ValueError, match=err_msg):
        rusboost.fit(*imbalanced_dataset)
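# The boosting_params/err_msg pairs come from a parametrize decorator that is
# not part of this snippet; one plausible shape for it (the concrete invalid
# values and messages below are guesses, not the library's actual list):
@pytest.mark.parametrize(
    "boosting_params, err_msg",
    [
        ({"n_estimators": "whatever"}, "n_estimators must be an integer"),
        ({"n_estimators": -100}, "n_estimators must be greater than zero"),
    ],
)
def test_rusboost_error(imbalanced_dataset, boosting_params, err_msg):
    ...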
clf_results = pd.DataFrame()

# define models
models = {
    'ExtraTrees': ExtraTreesClassifier(),
    'RandomForest': RandomForestClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'GradientBoosting': GradientBoostingClassifier(),
    'SVC': SVC(),
    'LogitBoost': LogitBoost(),
    'XGBClassifier': XGBClassifier(),
    'ComplementNB': ComplementNB(),
    'BalancedBagging': BalancedBaggingClassifier(),
    'BalancedRandomForest': BalancedRandomForestClassifier(),
    'RUSBoost': RUSBoostClassifier(),
    'EasyEnsemble': EasyEnsembleClassifier()
}

# define model parameters for parameter search
param_extra_trees = {
    'n_estimators': [5, 10, 50, 100, 200],
    'min_samples_split': [2, 4],
    'max_depth': [2, 3, None],
    'max_features': ['sqrt', None],
    'class_weight': ['balanced']
}
param_random_forest = {
    'n_estimators': [5, 10, 50, 100, 200],
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    shuffle=True)
model = GradientBoostingClassifier(n_estimators=100, random_state=42)
imbl_methods = {
    'eec': EasyEnsembleClassifier(random_state=42, sampling_strategy=1.,
                                  n_jobs=-1, base_estimator=model),
    'rub': RUSBoostClassifier(random_state=42, sampling_strategy=1.,
                              base_estimator=model)
}
for method in imbl_methods.keys():
    imbl = imbl_methods[method]
    imbl.fit(X_train, y_train)
    y_hat_test = imbl.predict(X_test)
    y_hat_train = imbl.predict(X_train)
    print(f"Results of {method}")
    print(imbl.score(X_test, y_test))
    print("Train data")
    print(classification_report(y_train, y_hat_train))
    print("Test data")
    print(classification_report(y_test, y_hat_test))
def classiferSet(pre_cost_weight=20):
    # xgt = xgb.XGBClassifier(learning_rate=0.1, scale_pos_weight=10,
    #                         n_estimators=100, random_state=1)  # 80.77%
    xgt = xgb.XGBClassifier(
        learning_rate=0.1,
        # subsample=0.99,
        max_depth=3,
        scale_pos_weight=pre_cost_weight,
        n_estimators=80,
        # cv=5,
        random_state=27,
        nthread=2  # use more threads only for large datasets
    )  # 84.62%

    ada = AdaBoostClassifier(n_estimators=100, learning_rate=.1,
                             random_state=1234)  # (0,130): .815
    # gbt = GradientBoostingClassifier(n_estimators=100, subsample=1.0,
    #                                  learning_rate=1,
    #                                  random_state=1234)  # (0,130): .830
    gbt = GradientBoostingClassifier(n_estimators=100, subsample=0.99,
                                     learning_rate=.1,
                                     random_state=1234)  # (0,130): .861

    rf = RandomForestClassifier(
        n_estimators=100,
        # max_depth=10,
        oob_score=True,
        class_weight={0: 1, 1: pre_cost_weight},
        # class_weight='balanced',
        random_state=1234)  # .846
    brf = BalancedRandomForestClassifier(n_estimators=100, oob_score=True,
                                         class_weight={0: 1,
                                                       1: pre_cost_weight},
                                         random_state=1234)
    rus = RUSBoostClassifier(n_estimators=100, random_state=1234)

    # https://www.kaggle.com/c/home-credit-default-risk/discussion/60921
    # https://sites.google.com/view/lauraepp/parameters
    lgbm = lightgbm.LGBMClassifier(
        boosting_type='dart',  # 'gbdt', 'goss', 'dart'
        num_leaves=31,
        max_depth=-1,
        learning_rate=0.1,
        # using {0: 1, 1: pre_cost_weight} here is inferior to the default
        class_weight=None,
        random_state=1234)

    ourmodels = dict({
        'AdaBoost': ada,
        'GradientBoost': gbt,
        'RandomForest': rf,
        'BalancedRandomForest': brf,
        'RUSBoost': rus,
        'XGBoost': xgt,
        'LightGBM': lgbm
    })
    return ourmodels
def pipe_main(pipe=None):
    '''Pipeline construction using sklearn estimators; the final step
    currently supports only classifiers.

    .. note::
        data flows through a pipeline consisting of the steps below:

        raw data --> clean --> encoding --> scaling -->
        feature construction --> feature selection --> resampling -->
        final estimator

        see scikit-learn preprocess & estimators

    parameter
    ----
    pipe - str
        in the format of 'xx_xx', where each 'xx' names a step in the
        pipeline, default None

    return
    ----
    1) pipeline instance of the chosen steps
    2) if pipe is None, a dict indicating the possible choices of 'steps'
    '''
    clean = {
        'clean': Split_cls(dtype_filter='not_datetime', na1='null', na2=-999),
        'cleanNA': Split_cls(dtype_filter='not_datetime', na1=None, na2=None),
        'cleanMean': Split_cls(dtype_filter='not_datetime',
                               na1='most_frequent', na2='mean'),
    }
    encode = {
        'woe': Woe_encoder(max_leaf_nodes=5),
        'oht': Oht_encoder(),
        'ordi': Ordi_encoder(),
    }
    resample = {
        # over-sampling
        'rover': RandomOverSampler(),
        'smote': SMOTE(),
        'bsmote': BorderlineSMOTE(),
        'adasyn': ADASYN(),
        # under-sampling: controlled methods
        'runder': RandomUnderSampler(),
        'nearmiss': NearMiss(version=3),
        'pcart': InstanceHardnessThreshold(),
        # under-sampling: cleaning methods
        'tlinks': TomekLinks(n_jobs=-1),
        'oside': OneSidedSelection(n_jobs=-1),
        'cleanNN': NeighbourhoodCleaningRule(n_jobs=-1),
        'enn': EditedNearestNeighbours(n_jobs=-1),
        'ann': AllKNN(n_jobs=-1),
        'cnn': CondensedNearestNeighbour(n_jobs=-1),
        # outlier cleaning
        'inlierForest': FunctionSampler(
            outlier_rejection, kw_args={'method': 'IsolationForest'}),
        'inlierLocal': FunctionSampler(
            outlier_rejection, kw_args={'method': 'LocalOutlierFactor'}),
        'inlierEllip': FunctionSampler(
            outlier_rejection, kw_args={'method': 'EllipticEnvelope'}),
        'inlierOsvm': FunctionSampler(
            outlier_rejection, kw_args={'method': 'OneClassSVM'}),
        # combined over- and under-sampling
        'smoteenn': SMOTEENN(),
        'smotelink': SMOTETomek(),
    }
    scale = {
        'stdscale': StandardScaler(),
        'maxscale': MinMaxScaler(),
        'rscale': RobustScaler(quantile_range=(10, 90)),
        'qauntile': QuantileTransformer(),  # uniform distribution
        'power': PowerTransformer(),  # Gaussian distribution
        'norm': Normalizer(),  # default L2 norm
        # scale sparse data
        'maxabs': MaxAbsScaler(),
        'stdscalesp': StandardScaler(with_mean=False),
    }
    # feature construction
    feature_c = {
        'pca': PCA(whiten=True),
        'spca': SparsePCA(normalize_components=True, n_jobs=-1),
        'ipca': IncrementalPCA(whiten=True),
        'kpca': KernelPCA(kernel='rbf', n_jobs=-1),
        'poly': PolynomialFeatures(degree=2),
        'rtembedding': RandomTreesEmbedding(n_estimators=10),
        'LDA': LinearDiscriminantAnalysis(),
        'QDA': QuadraticDiscriminantAnalysis(),
    }
    # select from model
    feature_m = {
        'fwoe': SelectFromModel(Woe_encoder(max_leaf_nodes=5)),
        'flog': SelectFromModel(
            LogisticRegressionCV(penalty='l1', solver='saga',
                                 scoring='roc_auc')),
        'fsgd': SelectFromModel(SGDClassifier(penalty="l1")),
        'fsvm': SelectFromModel(LinearSVC('l1', dual=False, C=1e-2)),
        'fxgb': SelectFromModel(XGBClassifier(n_jobs=-1)),
        'frf': SelectFromModel(ExtraTreesClassifier(n_estimators=100,
                                                    max_depth=5)),
        'fRFExgb': RFE(XGBClassifier(n_jobs=-1), step=0.1,
                       n_features_to_select=20),
        'fRFErf': RFE(ExtraTreesClassifier(n_estimators=100, max_depth=5),
                      step=0.3, n_features_to_select=20),
        'fRFElog': RFE(LogisticRegressionCV(penalty='l1', solver='saga',
                                            scoring='roc_auc'),
                       step=0.3, n_features_to_select=20)
    }
    # univariate feature selection
    feature_u = {
        'fchi2': GenericUnivariateSelect(chi2, 'percentile', 25),
        'fMutualclf': GenericUnivariateSelect(mutual_info_classif,
                                              'percentile', 25),
        'fFclf': GenericUnivariateSelect(f_classif, 'percentile', 25),
    }
    # sklearn estimators
    t = all_estimators(type_filter=['classifier'])
    estimator = {}
    for i in t:
        try:
            estimator.update({i[0]: i[1]()})
        except Exception:
            continue
    estimator.update(
        dummy=DummyClassifier(),
        XGBClassifier=XGBClassifier(n_jobs=-1),
        LogisticRegressionCV=LogisticRegressionCV(scoring='roc_auc'),
        EasyEnsembleClassifier=EasyEnsembleClassifier(),
        BalancedRandomForestClassifier=BalancedRandomForestClassifier(),
        RUSBoostClassifier=RUSBoostClassifier(),
        SVC=SVC(C=0.01, gamma='auto'))

    if pipe is None:
        feature_s = {}
        feature_s.update(**feature_m, **feature_u)
        return {
            'clean': clean.keys(),
            'encoding': encode.keys(),
            'resample': resample.keys(),
            'scale': scale.keys(),
            'feature_c': feature_c.keys(),
            'feature_s': feature_s.keys(),
            'classifier': estimator.keys()
        }
    elif isinstance(pipe, str):
        l = pipe.split('_')
        all_keys_dict = {}
        all_keys_dict.update(**clean, **encode, **scale, **feature_c,
                             **feature_m, **feature_u, **estimator,
                             **resample)
        steps = []
        for i in l:
            if all_keys_dict.get(i) is not None:
                steps.append((i, all_keys_dict.get(i)))
            else:
                raise KeyError(
                    "'{}' is not a valid key for sklearn estimators".format(i))
        return Pipeline(steps)
    else:
        raise ValueError("input pipe must be a string in format 'xx[_xx]'")
# eec = EasyEnsembleClassifier(n_estimators=10,
#                              base_estimator=base_estimator,
#                              n_jobs=-1)
# eec.fit(X_train_seek, y_train_seek)
# y_pred_eec = eec.predict(X_test_seek)
# print('Easy ensemble classifier performance:')
# print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
#       .format(balanced_accuracy_score(y_test_seek, y_pred_eec),
#               geometric_mean_score(y_test_seek, y_pred_eec)))
# cm_eec = confusion_matrix(y_test_seek, y_pred_eec)
# fig, ax = plt.subplots(ncols=2)
# plot_confusion_matrix(cm_eec, classes=np.unique(dataset.target), ax=ax[0],
#                       title='Easy ensemble classifier')

base_estimator = AdaBoostClassifier(n_estimators=10)
rusboost = RUSBoostClassifier(n_estimators=10, base_estimator=base_estimator)
rusboost.fit(X_train, y_train)
y_pred_rusboost = rusboost.predict(X_test)
print('RUSBoost classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'.format(
    balanced_accuracy_score(y_test, y_pred_rusboost),
    geometric_mean_score(y_test, y_pred_rusboost)))
cm_rusboost = confusion_matrix(y_test, y_pred_rusboost)
fig, ax = plt.subplots(ncols=2)
plot_confusion_matrix(cm_rusboost, classes=np.unique(dataset.target),
                      ax=ax[1], title='RUSBoost classifier')

rusboost.fit(X_train_seek, y_train_seek)
        exLpred.append(float(lineE[j]))
        # cellTypesTrue.append(lineE[int(len(lineE)) - 1])
    exMpred.append(exLpred)
    exLpred = []
    cellID.append(lineE[0])

# cellTypesTrue = np.array(cellTypesTrue)
exMpred = np.array(exMpred)
cellID = np.array(cellID)

###################################
##### Everything is ready for cell type prediction #####
rusboost = RUSBoostClassifier(random_state=0)
rusboost.fit(exMtrain, cellTypesTrain)

##### Cell type prediction #####
cellTypesPred = rusboost.predict(exMpred)
# accuracy_score = balanced_accuracy_score(cellTypesTrue, cellTypesPred)
# print(accuracy_score)
# classification_report(cellTypesTrue, cellTypesPred)

##### Checking performance #####
# confusionMatrix = confusion_matrix(cellTypesTrue, cellTypesPred)
cellTypesProbs = rusboost.predict_proba(exMpred)
# print(confusionMatrix)

##### Merging the cell types and probability score #####
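# The snippet ends at the merging step. A minimal sketch of one way to merge
# cell IDs, predicted types and the top class probability (the output format
# is an assumption, not from the original):
import numpy as np

maxProbs = cellTypesProbs.max(axis=1)  # highest class probability per cell
merged = np.column_stack((cellID, cellTypesPred, maxProbs))
for row in merged:
    print('\t'.join(str(v) for v in row))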