def test_rusboost(imbalanced_dataset, algorithm):
    """Fit RUSBoost on an imbalanced dataset and sanity-check the result.

    Verifies the fitted ensemble's bookkeeping (estimators, samplers and
    pipelines all have the same length and distinct random states), the
    shape of every prediction output, and a minimum accuracy of 0.7 on a
    stratified, seeded hold-out split.
    """
    X, y = imbalanced_dataset
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=1
    )
    expected_classes = np.unique(y)
    n_classes = len(expected_classes)

    model = RUSBoostClassifier(
        n_estimators=500, algorithm=algorithm, random_state=0
    )
    model.fit(X_train, y_train)
    assert_array_equal(expected_classes, model.classes_)

    # The ensemble must hold more than one member, with one sampler and
    # one pipeline per fitted estimator.
    n_fitted = len(model.estimators_)
    assert n_fitted > 1
    n_samplers = len(model.samplers_)
    assert n_fitted == n_samplers
    assert len(model.pipelines_) == n_samplers

    # No two samplers may share a random state ...
    sampler_seeds = {sampler.random_state for sampler in model.samplers_}
    assert len(sampler_seeds) == n_samplers

    # ... and neither may any two estimators.
    estimator_seeds = {est.random_state for est in model.estimators_}
    assert len(estimator_seeds) == n_fitted

    # One importance value per input feature.
    assert len(model.feature_importances_) == X.shape[1]

    # Prediction outputs: one column per class for the probabilistic
    # outputs, one label per test sample for predict().
    proba = model.predict_proba(X_test)
    assert proba.shape[1] == n_classes
    assert model.decision_function(X_test).shape[1] == n_classes

    score = model.score(X_test, y_test)
    assert score > 0.7, "Failed with algorithm {} and score {}".format(
        algorithm, score)

    labels = model.predict(X_test)
    assert labels.shape == y_test.shape
def test_rusboost(imbalanced_dataset, algorithm):
    """Check that RUSBoostClassifier fits and predicts consistently.

    Parameters
    ----------
    imbalanced_dataset : tuple
        ``(X, y)`` pair supplied by the fixture of the same name.
    algorithm : str
        Boosting algorithm name forwarded to ``RUSBoostClassifier``.

    The train/test split is seeded so that the accuracy threshold below
    is evaluated on the same partition on every run; an unseeded split
    makes the ``score > 0.7`` assertion depend on chance.
    """
    X, y = imbalanced_dataset
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=1
    )
    classes = np.unique(y)

    n_estimators = 500
    rusboost = RUSBoostClassifier(
        n_estimators=n_estimators, algorithm=algorithm, random_state=0
    )
    rusboost.fit(X_train, y_train)
    assert_array_equal(classes, rusboost.classes_)

    # check that we have an ensemble of samplers and estimators with a
    # consistent size
    assert len(rusboost.estimators_) > 1
    assert len(rusboost.estimators_) == len(rusboost.samplers_)
    assert len(rusboost.pipelines_) == len(rusboost.samplers_)

    # each sampler in the ensemble should have different random state
    assert len({sampler.random_state for sampler in rusboost.samplers_}) == len(
        rusboost.samplers_
    )
    # each estimator in the ensemble should have different random state
    assert len({est.random_state for est in rusboost.estimators_}) == len(
        rusboost.estimators_
    )

    # check the consistency of the feature importances
    assert len(rusboost.feature_importances_) == X.shape[1]

    # check the consistency of the prediction outputs
    y_pred = rusboost.predict_proba(X_test)
    assert y_pred.shape[1] == len(classes)
    assert rusboost.decision_function(X_test).shape[1] == len(classes)

    score = rusboost.score(X_test, y_test)
    assert score > 0.7, "Failed with algorithm {} and score {}".format(
        algorithm, score)

    y_pred = rusboost.predict(X_test)
    assert y_pred.shape == y_test.shape
#classifier = RusBoost(depth=depth, n_estimators=estimators) #classifier = AdaboostNC_Classifier(**a) #classifier = CUSBoostNC_Classifier(**a) #classifier = RusBoost(**a) classifier = RUSBoostClassifier(DecisionTreeClassifier(max_depth=8), n_estimators=64) #classifier.fit(X_train, y_train, number_of_clusters, 0.5) #CUSBoost classifier #classifier.fit(X_train, y_train) #Adaboost classifier #classifier.fit(X_train, y_train, 0.5) #AdaboostNC classifier #classifier.fit(X_train, y_train, 6, 0.5) #classifier.fit(X_train, y_train, 6, fraction/100, 8) classifier.fit(X_train, y_train) predictions = classifier.predict_proba(X_test) prediction_ = classifier.predict(X_test) auc = roc_auc_score(y_test, predictions[:, 1]) f1 = f1_score(y_test, prediction_) accuracy = accuracy_score(y_test, prediction_) #aupr = average_precision_score(y_test, predictions[:, 1]) current_param_auc.append(auc) current_param_f1.append(f1) current_param_accuracy.append(accuracy) #current_param_aupr.append(aupr) #fpr, tpr, thresholds = roc_curve(y_test, predictions[:, 1])
_DATA_FILE = 'uscecchini28.csv'
_FIRST_TRAIN_YEAR = 1991


def _unique_test_case_ids(newpaaers, labels):
    """Return the unique ``newpaaers`` values used to detect train/test overlap.

    Python port of the MATLAB step ``data_test.newpaaers(data_test.labels~=0)``
    followed by ``unique``:

    * a NaN id on a row whose label is non-zero is kept as NaN,
    * every other NaN id is dropped,
    * ids that truncate to 0 are turned into NaN (as in the original loop),
    * the remaining values are de-duplicated with ``np.unique``.
    """
    case_ids = np.array(newpaaers, dtype=float)
    labels = np.array(labels)

    # Park the NaNs that must survive as 0 so the NaN filter skips them.
    case_ids[np.isnan(case_ids) & (labels != 0)] = 0
    case_ids = case_ids[~np.isnan(case_ids)]
    # Turn the parked zeros (and anything truncating to 0) back into NaN.
    case_ids[np.trunc(case_ids) == 0] = np.nan
    return np.unique(case_ids)


def _train_rusboost(data_test, year_test, gap, iters, class_weight, model_path):
    """Train a RUSBoost model for ``year_test`` and save it to ``model_path``.

    Training data runs from ``_FIRST_TRAIN_YEAR`` to ``year_test - gap``.
    Training rows whose ``newpaaers`` value also occurs in the test year are
    relabelled from 1 to 0 before fitting (MATLAB:
    ``y_train[ismember(newpaaer_train, newpaaer_test)] = 0``).

    Returns the fitted classifier.
    """
    print('Running RUSBoost (training period: {}-{}, testing period: {}, '
          'with {}-year gap)...'.format(_FIRST_TRAIN_YEAR, year_test - gap,
                                        year_test, gap))
    data_train = reader.ordinary_data_reader(
        _DATA_FILE, _FIRST_TRAIN_YEAR, year_test - gap)
    x_train = data_train.features
    # Work on an array copy: the labels are recoded in place below.
    y_train = np.array(data_train.labels)
    newpaaer_train = data_train.newpaaers
    newpaaer_test = _unique_test_case_ids(data_test.newpaaers, data_test.labels)

    num_frauds = np.count_nonzero(y_train == 1)
    print(num_frauds)

    # NaN never compares equal, so rows without an id are never recoded.
    y_train[np.isin(newpaaer_train, newpaaer_test)] = 0
    num_frauds = num_frauds - np.count_nonzero(y_train == 1)
    print('Recode', num_frauds, 'overlapped frauds (i.e., change fraud label from 1 to 0).')

    start_time = time.perf_counter()
    rusboost_model = RUSBoostClassifier(
        DecisionTreeClassifier(min_samples_leaf=5, class_weight=class_weight),
        learning_rate=0.1, n_estimators=iters)
    rusboost_model.fit(x_train, y_train)
    end_time = time.perf_counter()
    t_train = end_time - start_time

    joblib.dump(rusboost_model, model_path)
    print(t_train)
    print('Training time: %.3f seconds' % t_train)
    return rusboost_model


def learning_model(year, class_weight):
    """Score one test year with a RUSBoost model, training it if needed.

    A model previously saved for the same ``class_weight`` / ``year`` pair is
    loaded from disk when available; otherwise a new model is trained on
    1991 .. ``year - 2`` and saved. AUC, precision and recall for the test
    year are printed, and the per-observation results are written to
    ``data.csv`` (the same path on every call, so earlier output is
    overwritten).

    Args:
        year: Test year; also determines the end of the training period.
        class_weight: Forwarded to ``DecisionTreeClassifier(class_weight=...)``
            and used as the prefix of the saved model's file name. May be
            ``None``, in which case the file name has no prefix.

    Returns:
        None.
    """
    iters = 300
    gap = 2
    year_test = year

    data_test = reader.ordinary_data_reader(_DATA_FILE, year_test, year_test)
    x_test = data_test.features
    y_test = data_test.labels
    test = np.c_[data_test.years, data_test.firms]

    # class_weight may be None, which cannot be concatenated with a string.
    if class_weight is not None:
        current_model_name = str(class_weight) + "_" + str(year_test) + ".m"
    else:
        current_model_name = str(year_test) + ".m"

    try:
        rusboost_model = joblib.load(current_model_name)
    except Exception:
        # Deliberate best-effort: any failure to load a saved model (missing
        # or unreadable file) falls back to training a fresh one.
        rusboost_model = _train_rusboost(
            data_test, year_test, gap, iters, class_weight, current_model_name)

    start_time = time.perf_counter()
    predit = rusboost_model.predict(x_test)
    prob = rusboost_model.predict_proba(x_test)
    end_time = time.perf_counter()
    t_test = end_time - start_time
    print('Testing time %.3f seconds' % t_test)

    # AUC is a ranking metric: score it on the predicted probability of the
    # second class (column 1), not on the thresholded 0/1 predictions.
    print("AUC: %.4f" % metrics.roc_auc_score(y_test, prob[:, 1]))
    print("precision: %.2f%%" % (metrics.precision_score(y_test, predit, zero_division=0) * 100))
    print("recall: %.2f%%" % (metrics.recall_score(y_test, predit) * 100))

    # Dump year, firm, predicted label and fraud probability (in percent,
    # first probability column dropped) without header or index.
    prob = np.around(np.delete(prob, 0, axis=1) * 100, decimals=5)
    data = np.c_[test, predit, prob]
    file_data = pd.DataFrame(data)
    csv_file_name = 'data.csv'
    file_data.to_csv(csv_file_name, header=False, index=False)
##### Everything is ready for cell type prediction ##### rusboost = RUSBoostClassifier(random_state=0) rusboost.fit(exMtrain, cellTypesTrain) ##### Cell types prediction ##### cellTypesPred = rusboost.predict(exMpred) #accuracy_score = balanced_accuracy_score(cellTypesTrue, cellTypesPred) #print accuracy_score #classification_report(cellTypesTrue, cellTypesPred) ##### Checking performance ##### #confusionMatrix = confusion_matrix(cellTypesTrue, cellTypesPred) cellTypesProbs = rusboost.predict_proba(exMpred) #print confusionMatrix ##### Merging the cell types and probability score ##### cellID_Probs = np.concatenate((cellID[:, None], cellTypesProbs), axis=1) combine = np.concatenate((cellID_Probs, cellTypesPred[:, None]), axis=1) ################################### ##### Output the results from array ##### # file format: # Cell_ID Cell_types_prediction Cell_types_prediction_probability_score # ##### Prediction complete, and generate the output file ##### outFile = open(sys.argv[3], 'w') #The name of output file